Source code for psdr.gp

from __future__ import print_function
import numpy as np
from scipy.spatial.distance import cdist, pdist, squareform
import scipy.linalg
from scipy.linalg import eigh, expm, logm
from scipy.optimize import fmin_l_bfgs_b
import scipy.optimize
from itertools import product
#from opt import check_gradient
from .basis import LegendreTensorBasis
from .function import BaseFunction
__all__ = ['GaussianProcess']

	
	
[docs]class GaussianProcess(BaseFunction):
	r""" Fits a Gaussian Process by maximizing the marginal likelihood

	Given :math:`M` pairs of :math:`\mathbf{x}_i \in \mathbb{R}^m` and :math:`y_i \in \mathbb{R}`,
	this fits a model 

	.. math::

		g(\mathbf{x}) = 
			\sum_{i=1}^M \alpha_i e^{-\| \mathbf{L}( \mathbf{x} - \mathbf{x}_i) \|_2^2} 
			+ \sum_{j} \beta_j \psi_j(x)

	where :math:`\mathbf{L}` is a lower triangular matrix,
	:math:`\lbrace \psi_j \rbrace_j` is a polynomial basis (e.g., linear),
	and :math:`\boldsymbol{\alpha}, \boldsymbol{\beta}` are vectors of weights.
	These parameters are choosen to maximize the log-marginal likelihood (see [RW06]_, Sec. 2.7), 
	or equivalently minimize

	.. math::

		\min_{\mathbf{L}} & \ 
			\frac12 \mathbf{y}^\top \boldsymbol{\alpha}
			+ \frac12 \log \det (\mathbf{K} + \tau \mathbf{I}), & \quad 
			[\mathbf{K}]_{i,j} &= e^{-\frac12 \| \mathbf{L}(\mathbf{x}_i -\mathbf{x}_j)\|_2^2} \\
		\text{where} & \ 
			\begin{bmatrix}
				\mathbf{K} + \tau \mathbf{I} & \mathbf{V} \\ \mathbf{V}^\top & \mathbf{0}
			\end{bmatrix} 
			\begin{bmatrix}
				\boldsymbol{\alpha} \\
				\boldsymbol{\beta} 
			\end{bmatrix}
			= 
			\begin{bmatrix}
				\mathbf{y} \\ \mathbf{0}
			\end{bmatrix},
			&
			\quad [\mathbf{V}]_{i,j} &= \psi_j(\mathbf{x}_i).
	
	Here :math:`\tau` denotes the nugget--a Tikhonov-like regularization term commonly used to 
	combat the ill-conditioning of the kernel matrix :math:`\mathbf{K}`.
	The additional constraint for the polynomial basis is described by Jones
	([Jon01]_ eq. (3) and (4)).


	This class allows for four kinds of distance matrices :math:`\mathbf{L}`:

	================ 	=============================================
	scalar   			:math:`\mathbf{L} = \ell \mathbf{I}`
	scalar multiple 	:math:`\mathbf{L} = \ell \mathbf{L}_0`
	diagonal 			:math:`\mathbf{L} = \text{diag}(\boldsymbol{\ell})` 
	lower triangular	:math:`\mathbf{L}` is lower triangular
	================ 	=============================================
		
	Internally, we optimize with respect to the matrix log;
	specificially, we parameterize :math:`\mathbf{L}` in terms of
	some scalar or vector :math:`\boldsymbol{\ell}`; i.e., 
	

	================ 	=============================================
	scalar   			:math:`\mathbf{L}(\ell) = e^{\ell}\mathbf{I}`
	scalar multiple 	:math:`\mathbf{L}(\ell) = e^{\ell} \mathbf{L}_0`
	diagonal 			:math:`\mathbf{L}(\ell) = \text{diag}(\lbrace e^{\ell_i} \rbrace_{i})` 
	lower triangular	:math:`\mathbf{L}(\ell) = e^{\text{tril}(\boldsymbol{\ell})}`
	================ 	=============================================

	In the constant and diagonal cases, this corresponds to the standard practice of
	optimizing with respect to the (scalar) log of the scaling parameters. 
	Our experience suggests that the matrix log similarly increases the accuracy
	when working with the lower triangular parameterization.  
	For the first three classes we have simple expressions for the derivatives,
	but for the lower triangular parameterization we use complex step
	approximation to compute the derivative [MN10]_. 
	With this derivative information,
	we solve the optimization problem using L-BFGS as implemented in scipy.
	



	Parameters
	----------
	structure: ['tril', 'diag', 'const', 'scalar_mult']
		Structure of the matrix L, either

		* const: constant * eye
		* scalar_mult
		* tril: lower triangular
		* diag: diagonal	



	Returns
	-------
	L: np.ndarray(m,m)
		Distance matrix
	alpha: np.ndarray(M)
		Weights for the Gaussian process kernel
	beta: np.ndarray(N)
		Weights for the polynomial component in the LegendreTensorBasis	
	obj: float
		Log-likelihood objective function value

	References
	----------
	.. [RW06] Gaussian Processes for Machine Learning, Carl Edward Rasmussen and Christopher K. I. Williams,
		2006 MIT Press

	.. [Jon01] A Taxonomy of Global Optimization Methods Based on Response Surfaces,
		Donald R. Jones, Journal of Global Optimization, 21, pp. 345--383, 2001.

	.. [MN10] "The complex step approximation to the Frechet derivative of a matrix function", 
		Awad H. Al-Mohy and Nicholas J. Higham, Numerical Algorithms, 2010 (53), pp. 133--148.
	"""
	def __init__(self, structure = 'const', degree = None, nugget = None, Lfixed = None,
		n_init = 1):
		self.structure = structure
		
		self.rank = None # currently disabled due to conditioning issues
		self.n_init = n_init
		self.degree = degree

		self._best_score = np.inf

		if nugget is None:
			nugget = 5*np.finfo(float).eps
		self.nugget = nugget

		if structure == 'scalar_mult':
			assert Lfixed is not None, "Must specify 'Lfixed' to use scalar_mult"
			self.Lfixed = Lfixed

	def _make_L(self, ell):
		r""" Constructs the L matrix from the parameterization corresponding to the structure
		"""


		if self.structure == 'const':
			return np.exp(ell)*np.eye(self.m)
		elif self.structure == 'scalar_mult':
			return np.exp(ell)*self.Lfixed
		elif self.structure == 'diag':
			return np.diag(np.exp(ell))	
		elif self.structure == 'tril':
			# Construct the L matrix	
			L = np.zeros((self.m*self.m,), dtype = ell.dtype)
			L[self.tril_flat] = ell
			L = L.reshape(self.m,self.m)
			
			# This is a more numerically stable way to compute expm(L) - I
			#Lexp = L.dot(scipy.linalg.expm(L))
			
			# JMH 8 Aug 2019: I'm less sure about the value of parameterizing as L*expm(L)
			# Specifically, having the zero matrix easily accessible doesn't seem like a good thing
			# and he gradient is more accurately computed using this form. 
			Lexp = scipy.linalg.expm(L)
			
			return Lexp

	def _log_marginal_likelihood(self, ell, X = None, y = None, return_obj = True, return_grad = False, return_alpha_beta = False):
		
		if X is None: X = self.X
		if y is None: y = self.y 

		# Extract basic constants
		M = X.shape[0]	
		m = X.shape[1]	


		L = self._make_L(ell)
		# Compute the squared distance
		Y = np.dot(L, X.T).T
		dij = pdist(Y, 'sqeuclidean') 
		
		# Covariance matrix
		K = np.exp(-0.5*squareform(dij))

		# Solve the linear system to compute the coefficients
		#alpha = np.dot(ev,(1./(ew+tikh))*np.dot(ev.T,y)) 
		A = np.vstack([np.hstack([K + self.nugget*np.eye(K.shape[0]), self.V]), 
					   np.hstack([self.V.T, np.zeros((self.V.shape[1], self.V.shape[1]))])])
		b = np.hstack([y, np.zeros(self.V.shape[1])])

		# As A can be singular, we use an eigendecomposition based inverse
		ewA, evA = eigh(A)
		I = (np.abs(ewA) > 5*np.finfo(float).eps)
		x = np.dot(evA[:,I],(1./ewA[I])*np.dot(evA[:,I].T,b))

		alpha = x[:M]
		beta = x[M:]

		if return_alpha_beta:
			return alpha, beta

		ew, ev = scipy.linalg.eigh(K + self.nugget*np.eye(K.shape[0]))
		#if np.min(ew) <= 0:
		#	bonus_regularization = -2*np.min(ew)+1e-14 
		#	ew += bonus_regularization
		#	K += bonus_regularization*np.eye(K.shape[0])

		if return_obj:
			# Should this be with yhat or y?
			# yhat = y - np.dot(V, beta)
			# Doesn't matter because alpha in nullspace of V.T
			# RW06: (5.8)
			with np.errstate(invalid = 'ignore'):
				obj = 0.5*np.dot(y, alpha) + 0.5*np.sum(np.log(ew))
			if not return_grad:
				return obj
		
		# Now compute the gradient
		# Build the derivative of the covariance matrix K wrt to L

		if self.structure == 'tril':
			dK = np.zeros((M,M, len(ell)))
			for idx, (k, el) in enumerate(self.tril_ij):
				eidx = np.zeros(ell.shape)
				eidx[idx] = 1.
				# Approximation of the matrix exponential derivative [MH10]
				h = 1e-10
				dL = np.imag(self._make_L(ell + 1j*h*eidx))/h
				dY = np.dot(dL, X.T).T
				for i in range(M):
					# Evaluate the dot product
					# dK[i,j,idx] -= np.dot(Y[i] - Y[j], dY[i] - dY[j])
					dK[i,:,idx] -= np.sum((Y[i] - Y)*(dY[i] - dY), axis = 1)
			for idx in range(len(self.tril_ij)):
				dK[:,:,idx] *= K
	
		elif self.structure == 'diag':
			dK = np.zeros((M,M, len(ell)))
			for idx, (k, el) in enumerate(self.tril_ij):
				for i in range(M):
					dK[i,:,idx] -= (Y[i,k] - Y[:,k])*(Y[i,el] - Y[:,el])
			for idx in range(len(self.tril_ij)):
				dK[:,:,idx] *= K	

		elif self.structure in ['const', 'scalar_mult']:
			# In the scalar case everything drops and we 
			# simply need 
			# dK[i,j,1] = (Y[i] - Y[j])*(Y[i] - Y[j])*K
			# which we have already computed
			dK = -(squareform(dij)*K).reshape(M,M,1)
				
		# Now compute the gradient
		grad = np.zeros(len(ell))
	
		for k in range(len(ell)):
			#Kinv_dK = np.dot(ev, np.dot(np.diag(1./(ew+tikh)),np.dot(ev.T,dK[:,:,k])))
			#I = (ew > 0.1*np.sqrt(np.finfo(float).eps))
			#I = (ew>5*np.finfo(float).eps)
			#print "k", k, "dK", dK.shape

			Kinv_dK = np.dot(ev, (np.dot(ev.T,dK[:,:,k]).T/ew).T)
			# Note flipped signs from RW06 eq. 5.9
			grad[k] = 0.5*np.trace(Kinv_dK)
			grad[k] -= 0.5*np.dot(alpha, np.dot(alpha, dK[:,:,k]))

		if return_obj and return_grad:
			return obj, grad
		if not return_obj:
			return grad


	def _obj(self, ell, X = None, y = None):
		return self._log_marginal_likelihood(ell, X, y, 
			return_obj = True, return_grad = False, return_alpha_beta = False)
	
	def _grad(self, ell, X = None, y = None):
		return self._log_marginal_likelihood(ell, X, y, 
			return_obj = False, return_grad = True, return_alpha_beta = False)

	
	def _fit_init(self, X, y):
		m = self.m = X.shape[1]
		self.X = X
		self.y = y
		# Setup structure based properties
		if self.structure == 'tril':
			if self.rank is None: rank = m
			else: rank = self.rank
			
			self.tril_ij = [ (i,j) for i, j in zip(*np.tril_indices(m)) if i >= (m - rank)]
			self.tril_flat = np.array([ i*m + j for i,j in self.tril_ij])

		elif self.structure == 'diag':
			self.tril_ij = [ (i,i) for i in range(m)]


		# Cache Vandermonde matrix on sample points
		if self.degree is not None:
			self.basis = LegendreTensorBasis(self.degree, X = X)
			self.V = self.basis.V(X)
		else:
			self.V = np.zeros((X.shape[0],0))
		


[docs]	def fit(self, X, y, L0 = None):
		""" Fit a Gaussian process model

		Parameters
		----------
		X: array-like (M, m)
			M input coordinates of dimension m
		y: array-like (M,)
			y[i] is the output at X[i]
		"""
		X = np.array(X)
		y = np.array(y).flatten()
	
		# Initialized cached values for fit
		self._fit_init(X, y)	

	
		if L0 is None:
			L0 = np.eye(self.m)

		if self.structure == 'tril':
			ell0 = np.array([L0[i,j] for i, j in self.tril_ij])
		elif self.structure == 'diag':
			if len(L0.shape) == 1:
				ell0 = L0.flatten()
			else:
				ell0 = np.array([L0[i,i] for i, j in self.tril_ij])
		elif self.structure == 'scalar_mult':
			ell0 = np.array(L0.flatten()[0])
		elif self.structure == 'const':
			ell0 = np.array(L0.flatten()[0])

		# Actually do the fitting
		# TODO: Implement multiple initializations
		self._fit(ell0)


	def _fit(self, ell0):
		# the implementation in l_bfgs_b seems flaky when we have invalid values
		#ell, obj, d = fmin_l_bfgs_b(self._obj, ell0, fprime = self._grad, disp = True)
		res = scipy.optimize.minimize(self._obj, 
				ell0, 
				jac = self._grad,
				#method = 'L-BFGS-B',
				#options = {'disp': True}, 
			)
		ell = res.x
		self.L = self._make_L(ell)
		self.alpha, self.beta = self._log_marginal_likelihood(ell, 
			return_obj = False, return_grad = False, return_alpha_beta = True)
		self._ell = ell

	def eval(self, Xnew, return_cov = False):
		Y = np.dot(self.L, self.X.T).T
		Ynew = np.dot(self.L, Xnew.T).T
		dij = cdist(Ynew, Y, 'sqeuclidean')	
		K = np.exp(-0.5*dij)
		if self.degree is not None:
			V = self.basis.V(Xnew)
		else:
			V = np.zeros((Xnew.shape[0],0))

		fXnew = np.dot(K, self.alpha) + np.dot(V, self.beta)
		if return_cov:
			KK = np.exp(-0.5*squareform(pdist(Y, 'sqeuclidean')))
			ew, ev = eigh(KK)	
			I = (ew > 500*np.finfo(float).eps)
			z = ev[:,I].dot( np.diag(1./ew[I]).dot(ev[:,I].T.dot(K.T)))
			#z = np.dot(ev[:,I], np.dot(np.diag(1./ew[I]).dot(ev[:,I].T.dot( K.T)) ))
			cov = np.array([ 1 - np.dot(K[i,:], z[:,i]) for i in range(Xnew.shape[0])])
			cov[cov< 0] = 0.
			return fXnew, cov
		else:
			return fXnew