Source code for the Gaussian process module (``GaussianProcess``).

from __future__ import print_function
import numpy as np
from scipy.spatial.distance import cdist, pdist, squareform
import scipy.linalg
from scipy.linalg import eigh, expm, logm
from scipy.optimize import fmin_l_bfgs_b
import scipy.optimize
from itertools import product
#from opt import check_gradient
from .basis import LegendreTensorBasis
from .function import BaseFunction
__all__ = ['GaussianProcess']

[docs]class GaussianProcess(BaseFunction): r""" Fits a Gaussian Process by maximizing the marginal likelihood Given :math:`M` pairs of :math:`\mathbf{x}_i \in \mathbb{R}^m` and :math:`y_i \in \mathbb{R}`, this fits a model .. math:: g(\mathbf{x}) = \sum_{i=1}^M \alpha_i e^{-\| \mathbf{L}( \mathbf{x} - \mathbf{x}_i) \|_2^2} + \sum_{j} \beta_j \psi_j(x) where :math:`\mathbf{L}` is a lower triangular matrix, :math:`\lbrace \psi_j \rbrace_j` is a polynomial basis (e.g., linear), and :math:`\boldsymbol{\alpha}, \boldsymbol{\beta}` are vectors of weights. These parameters are choosen to maximize the log-marginal likelihood (see [RW06]_, Sec. 2.7), or equivalently minimize .. math:: \min_{\mathbf{L}} & \ \frac12 \mathbf{y}^\top \boldsymbol{\alpha} + \frac12 \log \det (\mathbf{K} + \tau \mathbf{I}), & \quad [\mathbf{K}]_{i,j} &= e^{-\frac12 \| \mathbf{L}(\mathbf{x}_i -\mathbf{x}_j)\|_2^2} \\ \text{where} & \ \begin{bmatrix} \mathbf{K} + \tau \mathbf{I} & \mathbf{V} \\ \mathbf{V}^\top & \mathbf{0} \end{bmatrix} \begin{bmatrix} \boldsymbol{\alpha} \\ \boldsymbol{\beta} \end{bmatrix} = \begin{bmatrix} \mathbf{y} \\ \mathbf{0} \end{bmatrix}, & \quad [\mathbf{V}]_{i,j} &= \psi_j(\mathbf{x}_i). Here :math:`\tau` denotes the nugget--a Tikhonov-like regularization term commonly used to combat the ill-conditioning of the kernel matrix :math:`\mathbf{K}`. The additional constraint for the polynomial basis is described by Jones ([Jon01]_ eq. (3) and (4)). 
This class allows for four kinds of distance matrices :math:`\mathbf{L}`: ================ ============================================= scalar :math:`\mathbf{L} = \ell \mathbf{I}` scalar multiple :math:`\mathbf{L} = \ell \mathbf{L}_0` diagonal :math:`\mathbf{L} = \text{diag}(\boldsymbol{\ell})` lower triangular :math:`\mathbf{L}` is lower triangular ================ ============================================= Internally, we optimize with respect to the matrix log; specificially, we parameterize :math:`\mathbf{L}` in terms of some scalar or vector :math:`\boldsymbol{\ell}`; i.e., ================ ============================================= scalar :math:`\mathbf{L}(\ell) = e^{\ell}\mathbf{I}` scalar multiple :math:`\mathbf{L}(\ell) = e^{\ell} \mathbf{L}_0` diagonal :math:`\mathbf{L}(\ell) = \text{diag}(\lbrace e^{\ell_i} \rbrace_{i})` lower triangular :math:`\mathbf{L}(\ell) = e^{\text{tril}(\boldsymbol{\ell})}` ================ ============================================= In the constant and diagonal cases, this corresponds to the standard practice of optimizing with respect to the (scalar) log of the scaling parameters. Our experience suggests that the matrix log similarly increases the accuracy when working with the lower triangular parameterization. For the first three classes we have simple expressions for the derivatives, but for the lower triangular parameterization we use complex step approximation to compute the derivative [MN10]_. With this derivative information, we solve the optimization problem using L-BFGS as implemented in scipy. 
Parameters ---------- structure: ['tril', 'diag', 'const', 'scalar_mult'] Structure of the matrix L, either * const: constant * eye * scalar_mult * tril: lower triangular * diag: diagonal Returns ------- L: np.ndarray(m,m) Distance matrix alpha: np.ndarray(M) Weights for the Gaussian process kernel beta: np.ndarray(N) Weights for the polynomial component in the LegendreTensorBasis obj: float Log-likelihood objective function value References ---------- .. [RW06] Gaussian Processes for Machine Learning, Carl Edward Rasmussen and Christopher K. I. Williams, 2006 MIT Press .. [Jon01] A Taxonomy of Global Optimization Methods Based on Response Surfaces, Donald R. Jones, Journal of Global Optimization, 21, pp. 345--383, 2001. .. [MN10] "The complex step approximation to the Frechet derivative of a matrix function", Awad H. Al-Mohy and Nicholas J. Higham, Numerical Algorithms, 2010 (53), pp. 133--148. """ def __init__(self, structure = 'const', degree = None, nugget = None, Lfixed = None, n_init = 1): self.structure = structure self.rank = None # currently disabled due to conditioning issues self.n_init = n_init = degree self._best_score = np.inf if nugget is None: nugget = 5*np.finfo(float).eps self.nugget = nugget if structure == 'scalar_mult': assert Lfixed is not None, "Must specify 'Lfixed' to use scalar_mult" self.Lfixed = Lfixed def _make_L(self, ell): r""" Constructs the L matrix from the parameterization corresponding to the structure """ if self.structure == 'const': return np.exp(ell)*np.eye(self.m) elif self.structure == 'scalar_mult': return np.exp(ell)*self.Lfixed elif self.structure == 'diag': return np.diag(np.exp(ell)) elif self.structure == 'tril': # Construct the L matrix L = np.zeros((self.m*self.m,), dtype = ell.dtype) L[self.tril_flat] = ell L = L.reshape(self.m,self.m) # This is a more numerically stable way to compute expm(L) - I #Lexp = # JMH 8 Aug 2019: I'm less sure about the value of parameterizing as L*expm(L) # Specifically, having the zero 
matrix easily accessible doesn't seem like a good thing # and he gradient is more accurately computed using this form. Lexp = scipy.linalg.expm(L) return Lexp def _log_marginal_likelihood(self, ell, X = None, y = None, return_obj = True, return_grad = False, return_alpha_beta = False): if X is None: X = self.X if y is None: y = self.y # Extract basic constants M = X.shape[0] m = X.shape[1] L = self._make_L(ell) # Compute the squared distance Y =, X.T).T dij = pdist(Y, 'sqeuclidean') # Covariance matrix K = np.exp(-0.5*squareform(dij)) # Solve the linear system to compute the coefficients #alpha =,(1./(ew+tikh))*,y)) A = np.vstack([np.hstack([K + self.nugget*np.eye(K.shape[0]), self.V]), np.hstack([self.V.T, np.zeros((self.V.shape[1], self.V.shape[1]))])]) b = np.hstack([y, np.zeros(self.V.shape[1])]) # As A can be singular, we use an eigendecomposition based inverse ewA, evA = eigh(A) I = (np.abs(ewA) > 5*np.finfo(float).eps) x =[:,I],(1./ewA[I])*[:,I].T,b)) alpha = x[:M] beta = x[M:] if return_alpha_beta: return alpha, beta ew, ev = scipy.linalg.eigh(K + self.nugget*np.eye(K.shape[0])) #if np.min(ew) <= 0: # bonus_regularization = -2*np.min(ew)+1e-14 # ew += bonus_regularization # K += bonus_regularization*np.eye(K.shape[0]) if return_obj: # Should this be with yhat or y? # yhat = y -, beta) # Doesn't matter because alpha in nullspace of V.T # RW06: (5.8) with np.errstate(invalid = 'ignore'): obj = 0.5*, alpha) + 0.5*np.sum(np.log(ew)) if not return_grad: return obj # Now compute the gradient # Build the derivative of the covariance matrix K wrt to L if self.structure == 'tril': dK = np.zeros((M,M, len(ell))) for idx, (k, el) in enumerate(self.tril_ij): eidx = np.zeros(ell.shape) eidx[idx] = 1. 
# Approximation of the matrix exponential derivative [MH10] h = 1e-10 dL = np.imag(self._make_L(ell + 1j*h*eidx))/h dY =, X.T).T for i in range(M): # Evaluate the dot product # dK[i,j,idx] -=[i] - Y[j], dY[i] - dY[j]) dK[i,:,idx] -= np.sum((Y[i] - Y)*(dY[i] - dY), axis = 1) for idx in range(len(self.tril_ij)): dK[:,:,idx] *= K elif self.structure == 'diag': dK = np.zeros((M,M, len(ell))) for idx, (k, el) in enumerate(self.tril_ij): for i in range(M): dK[i,:,idx] -= (Y[i,k] - Y[:,k])*(Y[i,el] - Y[:,el]) for idx in range(len(self.tril_ij)): dK[:,:,idx] *= K elif self.structure in ['const', 'scalar_mult']: # In the scalar case everything drops and we # simply need # dK[i,j,1] = (Y[i] - Y[j])*(Y[i] - Y[j])*K # which we have already computed dK = -(squareform(dij)*K).reshape(M,M,1) # Now compute the gradient grad = np.zeros(len(ell)) for k in range(len(ell)): #Kinv_dK =,,,dK[:,:,k]))) #I = (ew > 0.1*np.sqrt(np.finfo(float).eps)) #I = (ew>5*np.finfo(float).eps) #print "k", k, "dK", dK.shape Kinv_dK =, (,dK[:,:,k]).T/ew).T) # Note flipped signs from RW06 eq. 
5.9 grad[k] = 0.5*np.trace(Kinv_dK) grad[k] -= 0.5*,, dK[:,:,k])) if return_obj and return_grad: return obj, grad if not return_obj: return grad def _obj(self, ell, X = None, y = None): return self._log_marginal_likelihood(ell, X, y, return_obj = True, return_grad = False, return_alpha_beta = False) def _grad(self, ell, X = None, y = None): return self._log_marginal_likelihood(ell, X, y, return_obj = False, return_grad = True, return_alpha_beta = False) def _fit_init(self, X, y): m = self.m = X.shape[1] self.X = X self.y = y # Setup structure based properties if self.structure == 'tril': if self.rank is None: rank = m else: rank = self.rank self.tril_ij = [ (i,j) for i, j in zip(*np.tril_indices(m)) if i >= (m - rank)] self.tril_flat = np.array([ i*m + j for i,j in self.tril_ij]) elif self.structure == 'diag': self.tril_ij = [ (i,i) for i in range(m)] # Cache Vandermonde matrix on sample points if is not None: self.basis = LegendreTensorBasis(, X = X) self.V = self.basis.V(X) else: self.V = np.zeros((X.shape[0],0))
[docs] def fit(self, X, y, L0 = None): """ Fit a Gaussian process model Parameters ---------- X: array-like (M, m) M input coordinates of dimension m y: array-like (M,) y[i] is the output at X[i] """ X = np.array(X) y = np.array(y).flatten() # Initialized cached values for fit self._fit_init(X, y) if L0 is None: L0 = np.eye(self.m) if self.structure == 'tril': ell0 = np.array([L0[i,j] for i, j in self.tril_ij]) elif self.structure == 'diag': if len(L0.shape) == 1: ell0 = L0.flatten() else: ell0 = np.array([L0[i,i] for i, j in self.tril_ij]) elif self.structure == 'scalar_mult': ell0 = np.array(L0.flatten()[0]) elif self.structure == 'const': ell0 = np.array(L0.flatten()[0]) # Actually do the fitting # TODO: Implement multiple initializations self._fit(ell0)
def _fit(self, ell0): # the implementation in l_bfgs_b seems flaky when we have invalid values #ell, obj, d = fmin_l_bfgs_b(self._obj, ell0, fprime = self._grad, disp = True) res = scipy.optimize.minimize(self._obj, ell0, jac = self._grad, #method = 'L-BFGS-B', #options = {'disp': True}, ) ell = res.x self.L = self._make_L(ell) self.alpha, self.beta = self._log_marginal_likelihood(ell, return_obj = False, return_grad = False, return_alpha_beta = True) self._ell = ell def eval(self, Xnew, return_cov = False): Y =, self.X.T).T Ynew =, Xnew.T).T dij = cdist(Ynew, Y, 'sqeuclidean') K = np.exp(-0.5*dij) if is not None: V = self.basis.V(Xnew) else: V = np.zeros((Xnew.shape[0],0)) fXnew =, self.alpha) +, self.beta) if return_cov: KK = np.exp(-0.5*squareform(pdist(Y, 'sqeuclidean'))) ew, ev = eigh(KK) I = (ew > 500*np.finfo(float).eps) z = ev[:,I].dot( np.diag(1./ew[I]).dot(ev[:,I] #z =[:,I],[I]).dot(ev[:,I] K.T)) )) cov = np.array([ 1 -[i,:], z[:,i]) for i in range(Xnew.shape[0])]) cov[cov< 0] = 0. return fXnew, cov else: return fXnew