!pip install -qq GPy
import autograd.numpy as np
import pandas as pd
import GPy
import matplotlib.pyplot as plt
from autograd import grad
from matplotlib.animation import FuncAnimation
from matplotlib import rc
import seaborn as sns
Basics of kernels in Gaussian processes
A Gaussian process (GP) is a stochastic process where each observation is assumed to be a sample from a Gaussian (Normal) distribution. The probability density function (PDF) for a single observation \(y_1\) is given as, \[ \begin{aligned} y_1 \sim \mathcal{N}\left(\mu_1, \sigma_1^2\right) \end{aligned} \] The joint PDF of multiple such observations is a multivariate Gaussian distribution, \[ \begin{aligned} Y \sim \mathcal{N}\left(M, \Sigma\right) \end{aligned} \] In practice, \(M\) and \(\Sigma\) are modeled as functions of the predictors \(X\).
Now, the multivariate PDF can be rewritten as, \[ \begin{aligned} Y \sim \mathcal{N}\left(\mathcal{F}(X), \mathcal{K}(X,X)\right) \end{aligned} \] where \(\mathcal{F}\) is a mean function and \(\mathcal{K}\) is a kernel (covariance) function. Often \(\mathcal{F}(X)\) is assumed to be zero (\(\mathcal{F}(X)=0\)) and only \(\mathcal{K}(X, X)\) is used to capture the relationship between \(X\) and \(Y\).
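For a concrete picture, a draw from this zero-mean GP prior is nothing more than one sample from a multivariate normal whose covariance matrix is built by the kernel. A minimal sketch (the kernel used here is just an illustrative RBF with unit hyperparameters; it is defined properly in the next section):
# evaluate an illustrative kernel on a small grid and draw one function from the prior
X_demo = np.linspace(0, 1, 5).reshape(-1, 1)
K_demo = lambda A, B: np.exp(-0.5*np.square(A - B.T))   # illustrative RBF, unit variance and lengthscale
y_draw = np.random.multivariate_normal(np.zeros(X_demo.shape[0]), K_demo(X_demo, X_demo))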
The subsequent sections focus on various choices of \(\mathcal{K}\) and their effect on the relationship between \(X\) and \(Y\).
RBF (Radial basis function) Kernel, Stationarity and Isotropy
RBF is one of the most commonly used kernels in GPs due to its infinite differentiability (extreme flexibility). This property lets us model a wide variety of functions \(X \to Y\).
The RBF kernel is given as the following, \[ \begin{aligned} \mathcal{K}(x,x')= \sigma^2\exp\left(-\frac{(x-x')^2}{2l^2}\right) \end{aligned} \] where \(\sigma^2\) is the variance and \(l\) is known as the lengthscale.
Stationarity
RBF is a stationary kernel, so it is invariant to translation in the input space. In other words, \(\mathcal{K}(x,x')\) depends only on \(x-x'\).
Isotropy
RBF is also an isotropic kernel, which means that \(\mathcal{K}(x,x')\) depends only on \(|x-x'|\). Thus, we have \(\mathcal{K}(x,x') = \mathcal{K}(x',x)\).
Let’s visualize a few functions drawn from the RBF kernel.
def K_rbf(X1, X2, sigma=1., l=1.):
    return (sigma**2)*(np.exp(-0.5*np.square(X1-X2.T)/l**2))
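As a quick numerical sanity check of the stationarity and symmetry properties described above (a minimal sketch; the test points and shift are arbitrary):
x1, x2 = np.array([[1.]]), np.array([[3.]])
shift = 10.
# stationarity: K(x1, x2) depends only on x1 - x2, so shifting both inputs changes nothing
assert np.allclose(K_rbf(x1, x2), K_rbf(x1 + shift, x2 + shift))
# symmetry: K(x1, x2) = K(x2, x1)
assert np.allclose(K_rbf(x1, x2), K_rbf(x2, x1))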
Helper functions
def plot_functions(kernel_func, ax0_ylim=(-3,3), ax1_ylim=(-0.1,1.1)):
    # draws 5 functions from a zero-mean GP prior with the given kernel and plots them,
    # along with one slice of the covariance (K(0,X)) and the full covariance matrix;
    # relies on the globals X, sigma, l and k_name set before calling
    mean = np.zeros(X.shape[0])
    cov = kernel_func(X, X, sigma, l)
    functions = np.random.multivariate_normal(mean, cov, size=5)
    fig = plt.figure(figsize=(14,8), constrained_layout=True)
    gs = fig.add_gridspec(2,4)
    ax0 = fig.add_subplot(gs[0, 1:-1])
    ax0.set_ylim(*ax0_ylim)
    ax1 = fig.add_subplot(gs[1, 0:2])
    ax1.set_ylim(*ax1_ylim)
    ax2 = fig.add_subplot(gs[1, 2:4])
    for func in functions:
        ax0.plot(X, func, 'o-')
    ax0.set_xlabel('X'); ax0.set_ylabel('Y'); ax0.set_title('Functions drawn from '+k_name+' kernel')
    ax1.plot(X, cov[:,4]); ax1.set_title('K(0,X)'); ax1.set_xlabel('X'); ax1.set_ylabel('K(0,X)')
    sns.heatmap(cov.round(2), ax=ax2, xticklabels=X.ravel(), yticklabels=X.ravel(), annot=True)
    ax2.set_xlabel('X'); ax2.set_ylabel('X'); ax2.set_title('Covariance matrix')
def animate_functions(kernel_func, val_list, ax0_ylim=(-3,3), ax1_ylim=(-0.1,1.1),
                      k_name='', p_name='', symbol=''):
    # animates the effect of varying one kernel parameter (selected by p_name) over val_list;
    # relies on the global X set before calling
    fig = plt.figure(figsize=(14,8))
    gs = fig.add_gridspec(2,4)
    ax0 = fig.add_subplot(gs[0, 1:-1]); ax1 = fig.add_subplot(gs[1, 0:2]); ax2 = fig.add_subplot(gs[1, 2:4])
    def update(p):
        ax0.cla(); ax1.cla(); ax2.cla()
        ax0.set_ylim(*ax0_ylim); ax1.set_ylim(*ax1_ylim)
        if p_name == 'Lengthscale':
            cov = kernel_func(X, X, l=p)
        elif p_name == 'Variance':
            cov = kernel_func(X, X, sigma=np.sqrt(p))
        elif p_name == 'Offset':
            cov = kernel_func(X, X, c=p)
        elif p_name == 'Period':
            cov = kernel_func(X, X, p=p)
        mean = np.zeros(X.shape[0])
        functions = np.random.multivariate_normal(mean, cov, size=5)
        for func in functions:
            ax0.plot(X, func, 'o-')
        ax0.set_xlabel('X'); ax0.set_ylabel('Y')
        ax0.set_title('Functions drawn from '+k_name+' kernel\n'+p_name+' ('+symbol+') = '+str(p))
        ax1.plot(X, cov[:,4]); ax1.set_title('K(0,X)'); ax1.set_xlabel('X'); ax1.set_ylabel('K(0,X)')
        sns.heatmap(cov.round(2), ax=ax2, xticklabels=X.ravel(), yticklabels=X.ravel(), annot=True, cbar=False)
        ax2.set_xlabel('X'); ax2.set_ylabel('X'); ax2.set_title('Covariance matrix')

    anim = FuncAnimation(fig, update, frames=val_list, blit=False)
    plt.close()
    rc('animation', html='jshtml')
    return anim
Verifying that our kernel is consistent with the GPy implementation.
X = np.linspace(101,1001,200).reshape(-1,1)
sigma, l = 7, 11
assert np.allclose(K_rbf(X,X,sigma,l), GPy.kern.RBF(1, variance=sigma**2, lengthscale=l).K(X,X))

np.random.seed(0)
X = np.arange(-4,5).reshape(-1,1)
sigma = 1.
l = 3.
k_name = 'RBF'
plot_functions(K_rbf, ax0_ylim=(-3.5,3))
Let’s see the effect of varying parameters \(\sigma\) and \(l\) of the RBF kernel function.
np.random.seed(0)
sigma = 1.
val_list = [0.5,1,2,3,4,5]
animate_functions(K_rbf, val_list, k_name='RBF', p_name='Lengthscale', symbol='l')
l = 1.
val_list = [1,4,9,16,25]
animate_functions(K_rbf, val_list, ax0_ylim=(-12,12), ax1_ylim=(-0.1, 26),
                  k_name='RBF', p_name='Variance', symbol='sigma')
As \(l\) increases, the functions drawn from the kernel become smoother, and the covariance between any pair of points increases.
Increasing \(\sigma^2\) increases the overall uncertainty (the width of the band in which roughly 95% of the functions live) uniformly across all points.
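To make the 95% statement concrete, here is a minimal sketch (the factor 1.96 is the usual normal quantile, and the grid is arbitrary): the prior variance at every point is the diagonal of the kernel matrix, which for the RBF kernel equals \(\sigma^2\) irrespective of \(l\).
X_chk = np.linspace(-4, 4, 9).reshape(-1, 1)
for l_chk in [0.5, 2., 5.]:
    prior_var = np.diag(K_rbf(X_chk, X_chk, sigma=3., l=l_chk))
    assert np.allclose(prior_var, 3.**2)   # pointwise prior variance is sigma^2 for every lengthscale
# so roughly 95% of prior function values at any x lie within +/- 1.96*sigma of the zero mean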
Matern Kernel
Matern kernels are given by the following general formula, \[ \begin{aligned} \mathcal{K}(x_1, x_2) = \sigma^2\frac{1}{\Gamma(\nu)2^{\nu-1}}\Bigg( \frac{\sqrt{2\nu}}{l} |x_1-x_2| \Bigg)^\nu K_\nu\Bigg( \frac{\sqrt{2\nu}}{l} |x_1-x_2|\Bigg) \end{aligned} \] where \(\Gamma\) is the gamma function and \(K_\nu\) is the modified Bessel function of the second kind.
The general formula does not give much intuition about how this kernel behaves. In practice, Matern kernels with \(\nu \in \{0.5, 1.5, 2.5\}\) are used; a GP with such a kernel is \((\lceil\nu\rceil-1)\) times (mean-square) differentiable.
The Matern kernels corresponding to these \(\nu\) values are defined as the following, \[ \begin{aligned} Matern12 \to \mathcal{K_{\nu=0.5}}(x_1, x_2) &= \sigma^2\exp\left(-\frac{|x_1-x_2|}{l}\right)\\ Matern32 \to \mathcal{K_{\nu=1.5}}(x_1, x_2) &= \sigma^2\left(1+\frac{\sqrt{3}|x_1-x_2|}{l}\right)\exp\left(-\frac{\sqrt{3}|x_1-x_2|}{l}\right)\\ Matern52 \to \mathcal{K_{\nu=2.5}}(x_1, x_2) &= \sigma^2\left(1+\frac{\sqrt{5}|x_1-x_2|}{l}+\frac{5(x_1-x_2)^2}{3l^2}\right)\exp\left(-\frac{\sqrt{5}|x_1-x_2|}{l}\right) \end{aligned} \] Matern kernels are stationary as well as isotropic. As \(\nu \to \infty\), they converge to the \(RBF\) kernel (we check this numerically after defining the kernels below). \(Matern12\) is also known as the \(Exponential\) kernel in toolkits such as GPy.
Now, let’s draw a few functions from each of these kernels and try to build intuition for each of them.
def K_m12(X1, X2, sigma=1., l=1.): # v = 0.5
    return (sigma**2)*(np.exp(-np.abs(X1-X2.T)/l))

def K_m32(X1, X2, sigma=1., l=1.): # v = 1.5
    return (sigma**2)*(1+((3**0.5)*np.abs(X1-X2.T))/l)*(np.exp(-(3**0.5)*np.abs(X1-X2.T)/l))

def K_m52(X1, X2, sigma=1., l=1.): # v = 2.5
    return (sigma**2)*(1+(((5**0.5)*np.abs(X1-X2.T))/l)+((5*(X1-X2.T)**2)/(3*l**2)))*\
           (np.exp(-(5**0.5)*np.abs(X1-X2.T)/l))
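As a rough numerical illustration of the convergence claim above (a sketch on an arbitrary grid with unit hyperparameters): on the same inputs, Matern52 is already much closer to the RBF kernel than Matern32, which in turn is closer than Matern12.
X_chk = np.linspace(-3, 3, 50).reshape(-1, 1)
err_12 = np.max(np.abs(K_m12(X_chk, X_chk) - K_rbf(X_chk, X_chk)))
err_32 = np.max(np.abs(K_m32(X_chk, X_chk) - K_rbf(X_chk, X_chk)))
err_52 = np.max(np.abs(K_m52(X_chk, X_chk) - K_rbf(X_chk, X_chk)))
assert err_52 < err_32 < err_12   # increasing nu brings the Matern kernel closer to RBF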
Verifying that our kernels are consistent with the GPy implementations.
X = np.linspace(101,1001,50).reshape(-1,1)
assert np.allclose(K_m32(X,X,sigma=7.,l=11.), GPy.kern.Matern32(1,lengthscale=11.,variance=7**2).K(X,X))
assert np.allclose(K_m52(X,X,sigma=7.,l=11.), GPy.kern.Matern52(1,lengthscale=11.,variance=7**2).K(X,X))
X = np.arange(-4,5).reshape(-1,1)
sigma = 1.
l = 3.

fig, ax = plt.subplots(3,2,figsize=(14,10))
names = ['Matern12', 'Matern32', 'Matern52']
for k_i, kernel in enumerate([K_m12, K_m32, K_m52]):
    mean = np.zeros(X.shape[0])
    cov = kernel(X, X, sigma, l)
    functions = np.random.multivariate_normal(mean, cov, size=5)
    for func in functions:
        ax[k_i,0].plot(X, func)
    ax[k_i,0].set_xlabel('X'); ax[k_i,0].set_ylabel('Y'); ax[k_i,0].set_title('Functions drawn from '+names[k_i]+' kernel')
    sns.heatmap(cov.round(2), ax=ax[k_i,1], xticklabels=X.ravel(), yticklabels=X.ravel(), annot=True)
    ax[k_i,1].set_xlabel('X'); ax[k_i,1].set_ylabel('X'); ax[k_i,1].set_title('Covariance matrix')
plt.tight_layout()
From the above plot, we can see that the sampled functions become smoother as \(\nu\) increases. In terms of smoothness of the resulting functions, the kernels are ordered as Matern12 < Matern32 < Matern52.
Let us see the effect of varying the lengthscale \(l\) on Matern32, which is the most popular of the three.
np.random.seed(0)
sigma = 1.
val_list = [0.5,1,2,3,4,5]
animate_functions(K_m32, val_list, k_name='Matern32', p_name='Lengthscale', symbol='l')
We can see that the Matern32 kernel behaves similarly to RBF as \(l\) varies, though Matern32 is less smooth than RBF. A quick comparison makes this clear.
X = np.linspace(-10,10,100).reshape(-1,1)
plt.plot(X, K_rbf(X,X, l=3.)[:,50], label='RBF')
plt.plot(X, K_m32(X,X, l=3.)[:,50], label='Matern32')
plt.legend(); plt.xlabel('X'); plt.ylabel('Covariance (K(0,X))')
plt.title('K(0,X)')
Periodic Kernel
The periodic kernel is given as the following, \[ \begin{aligned} \mathcal{K}(x_1,x_2)= \sigma^2\exp\left(-\frac{\sin^2(\pi|x_1 - x_2|/p)}{2l^2}\right) \end{aligned} \] where \(p\) is the period. Let’s visualize a few functions drawn from this kernel.
def K_periodic(X1, X2, sigma=1., l=1., p=3.):
    return sigma**2 * np.exp(-0.5*np.square(np.sin(np.pi*(X1-X2.T)/p))/l**2)
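A small sketch to confirm the defining property (arbitrary test points): shifting one input by the period \(p\) leaves the covariance unchanged, so points that are a multiple of \(p\) apart are perfectly correlated.
x1, x2, p_chk = np.array([[0.]]), np.array([[1.3]]), 3.
# covariance is unchanged when one input is shifted by a full period
assert np.allclose(K_periodic(x1, x2, p=p_chk), K_periodic(x1, x2 + p_chk, p=p_chk))
# points exactly p apart have the same covariance as a point with itself (correlation 1)
assert np.allclose(K_periodic(x1, x1 + p_chk, p=p_chk), K_periodic(x1, x1))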
X = np.linspace(10,1001,50).reshape(-1,1)
assert np.allclose(K_periodic(X,X,sigma=7.,l=11.,p=3.),
                   GPy.kern.StdPeriodic(1,lengthscale=11.,variance=7**2,period=3.).K(X,X))
np.random.seed(0)
X = np.arange(-4,5).reshape(-1,1)
sigma = 1
l = 1.
p = 3.
k_name = 'Periodic'
plot_functions(K_periodic)
We will now investigate the effect of varying the period \(p\).
np.random.seed(0)
val_list = [1., 2., 3., 4., 5.]

animate_functions(K_periodic, val_list, ax1_ylim=(0.4,1.1),
                  k_name='Periodic', p_name='Period')
From the above animation, we can see that all points that are a distance \(p\) apart take exactly the same value, because their correlation is exactly 1 (with \(\sigma=1\), covariance equals correlation).
Now, we will investigate the effect of changing the lengthscale \(l\) while keeping the other parameters constant.
np.random.seed(0)
val_list = [1., 2., 3., 4., 5.]

animate_functions(K_periodic, val_list, ax1_ylim=(0.6,1.1),
                  k_name='Periodic', p_name='Lengthscale', symbol='l')
We can see that the correlation between a pair of locations \(\{x_1,x_2 : |x_1-x_2|<p\}\) increases as the lengthscale is increased.
Linear Kernel
The linear kernel (a.k.a. dot-product kernel) is given as the following, \[ \begin{aligned} \mathcal{K}(x_1,x_2)= (x_1-c)(x_2-c)+\sigma^2 \end{aligned} \] Let’s visualize a few functions drawn from the linear kernel.
def K_lin(X1, X2, sigma=1., c=1.):
    return (X1-c)@(X2.T-c) + sigma**2
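A quick numeric contrast with the stationary kernels above (a minimal sketch with arbitrary points; the non-stationarity of the linear kernel is revisited visually later in this section): shifting both inputs changes the covariance for K_lin but not for K_rbf.
x1, x2, shift = np.array([[1.]]), np.array([[2.]]), 5.
assert np.allclose(K_rbf(x1, x2), K_rbf(x1 + shift, x2 + shift))       # stationary
assert not np.allclose(K_lin(x1, x2), K_lin(x1 + shift, x2 + shift))   # non-stationary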
np.random.seed(0)
sigma = 1.
c = 1.
k_name = 'Linear'   # set so the plot title reflects the linear kernel

plot_functions(K_lin, ax0_ylim=(-10,5), ax1_ylim=(-3,7))
Let’s see the effect of varying parameters \(\sigma\) and \(c\) of the linear kernel function.
val_list = [-3,-2,-1,0,1,2,3]

animate_functions(K_lin, val_list, ax0_ylim=(-15,12), ax1_ylim=(-3,23),
                  p_name='Offset', symbol='c')
np.random.seed(1)
val_list = np.square(np.array([1,2,3,4,5,8]))

animate_functions(K_lin, val_list, ax0_ylim=(-25,15), ax1_ylim=(-5,110),
                  p_name='Variance', symbol='sigma')
Varying the \(c\) parameter changes the position of the shallow region in the covariance matrix. In other words, as \(x \to c\), the variance of points close to \(x\) approaches \(\sigma^2\), while points farther from \(c\) have monotonically increasing variance.
Increasing \(\sigma^2\) adds a constant to all variances and covariances. This allows more uncertainty across all points and weakens the monotonic growth of variance with distance from \(c\).
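The "adds a constant" statement can be checked directly (a minimal sketch on an arbitrary grid):
X_chk = np.linspace(-3, 3, 7).reshape(-1, 1)
s = 2.
# raising sigma from 0 to s adds s**2 to every entry of the covariance matrix
assert np.allclose(K_lin(X_chk, X_chk, sigma=s), K_lin(X_chk, X_chk, sigma=0.) + s**2)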
Non-stationary behaviour of Linear kernel
Unlike the stationary kernels above, the linear kernel is not invariant to translations in the input space. The comparison below visually supports this claim.
fig, ax = plt.subplots(2,2,figsize=(14,8), sharex=True)
kerns = [K_rbf, K_m32, K_periodic, K_lin]
k_names = ['RBF', 'Matern32', 'Periodic', 'Linear']
X = np.linspace(-10,10,21).reshape(-1,1)
def update(x):
    count = 0
    for i in range(2):
        for j in range(2):
            ax.ravel()[count].cla()
            tmp_kern = kerns[count]
            mean = np.zeros(X.shape[0])
            cov = tmp_kern(X,X)
            ax.ravel()[count].plot(X, cov[:,x])
            ax.ravel()[count].set_xlim(X[x-3],X[x+3])
            ax.ravel()[count].set_xlabel('X')
            ax.ravel()[count].set_ylabel('K('+str(X[x].round(2))+',X)')
            ax.ravel()[count].set_title('Covariance K('+str(X[x].round(2))+',X) for '+k_names[count]+' kernel')
            count += 1
    ax.ravel()[3].set_ylim(-5,80)
    plt.tight_layout()

anim = FuncAnimation(fig, update, frames=[5,7,9,11,13,15], blit=False)
plt.close()
rc('animation', html='jshtml')
anim
Multiplications of kernels
If a single kernel has high bias when fitting a dataset, we can combine multiple kernels through multiplication and/or summation. First, let us see the effect of multiplying a few kernels.
Periodic * Linear
X = np.linspace(-10,10,100).reshape(-1,1)
plt.plot(X, K_periodic(X,X,sigma=2.)[:,50], label='Periodic')
plt.plot(X, K_lin(X,X,sigma=0.01,c=0)[:,50], label='Linear')
plt.plot(X, K_periodic(X,X,sigma=2.)[:,50]*K_lin(X,X,sigma=0.01,c=0)[:,50], label='Periodic*Linear')
plt.legend(bbox_to_anchor=(1,1)); plt.xlabel('X'); plt.ylabel('Covariance')
plt.title('K(0,*)')
Linear * Linear
X = np.linspace(-1,1,100).reshape(-1,1)
plt.plot(X, K_lin(X,X,c=-1)[:,50], label='Linear1')
plt.plot(X, K_lin(X,X,c=1)[:,50], label='Linear2')
plt.plot(X, K_lin(X,X,c=0.5)[:,50], label='Linear3')
plt.plot(X, K_lin(X,X,c=-1)[:,50]*K_lin(X,X,c=1)[:,50], label='Linear1*Linear2')
plt.plot(X, K_lin(X,X,c=-1)[:,50]*K_lin(X,X,c=1)[:,50]*K_lin(X,X,c=0.5)[:,50], label='Linear1*Linear2*Linear3')
plt.legend(bbox_to_anchor=(1,1))
Matern * Linear
X = np.linspace(-1,1,100).reshape(-1,1)
k1 = K_lin(X,X,c=1)[:,50]
k2 = K_m32(X,X)[:,50]
plt.plot(X, k1, label='Linear')
plt.plot(X, k2, label='Matern32')
plt.plot(X, k1*k2, label='Matern32*Linear')
plt.legend(bbox_to_anchor=(1,1))
Appendix (Extra material)
At this stage, we do not know why the functions drawn from the linear-kernel-based covariance matrix end up being straight lines with various intercepts and slopes.
Predicting at a single point after observing value at a single point
Let’s see what the GP prediction looks like after observing the value at a single point.
Our kernel function is given by \(K(x,x')=(x-c) \cdot (x'-c)+\sigma^2\).
Now, we observe a value \(y\) at a location \(x\) and we want to predict the value \(y^*\) at a location \(x^*\). \[ \begin{aligned} (y^*|x,y,x^*) &= K(x^*,x) \cdot K^{-1}(x,x)\cdot y \\ &= \left(\frac{(x-c)(x^*-c)+\sigma^2}{(x-c)^2+\sigma^2}\right)\cdot y \end{aligned} \] Since \(x\), \(y\), \(c\) and \(\sigma^2\) are fixed, \(y^*\) is an affine (straight-line) function of \(x^*\).
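A small numeric check of this closed form (a sketch with arbitrary values, computed inline with a throwaway lambda rather than any function defined later): equal steps in \(x^*\) give equal steps in the prediction, i.e. the posterior mean is a straight line.
x, y = 2.0, 5.0
c, sigma2 = 1.0, 4.0
K = lambda a, b: (a - c)*(b - c) + sigma2              # the linear kernel above
pred = lambda x_star: K(x_star, x) / K(x, x) * y       # posterior mean after one noise-free observation
assert np.isclose(pred(3.) - pred(2.), pred(7.) - pred(6.))   # constant slope, so affine in x_star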
Predicting at a single point after observing values at two points
Now, we’ll take the case where two values \(\{y_1, y_2\}\) are observed at \(\{x_1, x_2\}\). Let us try to predict the value \(y^*\) at \(x^*\).
\[
\begin{aligned}
y^* &= \begin{bmatrix} K(x_1, x^*) & K(x_2,x^*) \end{bmatrix}
\begin{bmatrix} K(x_1, x_1) & K(x_1,x_2) \\ K(x_2, x_1) & K(x_2,x_2) \end{bmatrix}^{-1}
\begin{bmatrix} y_1 \\ y_2 \end{bmatrix}\\
&= \begin{bmatrix} (x_1-c)(x^*-c)+\sigma^2 & (x_2-c)(x^*-c)+\sigma^2 \end{bmatrix}
\begin{bmatrix} (x_1-c)^2+\sigma^2 & (x_1-c)(x_2-c)+\sigma^2 \\ (x_2-c)(x_1-c)+\sigma^2 & (x_2-c)^2+\sigma^2 \end{bmatrix}^{-1}
\begin{bmatrix} y_1 \\ y_2 \end{bmatrix}\\
&= \begin{bmatrix} (x_1-c)(x^*-c)+\sigma^2 & (x_2-c)(x^*-c)+\sigma^2 \end{bmatrix}
\frac{1}{\sigma^2(x_1-x_2)^2}
\begin{bmatrix} (x_2-c)^2+\sigma^2 & -[(x_1-c)(x_2-c)+\sigma^2] \\ -[(x_2-c)(x_1-c)+\sigma^2] & (x_1-c)^2+\sigma^2 \end{bmatrix}
\begin{bmatrix} y_1 \\ y_2 \end{bmatrix}
\end{aligned}
\tag{1}
\]
From the inverse in Eq. (1), we can see that if \(\sigma^2=0\) the kernel matrix is not invertible, because its determinant \(\sigma^2(x_1-x_2)^2\) is zero. In other words, with \(\sigma^2=0\), observing a single point is enough: after observing it we can infer the values at infinitely many points.
Simplifying Eq. (1) further, it reduces to the following equation, \[ \begin{aligned} y^* = \frac{(x_1y_2-x_2y_1)+x^*(y_1-y_2)}{(x_1-x_2)} \end{aligned} \] Interestingly, the output no longer depends on \(c\) or \(\sigma^2\). Let us verify experimentally whether this also holds when more than two data points are observed.
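A quick numeric check of this two-point result (a minimal sketch with arbitrary values): the GP prediction computed from the kernel matrices matches the interpolating-line formula, and it does so for more than one choice of \(c\) and \(\sigma^2\).
x1, x2, y1, y2, x_star = 1.0, 4.0, 2.0, 8.0, 2.5
line = ((x1*y2 - x2*y1) + x_star*(y1 - y2)) / (x1 - x2)
for c_chk, sigma2_chk in [(0.0, 1.0), (3.0, 10.0)]:
    K = lambda a, b: (a - c_chk)*(b - c_chk) + sigma2_chk
    K_xx = np.array([[K(x1, x1), K(x1, x2)], [K(x2, x1), K(x2, x2)]])
    k_star = np.array([K(x1, x_star), K(x2, x_star)])
    gp_pred = k_star @ np.linalg.inv(K_xx) @ np.array([y1, y2])
    assert np.isclose(gp_pred, line)   # same prediction regardless of c and sigma^2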
Preparing useful functions
from scipy.optimize import minimize
def cov_func(x, x_prime, sigma, c):
    return (x-c)@(x_prime-c) + sigma**2

def neg_log_likelihood(params):
    # negative log marginal likelihood of the linear-kernel GP with Gaussian noise
    n = X.shape[0]
    sigma, c, noise_std = params
    cov = cov_func(X, X.T, sigma, c)
    cov = cov + (noise_std**2)*np.eye(n)
    nll_ar = 0.5*(Y.T@np.linalg.pinv(cov)@Y) + 0.5*n*np.log(2*np.pi) + 0.5*np.log(np.linalg.det(cov))
    return nll_ar[0,0]

def predict(params):
    # GP posterior mean and covariance at X_test, given the training data (X, Y)
    sigma, c, noise_std = params
    k = cov_func(X, X.T, sigma, c)
    np.fill_diagonal(k, k.diagonal()+noise_std**2)
    k_inv = np.linalg.pinv(k)
    k_star = cov_func(X_test, X.T, sigma, c)

    mean = k_star@k_inv@Y
    cov = cov_func(X_test, X_test.T, sigma, c) - k_star@k_inv@k_star.T
    return mean, cov
Observing more than two points and changing hyperparameters manually
X = np.array([3,4,5,6,7,8]).reshape(-1,1)
Y = np.array([6,9,8,11,10,13]).reshape(-1,1)
X_test = np.linspace(1,8,20).reshape(-1,1)
params_grid = [[1., 0.01, 10**-10], [100., 1., 10**-10],
               [100., 0.01, 10**-10], [1., 2., 1.]] # sigma, c, noise_std

# ideal straight-line fit via the normal equations (ordinary least squares)
X_extra = np.hstack([np.ones((X.shape[0], 1)), X])
Theta = np.linalg.pinv(X_extra.T@X_extra)@X_extra.T@Y
X_test_extra = np.hstack([np.ones((X_test.shape[0], 1)), X_test])
Y_test_ideal = X_test_extra@Theta
fig, ax = plt.subplots(1,4,figsize=(16,5), sharey=True)
means = []
for p_i, params in enumerate(params_grid):
    Y_test_mean, Y_test_cov = predict(params)

    means.append(Y_test_mean)
    ax[p_i].scatter(X, Y, label='train')
    ax[p_i].scatter(X_test, Y_test_mean, label='test')
    ax[p_i].legend(); ax[p_i].set_xlabel('X'); ax[p_i].set_ylabel('Y')
    ax[p_i].set_title('sigma='+str(params[0])+', c='+str(params[1])+', noise='+str(params[2]))
np.allclose(Y_test_ideal, means[0]),\
np.allclose(Y_test_ideal, means[1]),\
np.allclose(Y_test_ideal, means[2]),\
np.allclose(Y_test_ideal, means[3])
(True, True, True, False)
model = GPy.models.GPRegression(X, Y, GPy.kern.Linear(input_dim=1))
# model['Gaussian_noise'].fix(10**-10)
# model.kern.variances.fix(10**-10)
model.optimize()
model.plot()
plt.plot(X_test, Y_test_ideal, label='Normal Eq. fit')
plt.plot(X_test, model.predict(X_test)[0], label='Prediction')
plt.legend()
model
Model: GP regression
Objective: 13.51314321804978
Number of Parameters: 2
Number of Optimization Parameters: 2
Updates: True
GP_regression.          |              value | constraints | priors
linear.variances        |  2.806515343539501 |         +ve |
Gaussian_noise.variance | 2.0834221617534134 |         +ve |
We can see that the fit does not change as \(c\) and \(\sigma\) change. The 4th fit does not match the ideal fit obtained from the normal equation because of its high noise. Now, let us estimate the parameters by minimizing the negative log marginal likelihood.
params = [1., 1., 1.]
# note: the noise_std bounds pin it at 1e-5, i.e. an (almost) noise-free fit
result = minimize(neg_log_likelihood, params, bounds=[(10**-5, 10**5), (10**-5, 10**5), (10**-5, 10**-5)])
params = result.x
print(params, result.fun)
Y_test_mean, Y_test_cov = predict(params)
plt.scatter(X, Y, label='train')
plt.scatter(X_test, Y_test_mean, label='test')
plt.legend(); plt.xlabel('X'); plt.ylabel('Y')
params = np.round(params, 4)
plt.title('sigma='+str(params[0])+', c='+str(params[1])+', noise='+str(params[2]))
np.allclose(Y_test_ideal, Y_test_mean)
[9.99998123e-01 9.99998123e-01 1.00000000e-05] 10207223403405.541
False
# redefine the NLL with separate scalar arguments so that autograd can differentiate
# with respect to each hyperparameter
def neg_log_likelihood(sigma, c, noise_std):
    n = X.shape[0]
    cov = cov_func(X, X.T, sigma, c)
    cov = cov + (noise_std**2)*np.eye(n)
    nll_ar = 0.5*(Y.T@np.linalg.pinv(cov)@Y) + 0.5*n*np.log(2*np.pi) + 0.5*np.log(np.linalg.det(cov))
    return nll_ar[0,0]
grad_func = grad(neg_log_likelihood, argnum=[0,1,2])
alpha = 0.01
loss = []
sigma, c, noise_std = 1., 1., 1.
for _ in range(5000):
    grads = grad_func(sigma, c, noise_std)
    # print(grads)
    sigma = sigma - alpha*grads[0]
    c = c - alpha*grads[1]
    noise_std = noise_std - alpha*grads[2]
    loss.append(neg_log_likelihood(sigma, c, noise_std))
print(sigma, c, noise_std)

plt.plot(loss);
loss[-1]
7.588989986845149 -2.830840439162303 32.2487569348891
31.05187173290998
params = sigma, c, noise_std
Y_test_mean, Y_test_cov = predict(params)
plt.scatter(X, Y, label='train')
plt.scatter(X_test, Y_test_mean, label='test')
plt.legend(); plt.xlabel('X'); plt.ylabel('Y')
params = np.round(params, 4)
plt.title('sigma='+str(params[0])+', c='+str(params[1])+', noise='+str(params[2]))
np.allclose(means[0], Y_test_mean, rtol=10**-1, atol=10**-1)
False