Input Warped GPs - A failed idea

An idea for input-warping GPs
ML
Author

Zeel B Patel

Published

October 23, 2021

Comments

  • We warp the inputs \(\mathbf{x}\) into \(\mathbf{w}\cdot\mathbf{x} + b\), where \(\mathbf{w}\) is an input-dependent weight and \(b\) a learned bias (a toy sketch follows this list).
  • We learn a second-level GP over \(\mathbf{w}\), anchored at a small set of latent locations.
  • We apply a penalty on \(\mathbf{w}\) so that it does not vary away from 1 unnecessarily.
  • See the problems at the end of the notebook.
  • We still need to check the mathematical concerns related to this transformation.
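
As a rough sketch of the idea (a toy stand-in only: the anchor locations, weights, and the linear interpolation below are hand-picked, whereas the notebook learns the weights with a second-level GP):

import numpy as np

# Toy sketch of the warp x -> w(x) * x + b.
# x_bar plays the role of the KMeans anchor points, w_bar the learnable anchor weights,
# and np.interp stands in for the second-level GP posterior mean over the weights.
x_bar = np.array([0.0, 0.5, 1.0])   # anchor locations
w_bar = np.array([1.2, 0.8, 1.1])   # weights at the anchors
b = 0.05                            # bias

def w(x):
    return np.interp(x, x_bar, w_bar)

x = np.linspace(0, 1, 6)
x_warped = w(x) * x + b
print(np.round(x_warped, 3))

# During training, a penalty of the form mean((w(x) - 1)^2) keeps the warp close to
# the identity unless the data justify a stronger distortion.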
import math
import numpy as np
import torch
import gpytorch
from matplotlib import pyplot as plt
import regdata as rd
from sklearn.cluster import KMeans
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

class ExactNSGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, num_latent):
        super(ExactNSGPModel, self).__init__(train_x, train_y, likelihood)
#         inds = np.random.choice(train_x.shape[0], size=num_latent, replace=False)
#         self.x_bar = train_x[inds]
        self.x_bar = torch.tensor(KMeans(n_clusters=num_latent).fit(train_x).cluster_centers_).to(train_x)
        self.w_bar = torch.nn.Parameter(torch.ones(num_latent,).to(self.x_bar))
        self.bias = torch.nn.Parameter(torch.zeros(1,).to(self.x_bar))
        self.latent_likelihood = gpytorch.likelihoods.GaussianLikelihood()
#       We could fix the latent noise at a minimum value, but that is not ideal; ideally the noise should shrink to a reasonable value on its own.
#         self.latent_likelihood.raw_noise.requires_grad = False
#         self.latent_likelihood.raw_noise = torch.tensor(-10.)
        self.latent_model = ExactGPModel(self.x_bar, self.w_bar, self.latent_likelihood)
        
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        # Interpolate the warping weights at x using the latent GP's predictive posterior
        self.latent_model.eval()
        with gpytorch.settings.detach_test_caches(False):  # needed to backpropagate through the predictive posterior
            self.latent_model.set_train_data(self.x_bar, self.w_bar, strict=False)
            self.w = self.latent_likelihood(self.latent_model(x))  # predictive posterior over the weights
        # Warp the inputs: scale each x by its interpolated weight and shift by the bias
        x_warped = x*self.w.mean[:, None] + self.bias
        mean_x = self.mean_module(x_warped)
        covar_x = self.covar_module(x_warped)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
def training(model, likelihood):
    training_iter = 100

    # Find optimal model hyperparameters
    model.train()
    likelihood.train()

    # Use the adam optimizer
    optimizer = torch.optim.Adam([
        {'params': model.parameters()},  # Includes GaussianLikelihood parameters
    ], lr=0.1)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    for i in range(training_iter):
        # Zero gradients from previous iteration
        optimizer.zero_grad()
        # Output from model
        output = model(train_x)
        # Calc loss and backprop gradients
        try:
            # NSGP: penalize warping weights that drift away from 1
            loss = -mll(output, train_y) + torch.square(model.w.mean - 1).mean()
#             print(model.latent_likelihood.noise)
        except AttributeError:
            # Plain GP has no model.w, so no penalty term
            loss = -mll(output, train_y)
        loss.backward()
#         print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
#             i + 1, training_iter, loss.item(),
#             model.covar_module.base_kernel.lengthscale.item(),
#             model.likelihood.noise.item()
#         ))
        optimizer.step()
    
def predict_plot(model, likelihood, title):
    # Get into evaluation (predictive posterior) mode
    model.eval()
    likelihood.eval()

    # Make predictions on the dataset's test inputs by feeding the model output through the likelihood
    with torch.no_grad():
        observed_pred = likelihood(model(test_x))

    with torch.no_grad():
        # Initialize plot
        f, ax = plt.subplots(1, 1, figsize=(10, 6))

        # Get upper and lower confidence bounds
        lower, upper = observed_pred.confidence_region()
        # Plot training data as black stars
        ax.plot(train_x.numpy(), train_y.numpy(), 'k*')
        # Plot predictive means as blue line
        ax.plot(test_x.numpy(), observed_pred.mean.numpy(), 'b')
        # Shade between the lower and upper confidence bounds
        ax.fill_between(test_x.numpy().ravel(), lower.numpy(), upper.numpy(), alpha=0.5)
        ax.legend(['Observed Data', 'Mean', 'Confidence'])
        ax.set_title(title)
    return observed_pred
def GP(num_latent):
    # num_latent is unused here; the argument only keeps the signature consistent with NSGP

    # initialize likelihood and model
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactGPModel(train_x, train_y, likelihood)
    
    training(model, likelihood)
    predict_plot(model, likelihood, 'GP')

def NSGP(num_latent):

    # initialize likelihood and model
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactNSGPModel(train_x, train_y, likelihood, num_latent)
    
    training(model, likelihood)
    observed_pred = predict_plot(model, likelihood, 'NSGP')
    
    with torch.no_grad():
        model.train()
        model.forward(test_x)  # refresh model.w with weights interpolated at the test inputs
        plt.figure(figsize=(10,6))
        plt.plot(test_x*model.w.mean[:, None], observed_pred.mean.numpy())
        plt.title('Warped test inputs v/s test outputs')
    
    with torch.no_grad():
        model.train()
        model.forward(test_x)  # refresh model.w with weights interpolated at the test inputs
        plt.figure(figsize=(10,6))
        plt.plot(test_x, model.w.mean, label='interpolated')
        plt.scatter(model.x_bar, model.w_bar, label='learned')
        plt.ylim(0,2)
        plt.title('Test input v/s weights')
        plt.legend()

Testing over various datasets

For each dataset from regdata, we fit a standard GP and the warped-input (NSGP) model and plot the predictions.

train_x, train_y, test_x = rd.DellaGattaGene(backend='torch').get_data()
GP(0)
NSGP(num_latent=7)

train_x, train_y, test_x = rd.Heinonen4(backend='torch').get_data()
GP(0)
NSGP(num_latent=10)

train_x, train_y, test_x = rd.Jump1D(backend='torch').get_data()
GP(0)
NSGP(num_latent=10)

train_x, train_y, test_x = rd.MotorcycleHelmet(backend='torch').get_data()
GP(0)
NSGP(num_latent=10)

train_x, train_y, test_x = rd.Olympic(backend='torch').get_data()
GP(0)
NSGP(num_latent=10)

train_x, train_y, test_x = rd.SineJump1D(backend='torch').get_data()
GP(0)
NSGP(num_latent=10)

train_x, train_y, test_x = rd.SineNoisy(backend='torch').get_data()
GP(0)
NSGP(num_latent=10)

Problems

  • The transformation from x to x_warped is not monotonic: where \(\mathbf{w}\) varies, the warp can map distinct inputs to the same warped location (see the toy example below).
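
A toy illustration (the weight profile is hand-picked, not one learned by the model): if \(w(x)\) drops quickly enough, \(w(x)\cdot x\) decreases while \(x\) increases, so two distinct inputs can be mapped to the same warped location and become indistinguishable to the main GP.

import numpy as np

# Toy demonstration that x -> w(x) * x need not be monotonic when w(x) varies.
x = np.linspace(0, 1, 11)
w = np.where(x < 0.5, 1.5, 0.5)   # hand-picked weights with a sharp dip at x = 0.5
x_warped = w * x

print(np.round(x_warped, 2))          # e.g. x = 0.4 maps to 0.6, but x = 0.5 maps to 0.25
print(np.all(np.diff(x_warped) > 0))  # False: the warp folds the input space onto itself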