5 Coresets for Linear Regression
We will see one of the methods to select coreset points for linear regression. We calculate “Ridge leverage scores” to create a sampling distribution for coreset selection. The process to calculate sampling distribution \(p(X)\) is as the following.
- \(X_* = \begin{bmatrix}X \\ \sqrt{\lambda} I_d\end{bmatrix}\)
- \(U \Sigma V^T = X_*\)
- \(\mathbf{p}(X)_i = ||U_{i,:}||_2^2 \Big/ \sum_{j=1}^{n} ||U_{j,:}||_2^2,\quad i=1,\dots,n\) (row-wise squared norms of the first \(n\) rows of \(U\), normalized to sum to 1)
Now, let us implement this in a python function.
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import rc
import warnings

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Global matplotlib configuration: larger font size and LaTeX text rendering.
rc('font', size=16)
rc('text', usetex=True)
def plot_essentials():
    """Finalize the current figure: add a legend when any labeled artists
    exist, tighten the layout, then show and close the figure.

    Called at the end of every plotting cell in this notebook.
    """
    hand, labs = plt.gca().get_legend_handles_labels()
    # Only draw a legend when at least one artist carries a label.
    if len(hand) > 0:
        plt.legend(hand, labs)
    plt.tight_layout()
    plt.show()
    plt.close()
def get_proba(x, lmd):
    """Compute ridge-leverage-score sampling probabilities for 1-D inputs.

    Builds the design matrix [1, x], augments it with sqrt(lmd) * I_d
    (the ridge term), takes its thin SVD, and returns the normalized
    squared row norms of U restricted to the data rows.

    Parameters
    ----------
    x : ndarray of shape (n, 1) or (n,)
        Input features (flattened internally via ravel).
    lmd : float
        Ridge regularization strength (lambda >= 0).

    Returns
    -------
    proba : ndarray of shape (n,)
        Sampling distribution over the n data points (sums to 1).
    """
    x_list = [np.ones(x.shape[0]), x.ravel()]
    x_extra = np.vstack(x_list).T
    # Augment with sqrt(lambda) * I_d so the scores are ridge leverage scores.
    A = np.vstack([x_extra, np.eye(x_extra.shape[1]) * np.sqrt(lmd)])
    U, S, V = np.linalg.svd(A, full_matrices=False)
    U1 = U[:x.shape[0], :]  # rows of U that correspond to the data points
    proba = np.square(U1).sum(axis=1) / np.square(U1).sum(axis=1).sum()
    return proba
We will sample the coresets for Scikit-learn diabetes dataset.
5.1 Coresets for linear regression (order=1)
from sklearn.datasets import load_diabetes

# Load the diabetes dataset and keep a single feature (column index 2).
X, y = load_diabetes(return_X_y=True)
X = X[:, np.newaxis, 2]

# Hold out the last 20 points as a test set.
X_train, y_train = X[:-20], y[:-20]
X_test, y_test = X[-20:], y[-20:]

plt.scatter(X_train, y_train, label='Train')
plt.scatter(X_test, y_test, label='Test')
plt.xlabel('X'); plt.ylabel('Y')
plot_essentials()
Now, let us visualize sampling probabilities of the train dataset.
lmd = 0.0001
X_proba = get_proba(X_train, lmd)

# Color each training point by its coreset-sampling probability.
plt.scatter(X_train, y_train, c=X_proba, label='')
plt.colorbar()
plt.title('Sampling probability distribution over training data')
plt.xlabel('X'); plt.ylabel('Y')
plot_essentials()
Sampling 10% of the training data as a coreset.
np.random.seed(0)
N_core = len(X_train) // 10  # coreset size: 10% of the training data

# Sample indices (with replacement) from the ridge-leverage distribution.
core_idx = np.random.choice(len(X_train), size=N_core, replace=True, p=X_proba)

X_core = X_train[core_idx]
y_core = y_train[core_idx]
Checking fit on original dataset and on coreset.
# Fit identical ridge models on the full training set and on the coreset.
FullModel = Ridge(alpha=lmd).fit(X_train, y_train)
CoreModel = Ridge(alpha=lmd).fit(X_core, y_core)

plt.scatter(X_train, y_train, label='Train data')
plt.scatter(X_core, y_core, label='Coreset')

# Both models are evaluated on the full training inputs for comparison.
y_pred_full = FullModel.predict(X_train)
y_pred_core = CoreModel.predict(X_train)

plt.plot(X_train, y_pred_full, label='Fit on train')
plt.plot(X_train, y_pred_core, label='Fit on coreset')
plt.xlabel('X'); plt.ylabel('Y')
plot_essentials()
Let us do cost comparison to verify if this is an \(\varepsilon\)-coreset. We take \(\varepsilon=0.3\).
eps = 0.3
# Mean squared cost of each model, both evaluated on the full training set.
FullModelCost = np.square(FullModel.predict(X_train) - y_train).sum() / len(X_train)
CoreModelCost = np.square(CoreModel.predict(X_train) - y_train).sum() / len(X_train)
\[ \frac{cost(X,Q^*_C)}{cost(X,Q^*_X)} \le \frac{1+\varepsilon}{1-\varepsilon} \]
We have above relationship 1.02 \(\le\) 1.86, thus we found a valid \(\varepsilon\)-coreset here.
5.2 Coresets for linear regression (order=5)
Let us generate random data using order 5 polynomial kernel.
import GPy

np.random.seed(123)

# Order-5 polynomial kernel used both to generate and later to fit the data.
kernel = GPy.kern.Poly(1, order=5, bias=1, scale=1, variance=1)
sigma_n = 1.1  # observation-noise standard deviation
N = 100

X = np.random.normal(0, 1, N).reshape(-1, 1)
N = X.shape[0]

cov_matrix = kernel.K(X, X)
cov_matrix += np.eye(N) * sigma_n**2  # add noise variance on the diagonal

# Draw targets from a zero-mean Gaussian with the (noisy) kernel covariance.
y_poly = np.random.multivariate_normal(np.zeros(N), cov_matrix).reshape(-1, 1)
plt.scatter(X, y_poly, s=5)
plt.xlabel('X'); plt.ylabel('y')
plt.ylim(-10, 30)
plot_essentials()
Sampling uniform samples and coreset samples.
def get_proba(x, lmd, order=1):
    """Ridge-leverage-score sampling probabilities for a polynomial basis.

    Generalizes the earlier linear version: the design matrix contains
    columns [1, x, x**2, ..., x**order], augmented with sqrt(lmd) * I_d.

    Parameters
    ----------
    x : ndarray of shape (n, 1) or (n,)
        Input features (flattened internally via ravel).
    lmd : float
        Ridge regularization strength (lambda >= 0).
    order : int, default 1
        Degree of the polynomial feature expansion.

    Returns
    -------
    proba : ndarray of shape (n,)
        Sampling distribution over the n data points (sums to 1).
    """
    x_list = [np.ones(x.shape[0])]
    for i in range(1, order + 1):
        x_list.append(x.ravel()**i)
    x_extra = np.vstack(x_list).T
    # Augment with sqrt(lambda) * I_d so the scores are ridge leverage scores.
    A = np.vstack([x_extra, np.eye(x_extra.shape[1]) * np.sqrt(lmd)])
    U, S, V = np.linalg.svd(A, full_matrices=False)
    U1 = U[:x.shape[0], :]  # rows of U that correspond to the data points
    proba = np.square(U1).sum(axis=1) / np.square(U1).sum(axis=1).sum()
    return proba
def get_coreset(x, y, n, lmd, order, seed):
    """Sample n coreset indices from the ridge-leverage-score distribution.

    Parameters
    ----------
    x : ndarray of shape (m, 1) or (m,)
        Input features.
    y : array-like
        Targets (unused; kept so the signature mirrors get_uniform).
    n : int
        Number of indices to draw.
    lmd : float
        Ridge regularization strength passed to get_proba.
    order : int
        Polynomial order passed to get_proba.
    seed : int
        Seed for numpy's global RNG (for reproducibility).

    Returns
    -------
    idx : ndarray of shape (n,)
        Sampled indices into x.
    """
    np.random.seed(seed)
    proba = get_proba(x, lmd, order)
    idx = np.random.choice(x.shape[0], size=n, p=proba)
    return idx
def get_uniform(x, y, n, seed):
    """Sample n indices uniformly at random (baseline for coreset sampling).

    Parameters
    ----------
    x : ndarray of shape (m, 1) or (m,)
        Input features (only the length is used).
    y : array-like
        Targets (unused; kept so the signature mirrors get_coreset).
    n : int
        Number of indices to draw.
    seed : int
        Seed for numpy's global RNG (for reproducibility).

    Returns
    -------
    idx : ndarray of shape (n,)
        Sampled indices into x.
    """
    np.random.seed(seed)
    proba = np.ones(x.shape[0]) / x.shape[0]
    idx = np.random.choice(x.shape[0], size=n, p=proba)
    return idx
# Regularization derived from the noise level and kernel variance.
lmd = (sigma_n / (kernel.variance)**0.5)**2
n_sample = len(y_poly) // 10  # sample 10% of the data
order = 5

core_idx = get_coreset(X, y_poly, n_sample, lmd, order, seed=0)
X_core, y_core = X[core_idx], y_poly[core_idx]

uni_idx = get_uniform(X, y_poly, n_sample, seed=0)
X_uni, y_uni = X[uni_idx], y_poly[uni_idx]
Checking the fit using the same kernel (without optimizing the hyperparameters, because we have kept them constant for this analysis).
Fitting on the uniform samples.
uniGP = GPy.models.GPRegression(X_uni, y_uni, kernel)
uniGP.likelihood.variance = sigma_n**2  # fix noise; no hyperparameter optimization

fig, ax = plt.subplots()

ax.scatter(X, y_poly, c='r', label='full data', alpha=0.5)
uniGP.plot(ax=ax)
ax.set_ylim(-10, 30)
ax.set_title('Fit on uniform samples')
plot_essentials()
Fitting on the coreset samples.
coreGP = GPy.models.GPRegression(X_core, y_core, kernel)
coreGP.likelihood.variance = sigma_n**2  # fix noise; no hyperparameter optimization

fig, ax = plt.subplots()

ax.scatter(X, y_poly, c='r', label='full data', alpha=0.5)
coreGP.plot(ax=ax)
ax.set_ylim(-10, 30)
ax.set_title('Fit on coreset samples')
plot_essentials()
5.3 Checking uncertainty in the fit
np.random.seed(0)
# X_new = X.copy()
# Dense evaluation grid covering (and extending past) the data range.
X_new = np.sort(np.random.normal(2, 2, N * 3)).reshape(-1, 1)

s = 20

# Full data
train_X, train_y = X.copy(), y_poly.copy()
tmpGP = GPy.models.GPRegression(train_X, train_y, kernel)
tmpGP.likelihood.variance = sigma_n**2

_, var = tmpGP.predict(X_new)
std2 = np.sqrt(var) * 2  # two predictive standard deviations

_ = plt.plot(X_new, std2, color='k', label='full data')

# Uniform samples
train_X, train_y = X[uni_idx], y_poly[uni_idx]
tmpGP = GPy.models.GPRegression(train_X, train_y, kernel)
tmpGP.likelihood.variance = sigma_n**2

_, var = tmpGP.predict(X_new)
std2 = np.sqrt(var) * 2

_ = plt.plot(X_new, std2, color='r', label='uniform')

# Coreset samples
train_X, train_y = X[core_idx], y_poly[core_idx]
tmpGP = GPy.models.GPRegression(train_X, train_y, kernel)
tmpGP.likelihood.variance = sigma_n**2

_, var = tmpGP.predict(X_new)
std2 = np.sqrt(var) * 2
_ = plt.plot(X_new, std2, color='g', label='coreset')

plt.ylim(0, 7)
plt.xlabel("X"); plt.ylabel("Uncertainty")
plot_essentials()
We can see that the uncertainty of the model fitted on the coreset closely matches that of the model fitted on the full dataset. Uncertainty is high for uniform sampling in the end regions because we have low density of input \(X\) there, and thus points in those regions are less likely to be included in the uniform sample.
Comparing uncertainty with best coreset and uncertainty sampling (active learning)
Sampling from an active learning technique called uncertainty sampling.
all_idx = list(range(N))
train_idx = [0]  # start active learning from a single labeled point
pool_idx = all_idx.copy()
# Remove already-selected points from the candidate pool.
for i in train_idx:
    pool_idx.remove(i)

sidx = np.argsort(X.ravel())
def update():
    """One uncertainty-sampling step.

    Fits a GP on the points in the module-level ``train_idx``, finds the
    pool point with the highest predictive variance, and moves it from
    ``pool_idx`` into ``train_idx`` (both mutated in place).
    """
    # Computation
    train_X, train_y = X[train_idx], y_poly[train_idx]

    tmpGP = GPy.models.GPRegression(train_X, train_y, kernel)
    tmpGP.likelihood.variance = sigma_n**2

    _, var = tmpGP.predict(X[pool_idx])
    meanall, varall = tmpGP.predict(X)
    std2all = np.sqrt(varall) * 2

    # Most uncertain remaining pool point.
    next_idx = pool_idx[np.argmax(var)]

    # Updating
    train_idx.append(next_idx)
    pool_idx.remove(next_idx)
# Run 15 rounds of uncertainty sampling.
for i in range(15):
    update()
np.random.seed(0)
# Dense evaluation grid covering (and extending past) the data range.
X_new = np.sort(np.random.normal(2, 2, N * 3)).reshape(-1, 1)

s = 20

# Full data
train_X, train_y = X.copy(), y_poly.copy()
tmpGP = GPy.models.GPRegression(train_X, train_y, kernel)
tmpGP.likelihood.variance = sigma_n**2

_, var = tmpGP.predict(X_new)
std2 = np.sqrt(var) * 2  # two predictive standard deviations

_ = plt.plot(X_new, std2, color='k', label='full data')

# Uniform samples
train_X, train_y = X[uni_idx], y_poly[uni_idx]
tmpGP = GPy.models.GPRegression(train_X, train_y, kernel)
tmpGP.likelihood.variance = sigma_n**2

_, var = tmpGP.predict(X_new)
std2 = np.sqrt(var) * 2

_ = plt.plot(X_new, std2, color='r', label='uniform')

# Coreset samples
train_X, train_y = X[core_idx], y_poly[core_idx]
tmpGP = GPy.models.GPRegression(train_X, train_y, kernel)
tmpGP.likelihood.variance = sigma_n**2

_, var = tmpGP.predict(X_new)
std2 = np.sqrt(var) * 2
_ = plt.plot(X_new, std2, color='g', label='coreset')

# Best core: the n_sample points with the HIGHEST sampling probability
# (deterministic top-k, not a proper random sample).
best_core = np.argsort(get_proba(X, lmd, order=5))[::-1][:n_sample]

train_X, train_y = X[best_core], y_poly[best_core]
tmpGP = GPy.models.GPRegression(train_X, train_y, kernel)
tmpGP.likelihood.variance = sigma_n**2

_, var = tmpGP.predict(X_new)
std2 = np.sqrt(var) * 2
_ = plt.plot(X_new, std2, color='b', label='best coreset (???)')

# Active learning samples
train_X, train_y = X[train_idx], y_poly[train_idx]
tmpGP = GPy.models.GPRegression(train_X, train_y, kernel)
tmpGP.likelihood.variance = sigma_n**2

_, var = tmpGP.predict(X_new)
std2 = np.sqrt(var) * 2
_ = plt.plot(X_new, std2, color='y', label='Uncert. Sampling')

plt.ylim(0, 5)
plt.xlabel("X"); plt.ylabel("Uncertainty")
plot_essentials()
Interpretation of the above plot is as the following,
- Models trained on full dataset, a coreset and dataset generated by uncertainty sampling have more or less similar uncertainty.
- The model fitted on the \(n'\) points that have the highest probability under the coreset sampling distribution has higher uncertainty in the central region. This is because we have a higher density of data points in the central region, while all points with high probability lie at both ends. This suggests that it is not a good strategy to simply choose the most probable points as a coreset: we must actually sample points according to the sampling distribution.
- We see that uncertainty sampling results in best uncertainty here that is closest to oracle (full data fit). This raises following questions for the future research.
5.4 Questions
What can we say about uncertainty in the fit while fitting the model on the coreset points?
We saw that uncertainty sampling can sample the points that help in reducing uncertainty the most so can we say they can produce better coresets?