import GPy
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs, make_moons, make_circles
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessClassifier, kernels
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
Common functions
def get_kernel(ard):
    # ARD=True gives the RBF kernel a separate lengthscale per input dimension;
    # ARD=False uses a single shared lengthscale.
    return GPy.kern.RBF(2, ARD=ard)

def create_and_fit_model(model_class, X, y, ard, **kwargs):
    model = model_class(X, y, get_kernel(ard), **kwargs)
    model.optimize()
    return model
Generate Synthetic Data
X, y = make_blobs(n_samples=200, centers=9, random_state=0)
# X, y = make_moons(n_samples=200, noise=0.1, random_state=0)
y = y.reshape(-1, 1)
encoder = OneHotEncoder(sparse=False)
encoder.fit(y)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow', edgecolor='k');

grid1 = np.linspace(X.min(axis=0)[0]-1, X.max(axis=0)[0]+1, 50)
grid2 = np.linspace(X.min(axis=0)[1]-1, X.max(axis=0)[1]+1, 50)
Grid1, Grid2 = np.meshgrid(grid1, grid2)
X_grid = np.vstack([Grid1.ravel(), Grid2.ravel()]).T
Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
y_train_one_hot = encoder.transform(y_train)
y_test_one_hot = encoder.transform(y_test)
Treat it as a regression problem
Here we can regress over the class labels as if they are discrete realizations of a continuous variable. We will round the predictions to the nearest integer to get the class labels.
ard_model = create_and_fit_model(GPy.models.GPRegression, X_train, y_train, ard=True)
non_ard_model = create_and_fit_model(GPy.models.GPRegression, X_train, y_train, ard=False)

print("ARD")
preds = ard_model.predict(X_test)[0].round().astype(int)
print(classification_report(y_test, preds))

print("Non-ARD")
preds = non_ard_model.predict(X_test)[0].round().astype(int)
print(classification_report(y_test, preds))
ARD
precision recall f1-score support
0 0.67 0.46 0.55 13
1 0.33 0.57 0.42 7
2 0.57 0.62 0.59 13
3 0.71 0.45 0.56 11
4 0.71 0.91 0.80 11
5 0.67 0.80 0.73 10
6 0.33 0.50 0.40 6
7 0.85 0.65 0.73 17
8 1.00 0.83 0.91 12
accuracy 0.65 100
macro avg 0.65 0.64 0.63 100
weighted avg 0.69 0.65 0.66 100
Non-ARD
precision recall f1-score support
0 0.60 0.46 0.52 13
1 0.27 0.43 0.33 7
2 0.57 0.62 0.59 13
3 0.83 0.45 0.59 11
4 0.73 1.00 0.85 11
5 0.62 0.80 0.70 10
6 0.38 0.50 0.43 6
7 0.92 0.65 0.76 17
8 1.00 0.92 0.96 12
accuracy 0.66 100
macro avg 0.66 0.65 0.64 100
weighted avg 0.70 0.66 0.67 100
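One caveat with the round-to-nearest decoding above: the GP mean is unconstrained, so rounding can occasionally produce a label outside the valid range (e.g. 9 for this 9-class problem with labels 0-8). A minimal sketch of a clipped decoding step (the helper name is illustrative, not part of the notebook):

def decode_rounded_labels(y_mean, n_classes):
    # Round the continuous GP mean to the nearest integer and clip to the
    # valid label range [0, n_classes - 1] so out-of-range labels cannot occur.
    return np.clip(np.rint(y_mean).astype(int), 0, n_classes - 1)

# e.g. preds = decode_rounded_labels(ard_model.predict(X_test)[0], n_classes=9)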
We can visualize the raw (unrounded) predictions below:
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax = ax.ravel()
i = 0
mappables = []
for model in [ard_model, non_ard_model]:
    y_grid = model.predict(X_grid)[0]
    mappable = ax[i].contourf(Grid1, Grid2, y_grid.reshape(Grid1.shape), cmap='rainbow', alpha=0.5)
    mappables.append(mappable)
    ax[i].scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow', edgecolor='k')
    ax[i].set_title(f"ARD={model.kern.ARD}")
    i += 1

# put a common colorbar for both mappables
fig.colorbar(mappables[0], ax=ax, cax=fig.add_axes([0.92, 0.1, 0.02, 0.8]));
Now, let us see the classification boundary:
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax = ax.ravel()
i = 0
for model in [ard_model, non_ard_model]:
    y_grid = model.predict(X_grid)[0].round().astype(int)
    ax[i].contourf(Grid1, Grid2, y_grid.reshape(Grid1.shape), cmap='rainbow', alpha=0.5)
    ax[i].scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow', edgecolor='k')
    ax[i].set_title(f"ARD={model.kern.ARD}")
    i += 1
plt.tight_layout()
Treat it as a multi-output regression problem
In this method, we learn a single GP regression model with one output per class (a one-hot, one-vs-rest encoding), sharing the kernel hyperparameters across the outputs. Unlike the previous approach, this does not impose an artificial ordering on the class labels. We get the class predictions by taking the argmax over the per-class outputs.
ard_model = create_and_fit_model(GPy.models.GPRegression, X_train, y_train_one_hot, ard=True)
non_ard_model = create_and_fit_model(GPy.models.GPRegression, X_train, y_train_one_hot, ard=False)

print("ARD")
preds = ard_model.predict(X_test)[0].argmax(axis=1)
print(classification_report(y_test, preds))

print("Non-ARD")
preds = non_ard_model.predict(X_test)[0].argmax(axis=1)
print(classification_report(y_test, preds))
ARD
precision recall f1-score support
0 0.80 0.62 0.70 13
1 0.67 0.86 0.75 7
2 0.85 0.85 0.85 13
3 0.92 1.00 0.96 11
4 1.00 1.00 1.00 11
5 0.90 0.90 0.90 10
6 1.00 1.00 1.00 6
7 1.00 1.00 1.00 17
8 1.00 1.00 1.00 12
accuracy 0.91 100
macro avg 0.90 0.91 0.91 100
weighted avg 0.91 0.91 0.91 100
Non-ARD
precision recall f1-score support
0 0.80 0.62 0.70 13
1 0.67 0.86 0.75 7
2 0.85 0.85 0.85 13
3 0.92 1.00 0.96 11
4 1.00 1.00 1.00 11
5 0.90 0.90 0.90 10
6 1.00 1.00 1.00 6
7 1.00 1.00 1.00 17
8 1.00 1.00 1.00 12
accuracy 0.91 100
macro avg 0.90 0.91 0.91 100
weighted avg 0.91 0.91 0.91 100
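A small note on the decoding step: the argmax over the one-hot columns returns column indices, which match the original labels here only because OneHotEncoder orders its columns by sorted category value and the labels are already 0-8. A sketch of the general decoding, using the encoder fitted above:

# Map argmax column indices back to the original label values via the encoder.
column_labels = encoder.categories_[0]                 # sorted unique labels, e.g. array([0, 1, ..., 8])
pred_columns = ard_model.predict(X_test)[0].argmax(axis=1)
pred_labels = column_labels[pred_columns]              # identical to pred_columns when labels are 0..K-1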
What happens if we ignore the test points whose highest per-class prediction is below 0.5?
print("ARD")
= ard_model.predict(X_test)[0]
pred_probas = pred_probas.max(axis=1)
pred_proba
= pred_proba > 0.5
mask = ard_model.predict(X_test)[0].argmax(axis=1)
preds = preds[mask]
preds_masked = y_test[mask]
ground_truth_masked
print(ground_truth_masked.shape, preds_masked.shape)
print(classification_report(ground_truth_masked, preds_masked))
ARD
(98, 1) (98,)
precision recall f1-score support
0 0.78 0.64 0.70 11
1 0.67 0.86 0.75 7
2 0.92 0.85 0.88 13
3 0.92 1.00 0.96 11
4 1.00 1.00 1.00 11
5 0.90 0.90 0.90 10
6 1.00 1.00 1.00 6
7 1.00 1.00 1.00 17
8 1.00 1.00 1.00 12
accuracy 0.92 98
macro avg 0.91 0.92 0.91 98
weighted avg 0.92 0.92 0.92 98
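The 0.5 cut-off above is somewhat arbitrary, since the raw regression outputs are not calibrated probabilities. Because the GP also returns a predictive variance, an alternative is to flag uncertain points from the variance instead. A minimal sketch (not used in the plots below; the 10% quantile threshold is an arbitrary choice for illustration):

mean, var = ard_model.predict(X_test)
# The variance may come back as one column (shared across outputs) or one column
# per class depending on the GPy version; average across columns to get one score per point.
var_per_point = var.reshape(len(X_test), -1).mean(axis=1)
uncertain_mask = var_per_point > np.quantile(var_per_point, 0.9)  # flag the 10% most uncertain points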
Let’s visualize the uncertain points:
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=50, cmap='rainbow', edgecolor='k')
plt.scatter(X_test[~mask, 0], X_test[~mask, 1], s=150, c=y_test[~mask], cmap='rainbow', edgecolor='k', label='uncertain points')
plt.legend();
We see that some points close to the decision boundary are uncertain. We can now plot the decision boundary:
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax = ax.ravel()
i = 0
for model in [ard_model, non_ard_model]:
    y_grid = model.predict(X_grid)[0].argmax(axis=1) + 0.5  # for some reason, unless we add 0.5, contourf does not show all the colors
    ax[i].contourf(Grid1, Grid2, y_grid.reshape(Grid1.shape), cmap='rainbow', alpha=0.5)
    ax[i].scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow', edgecolor='k')
    ax[i].set_title(f"ARD={model.kern.ARD}")
    i += 1

# known = set()
# for i in range(len(y_grid)):
#     pred = y_grid[i]
#     if pred in known:
#         x = np.random.uniform()
#         if x < 0.2:
#             known.remove(pred)
#     else:
#         known.add(pred)
#         ax[0].text(X_grid[i, 0], X_grid[i, 1], str(pred), fontsize=10, color='k', ha='center', va='center')
plt.tight_layout()

pd.Series(y_grid).value_counts()
7 478
5 396
8 378
4 324
6 297
1 265
2 201
3 98
0 63
dtype: int64
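A note on the + 0.5 workaround used in the plotting code above: contourf chooses a small number of level boundaries automatically, which can merge neighbouring integer labels into one colour band. An alternative sketch is to pass explicit level boundaries centred between the integer labels and drop the offset:

# Sketch: explicit contour levels at -0.5, 0.5, ..., 8.5 give each integer label
# its own colour band, without shifting the predictions by 0.5.
n_classes = len(np.unique(y))
levels = np.arange(n_classes + 1) - 0.5
labels_grid = ard_model.predict(X_grid)[0].argmax(axis=1)
plt.contourf(Grid1, Grid2, labels_grid.reshape(Grid1.shape), levels=levels, cmap='rainbow', alpha=0.5);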
Treat it as a One v/s Rest classification problem
Here we use scikit-learn's GaussianProcessClassifier, which fits a separate binary classifier for each class in a one-vs-rest setting and predicts by taking the argmax of the per-class probabilities.
ard_kernel = kernels.ConstantKernel() * kernels.RBF(length_scale=[0.01, 0.01])
# non_ard_kernel = kernels.ConstantKernel() * kernels.RBF(length_scale=0.1)
ard_model = GaussianProcessClassifier(kernel=ard_kernel, random_state=0)
# non_ard_model = GaussianProcessClassifier(kernel=non_ard_kernel, random_state=0)
ard_model.fit(X_train, y_train.ravel())
# non_ard_model.fit(X_train, y_train.ravel())

print("ARD")
preds = ard_model.predict(X_test)
print(classification_report(y_test, preds))
# print("Non-ARD")
# preds = non_ard_model.predict(X_test)
# print(classification_report(y_test, preds))
ARD
precision recall f1-score support
0 0.00 0.00 0.00 13
1 0.00 0.00 0.00 7
2 1.00 0.08 0.14 13
3 0.00 0.00 0.00 11
4 0.00 0.00 0.00 11
5 0.00 0.00 0.00 10
6 0.00 0.00 0.00 6
7 0.00 0.00 0.00 17
8 0.12 1.00 0.22 12
accuracy 0.13 100
macro avg 0.12 0.12 0.04 100
weighted avg 0.14 0.13 0.04 100
/home/patel_zeel/miniconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
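The scores above are poor for the ARD kernel initialized with a length scale of 0.01. One plausible explanation (an assumption, not verified here) is that the marginal-likelihood optimizer gets stuck in a bad local optimum from that starting point; a sketch that starts from a larger length scale and uses optimizer restarts:

# Sketch: larger initial length scale plus random restarts of the hyperparameter optimizer.
ard_kernel_v2 = kernels.ConstantKernel() * kernels.RBF(length_scale=[1.0, 1.0])
ard_model_v2 = GaussianProcessClassifier(kernel=ard_kernel_v2, n_restarts_optimizer=5, random_state=0)
ard_model_v2.fit(X_train, y_train.ravel())
print(classification_report(y_test, ard_model_v2.predict(X_test)))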
Visualizing the decision boundary:
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax = ax.ravel()
i = 0
for model in [ard_model]:
    y_grid = model.predict_proba(X_grid).argmax(axis=1)
    ax[i].contourf(Grid1, Grid2, y_grid.reshape(Grid1.shape), cmap='rainbow', alpha=0.5)
    ax[i].scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow', edgecolor='k')
    ax[i].set_title("ARD=True")  # only the ARD model is fitted above
    i += 1
plt.tight_layout()
Try a random forest model
model = RandomForestClassifier(n_estimators=1000, random_state=0)
model.fit(X_train, y_train.ravel())

print("Random Forest")
preds = model.predict(X_test)
print(classification_report(y_test, preds))
Random Forest
precision recall f1-score support
0 0.78 0.54 0.64 13
1 0.60 0.86 0.71 7
2 0.92 0.85 0.88 13
3 0.92 1.00 0.96 11
4 1.00 1.00 1.00 11
5 0.91 1.00 0.95 10
6 1.00 1.00 1.00 6
7 1.00 1.00 1.00 17
8 1.00 1.00 1.00 12
accuracy 0.91 100
macro avg 0.90 0.92 0.90 100
weighted avg 0.91 0.91 0.91 100
fig, ax = plt.subplots(figsize=(6, 4))
y_grid = model.predict(X_grid) + 0.5  # for some reason, unless we add 0.5, contourf does not show all the colors
ax.contourf(Grid1, Grid2, y_grid.reshape(Grid1.shape), cmap='rainbow', alpha=0.5)
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow', edgecolor='k');