# Common imports
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from matplotlib import rc
plt.style.use('fivethirtyeight')
rc('animation', html='jshtml')
# Copy the models
from copy import deepcopy
# Sklearn imports
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
# Entropy function
from scipy.stats import entropy
# Progress helper
from IPython.display import clear_output
QBC by posterior sampling
Interesting fact: for probabilistic models, QBC is closely related to uncertainty sampling. How?
Draw \(k\) parameter sets from the posterior distribution, representing \(k\) different models (the committee).
Query the point which shows maximum disagreement among these \(k\) models.
An example: Bayesian linear regression
np.random.seed(0)
N = 10
X = np.linspace(-1, 1, N).reshape(-1, 1)

t0 = 3
t1 = 2

y = X * t1 + t0 + np.random.rand(N, 1)
plt.scatter(X, y);
Assume a posterior
Rather than running full Bayesian inference, we hand-craft a posterior centred near the true parameters: \(t_0 \sim \mathcal{N}(3, 0.1^2)\) and \(t_1 \sim \mathcal{N}(2, 1^2)\). Each draw below is one committee member.
n_samples = 50

t0_dist_samples = np.random.normal(t0, 0.1, size=n_samples)
t1_dist_samples = np.random.normal(t1, 1, size=n_samples)
Plot the models
plt.scatter(X, y)
for i in range(len(t0_dist_samples)):
    sample_t0 = t0_dist_samples[i]
    sample_t1 = t1_dist_samples[i]
    plt.plot(X, X * sample_t1 + sample_t0, alpha=0.1)
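The committee visibly fans out away from the data, but the plot alone does not pick a query. A minimal sketch of the query step (using the standard deviation of the committee's predictions as the disagreement measure is our assumption; the notebook does not fix one for regression):

# One prediction curve per committee member: shape (n_samples, N, 1)
preds = np.stack([X * s1 + s0 for s0, s1 in zip(t0_dist_samples, t1_dist_samples)])
disagreement = preds.std(axis=0).ravel()  # per-point spread across the committee
print("query x =", X[disagreement.argmax()])  # most-disputed point is queried next

Since the spread of \(t_0 + t_1 x\) grows with \(|x|\) under this posterior, one of the endpoints gets queried.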
QBC by bootstrapping
A random forest gives us a committee for free: each tree is trained on a bootstrap sample of the data, so the trees act as committee members and their votes measure disagreement.
2 class dataset
X, y = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, random_state=3, shuffle=True)

plt.figure()
plt.scatter(X[:,0], X[:,1], c=y);
Full data fit with RF
model = RandomForestClassifier(random_state=0)
model.fit(X, y);
RandomForestClassifier(random_state=0)
Visualize decision boundary
grid_X1, grid_X2 = np.meshgrid(np.linspace(X[:,0].min()-0.1, X[:,0].max()+0.1, 100),
                               np.linspace(X[:,1].min()-0.1, X[:,1].max()+0.1, 100))

grid_X = [(x1, x2) for x1, x2 in zip(grid_X1.ravel(), grid_X2.ravel())]

grid_pred = model.predict(grid_X)

plt.figure(figsize=(6,5))
plt.scatter(X[:,0], X[:,1], c=y);
plt.contourf(grid_X1, grid_X2, grid_pred.reshape(*grid_X1.shape), alpha=0.2);
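Aside: the list comprehension works, but the same grid can be built directly as an array (an equivalent sketch):

grid_X_arr = np.c_[grid_X1.ravel(), grid_X2.ravel()]  # shape (10000, 2), same points as grid_X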
Train, pool, test split
X_train_pool, X_test, y_train_pool, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
X_train, X_pool, y_train, y_pool = train_test_split(X_train_pool, y_train_pool, train_size=20, random_state=0)

X_list = [X_train, X_pool, X_test]
y_list = [y_train, y_pool, y_test]
t_list = ['Train', 'Pool', 'Test']

fig, ax = plt.subplots(1, 3, figsize=(15,4), sharex=True, sharey=True)
for i in range(3):
    ax[i].scatter(X_list[i][:,0], X_list[i][:,1], c=y_list[i])
    ax[i].set_title(t_list[i])
Fitting a model on initial train data
AL_model = RandomForestClassifier(n_jobs=28, random_state=0)
AL_model.fit(X_train, y_train);
RandomForestClassifier(n_jobs=28, random_state=0)
Get the votes from the trees on the pool dataset. Each tree is one committee member: votes[i, j] holds tree j's hard 0/1 prediction for pool point i.
votes = np.zeros(shape=(X_pool.shape[0], len(AL_model.estimators_)))

for learner_idx, learner in enumerate(AL_model.estimators_):
    votes[:, learner_idx] = learner.predict(X_pool)
votes.shape
(780, 100)
votes
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[1., 1., 1., ..., 0., 1., 1.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
Convert to probabilities
# One column per class (the original used X_pool.shape[1], the number of
# features, which equals the number of classes here only by coincidence)
p_vote = np.zeros(shape=(X_pool.shape[0], len(np.unique(y))))

for vote_idx, vote in enumerate(votes):
    vote_counter = {0: (1-vote).sum(), 1: vote.sum()}
    for class_idx, class_label in enumerate(np.unique(y)):
        p_vote[vote_idx, class_idx] = vote_counter[class_label] / len(AL_model.estimators_)
p_vote
array([[1. , 0. ],
[0.89, 0.11],
[0.06, 0.94],
...,
[0.93, 0.07],
[1. , 0. ],
[1. , 0. ]])
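Since the votes are 0/1, the loop above collapses into a vectorized equivalent: the mean of each row is the fraction of trees voting class 1 (a sketch of the same computation):

p1 = votes.mean(axis=1)                      # fraction of trees voting class 1
p_vote_fast = np.column_stack([1 - p1, p1])  # columns: P(class 0), P(class 1)
assert np.allclose(p_vote, p_vote_fast)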
Calculate dissimilarity (entropy)
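For pool point \(x\) with committee size \(T\) (here \(T = 100\) trees) and \(V_c(x)\) trees voting for class \(c\), the vote entropy is \(H(x) = -\sum_{c} \frac{V_c(x)}{T} \log \frac{V_c(x)}{T}\). Below we compute it by hand for one example and confirm against scipy's entropy.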
example_id = 2

ans = 0
for category in range(p_vote.shape[1]):  # iterate over classes, not features
    # Note: this hand-rolled version returns nan when a probability is
    # exactly 0; scipy's entropy treats 0*log(0) as 0.
    ans += -p_vote[example_id][category] * np.log(p_vote[example_id][category])
ans
0.22696752250060448
entr = entropy(p_vote, axis=1)
entr[example_id]
0.22696752250060448
Active Learning Flow
def get_query_idx():
    # Gather the votes
    votes = np.zeros(shape=(X_pool.shape[0], len(AL_model.estimators_)))
    for learner_idx, learner in enumerate(AL_model.estimators_):
        votes[:, learner_idx] = learner.predict(X_pool)
    # Calculate probability of votes (one column per class)
    p_vote = np.zeros(shape=(X_pool.shape[0], len(np.unique(y))))
    for vote_idx, vote in enumerate(votes):
        vote_counter = {0: (1-vote).sum(), 1: vote.sum()}
        for class_idx, class_label in enumerate(np.unique(y)):
            p_vote[vote_idx, class_idx] = vote_counter[class_label] / len(AL_model.estimators_)
    # Calculate entropy for each example
    entr = entropy(p_vote, axis=1)
    # Choose the example with highest entropy (disagreement)
    return entr.argmax()
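A common variant (a sketch, not part of the flow used below): instead of hard votes, average each tree's class probabilities and score the pool by the entropy of that consensus. predict_proba is standard sklearn tree API; everything else mirrors get_query_idx:

def get_query_idx_soft():
    # Stack each committee member's soft votes: (n_trees, n_pool, n_classes)
    probas = np.stack([tree.predict_proba(X_pool) for tree in AL_model.estimators_])
    consensus = probas.mean(axis=0)             # average the committee's probabilities
    return entropy(consensus, axis=1).argmax()  # most-disputed pool point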
Prepare data for random sampling
X_train_rand = X_train.copy()
y_train_rand = y_train.copy()
X_pool_rand = X_pool.copy()
y_pool_rand = y_pool.copy()

random_model = RandomForestClassifier(n_jobs=28, random_state=0)
Run active learning
AL_iters = 100
np.random.seed(0)

AL_inds = []
AL_models = []
random_inds = []
random_models = []

for iteration in range(AL_iters):
    clear_output(wait=True)
    print("iteration", iteration)

    ######## Active Learning ############
    # Fit the model
    AL_model.fit(X_train, y_train)
    AL_models.append(deepcopy(AL_model))
    # Query a point
    query_idx = get_query_idx()
    AL_inds.append(query_idx)
    # Add it to the train data
    X_train = np.concatenate([X_train, X_pool[query_idx:query_idx+1, :]], axis=0)
    y_train = np.concatenate([y_train, y_pool[query_idx:query_idx+1]], axis=0)
    # Remove it from the pool data
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx, axis=0)

    ######## Random Sampling ############
    # Fit the model
    random_model.fit(X_train_rand, y_train_rand)
    random_models.append(deepcopy(random_model))
    # Query a point (index into the random-sampling pool; the original
    # indexed len(X_pool), which belongs to the active-learning branch)
    query_idx = np.random.choice(len(X_pool_rand))
    random_inds.append(query_idx)
    # Add it to the train data
    X_train_rand = np.concatenate([X_train_rand, X_pool_rand[query_idx:query_idx+1, :]], axis=0)
    y_train_rand = np.concatenate([y_train_rand, y_pool_rand[query_idx:query_idx+1]], axis=0)
    # Remove it from the pool data
    X_pool_rand = np.delete(X_pool_rand, query_idx, axis=0)
    y_pool_rand = np.delete(y_pool_rand, query_idx, axis=0)
iteration 99
Plot accuracy
random_scores = []
AL_scores = []
for iteration in range(AL_iters):
    clear_output(wait=True)
    print("iteration", iteration)
    AL_scores.append(accuracy_score(y_test, AL_models[iteration].predict(X_test)))
    random_scores.append(accuracy_score(y_test, random_models[iteration].predict(X_test)))

plt.plot(AL_scores, label='Active Learning');
plt.plot(random_scores, label='Random Sampling');
plt.legend();
plt.xlabel('Iterations');
plt.ylabel('Accuracy\n(Higher is better)');
iteration 99
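Accuracy is only one lens. f1_score was imported at the top but never used; a parallel comparison on F1 takes one line per curve (a sketch, not part of the original run):

AL_f1 = [f1_score(y_test, m.predict(X_test)) for m in AL_models]
random_f1 = [f1_score(y_test, m.predict(X_test)) for m in random_models]
plt.figure()
plt.plot(AL_f1, label='Active Learning');
plt.plot(random_f1, label='Random Sampling');
plt.legend(); plt.xlabel('Iterations'); plt.ylabel('F1\n(Higher is better)');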
Plot decision boundary
def update(i):
    for each in ax:
        each.cla()
    AL_grid_preds = AL_models[i].predict(grid_X)
    random_grid_preds = random_models[i].predict(grid_X)

    # Active learning
    ax[0].scatter(X_train[:n_train, 0], X_train[:n_train, 1], c=y_train[:n_train], label='initial_train', alpha=0.2)
    ax[0].scatter(X_train[n_train:n_train+i, 0], X_train[n_train:n_train+i, 1],
                  c=y_train[n_train:n_train+i], label='new_points')
    ax[0].contourf(grid_X1, grid_X2, AL_grid_preds.reshape(*grid_X1.shape), alpha=0.2)
    ax[0].set_title('New points')
    ax[0].text(locs[0], locs[1], 'Active Learning')

    ax[1].scatter(X_test[:, 0], X_test[:, 1], c=y_test, label='test_set')
    ax[1].contourf(grid_X1, grid_X2, AL_grid_preds.reshape(*grid_X1.shape), alpha=0.2)
    ax[1].set_title('Test points')

    # Random sampling
    ax[2].scatter(X_train_rand[:n_train, 0], X_train_rand[:n_train, 1], c=y_train_rand[:n_train], label='initial_train', alpha=0.2)
    ax[2].scatter(X_train_rand[n_train:n_train+i, 0], X_train_rand[n_train:n_train+i, 1],
                  c=y_train_rand[n_train:n_train+i], label='new_points')
    ax[2].contourf(grid_X1, grid_X2, random_grid_preds.reshape(*grid_X1.shape), alpha=0.2)
    ax[2].set_title('New points')
    ax[2].text(locs[0], locs[1], 'Random Sampling')

    ax[3].scatter(X_test[:, 0], X_test[:, 1], c=y_test, label='test_set')
    ax[3].contourf(grid_X1, grid_X2, random_grid_preds.reshape(*grid_X1.shape), alpha=0.2)
    ax[3].set_title('Test points')

locs = (2.7, 4)
fig, ax = plt.subplots(2, 2, figsize=(12,6), sharex=True, sharey=True)
ax = ax.ravel()
n_train = X_train.shape[0] - AL_iters  # size of the initial train set

anim = FuncAnimation(fig, func=update, frames=range(100))
plt.close()
anim
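To keep the animation outside the notebook, it can be written to disk; a sketch assuming the Pillow writer is installed (the filename is arbitrary):

anim.save('qbc_vs_random.gif', writer='pillow', fps=10)  # hypothetical output path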