Some notes on Bayesian optimization using the Matérn kernel, following the NIPS paper "Practical Bayesian Optimization of Machine Learning Algorithms" (Snoek, Larochelle and Adams, 2012).
# see http://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_prior_posterior.html
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
kernel = Matern(nu=2.5)
gp = GaussianProcessRegressor(kernel=kernel)
# suppose we are fitting this function..
X = np.linspace(0, 5, 100)
y = np.sin((X - 2.5) ** 2)
plt.plot(X, y)
# Generate data and fit GP
rng = np.random.RandomState(4)
# take 10 points...
X = rng.uniform(0, 5, 10)[:, np.newaxis]
y = np.sin((X[:, 0] - 2.5) ** 2)
gp.fit(X, y)
X_ = np.linspace(0, 5, 100)
y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True)
plt.plot(X_, np.sin((X_ - 2.5) ** 2))
plt.plot(X, y, 'ro')
plt.grid(True)
plt.show()
# how should we approach this? One curve?
y_samples = gp.sample_y(X_[:, np.newaxis], 1)
plt.plot(X_, y_samples, lw=1)
plt.grid(True)
plt.plot(X, y, 'ro')
# how should we approach this? 10 curves?
y_samples = gp.sample_y(X_[:, np.newaxis], 10)
plt.plot(X_, y_samples, lw=1)
plt.grid(True)
plt.plot(X, y, 'ro')
# how should we approach this? 100 curves?
y_samples = gp.sample_y(X_[:, np.newaxis], 100)
plt.plot(X_, y_samples, lw=1)
plt.grid(True)
plt.plot(X, y, 'ro')
# we can show what it looks like 1 standard deviation away
plt.plot(X_, y_mean, 'b', X, y, 'ro')
plt.grid(True)
plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
alpha=0.5, color='k')
# how should we approach this? 5 curves, with a wider (1.2 standard deviation) band?
y_samples = gp.sample_y(X_[:, np.newaxis], 5)
n_std = 1.20
plt.plot(X_, y_samples, lw=1)
plt.plot(X, y, 'ro')
plt.grid(True)
plt.fill_between(X_, y_mean - n_std*y_std, y_mean + n_std*y_std,
alpha=0.95, color='k')
From here there are several ways to pick the next point; the outer search loop they plug into is sketched after this list. Two common approaches are:
- Upper confidence bound (balancing exploration and exploitation)
- Expected improvement
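Both of them plug into the same outer loop: fit the GP on everything observed so far, score candidate points with the acquisition function, evaluate the objective at the most promising candidate, and repeat. Below is a minimal sketch of that loop under this notebook's setup (the toy sine objective and a simple grid search over candidates rather than a proper optimizer); `objective` and `suggest_next` are hypothetical helper names, not from the paper or scikit-learn.
# Sketch of the outer Bayesian-optimization loop (toy setup, grid search).
# `objective` and `suggest_next` are illustrative names only.
def objective(x):
    # the toy function we are pretending is expensive to evaluate
    return np.sin((x - 2.5) ** 2)

def suggest_next(X_obs, y_obs, acquisition, X_grid):
    # refit the surrogate on all observations, score each grid candidate
    # with the acquisition function and return the most promising one
    gp = GaussianProcessRegressor(kernel=Matern(nu=2.5))
    gp.fit(X_obs, y_obs)
    scores = np.array([acquisition(np.array([[x]]), gp) for x in X_grid])
    return X_grid[np.argmin(scores)]   # argmin for GP-LCB; argmax for EI
With `gplcb` defined below this would be used as `suggest_next(X, y, gplcb, X_)`; for EI you would switch `argmin` to `argmax` and pass the best observed value into the acquisition.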
n_std = 1.0
plt.plot(X_, np.sin((X_ - 2.5) ** 2), 'b')
plt.plot(X, y, 'ro')
plt.grid(True)
plt.plot(X_, y_mean - n_std*y_std)
plt.plot(X_, y_mean)
n_std = 1.0
plt.plot(X, y, 'ro')
plt.grid(True)
plt.plot(X_, y_mean - n_std*y_std)
plt.plot(X_, y_mean)
### GP Upper/Lower Confidence Band
This strategy balances exploration (searching where $\sigma$ is large) against exploitation (searching where $\mu$ is low), controlled by a parameter $\kappa$.
The next point $\mathbf{x}$ is then the one which minimizes:
$$\mu(\mathbf{x}; …) - \kappa\, \sigma(\mathbf{x}; …)$$
# GP - LCB
# kappa can be selected automatically or manually;
# a very large value (say 10000) weights sigma heavily and so
# explores a very different part of the search space than kappa=1.0
def gplcb(x, gp=gp, kappa=1.0):
    # lower confidence bound of a single point: mu - kappa * sigma
    x = np.array(x).reshape(1, -1)
    mu, sigma = gp.predict(x, return_std=True)
    return mu[0] - kappa * sigma[0]
plt.grid(True)
plt.plot(X_, np.vectorize(gplcb)(X_, kappa=1.0))
plt.title("GP-LCB (minimize regret)")
### Expected Improvement
Expected improvement makes use of the Gaussian posterior to calculate the expected improvement over the best current point.
Let:
$$\gamma(\mathbf{x}) = \frac{f(\mathbf{x}_\text{best}) - \mu(\mathbf{x}; …)}{\sigma(\mathbf{x};…)}$$
The next point is the one which maximizes the expected improvement (writing $f^* := f(\mathbf{x}_\text{best})$, $\mu := \mu(\mathbf{x}; …)$, $\sigma := \sigma(\mathbf{x}; …)$, $\gamma := \gamma(\mathbf{x})$ to lighten the notation):
$$(f^* - \mu)\,\Phi(\gamma) + \sigma\,\phi(\gamma)$$
where $\Phi$ and $\phi$ are the standard normal CDF and PDF.
def ei(x, gp=gp, y_start={'y': y}):
    # expected improvement of a single point over the best observed y
    # (y_start is wrapped in a dict so np.vectorize treats it as a scalar)
    from scipy.stats import norm
    x = np.array(x).reshape(1, -1)
    mu, sigma = gp.predict(x, return_std=True)
    mu, sigma = mu[0], max(sigma[0], 1e-9)   # guard against zero predictive std
    loss_optimum = np.min(y_start['y'])
    z = (loss_optimum - mu) / sigma
    return (loss_optimum - mu) * norm.cdf(z) + sigma * norm.pdf(z)
plt.plot(X_, np.vectorize(ei)(X_))
plt.grid(True)
plt.title("Expected Improvement (maximize)")
### now complete a few iterations of GP-LCB and EI
def next_gplcb(X, gp=gp, kappa=1.0):
    # evaluate the LCB on a grid of candidates and return the minimizer
    gplcb_points = np.vectorize(gplcb)(X, gp=gp, kappa=kappa)
    return X[np.argmin(gplcb_points)]
X_next = X.copy()
gp = GaussianProcessRegressor(kernel=kernel)
gp.fit(X_next, y)
X_ = np.linspace(0, 5, 100)
y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True)
x_next = next_gplcb(X_, gp)
X_next = np.vstack([X_next, np.array([x_next])])
y_next = np.sin((X_next - 2.5) ** 2).reshape(-1,1)
# we can show what it looks like 1 standard deviation away
plt.plot(X_, y_mean, 'b', X.flatten(), y, 'ro')
plt.grid(True)
plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
alpha=0.5, color='k')
# refit the GP with the newly selected point and re-plot the 1-std band
gp.fit(X_next, y_next.flatten())
X_ = np.linspace(0, 5, 100)
y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True)
plt.plot(X_, y_mean, 'b', X_next, y_next, 'ro')
plt.grid(True)
plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
alpha=0.5, color='k')
plt.grid(True)
plt.plot(X_, np.vectorize(gplcb)(X_, gp=gp, kappa=1.0))
plt.title("GP-LCB (minimize regret)")
### for expected improvement, repeat the same iteration
def next_ei(X, gp=gp, y_start=y):
    # evaluate EI on a grid of candidates and return the maximizer
    ei_points = np.vectorize(ei)(X, gp=gp, y_start={'y': y_start})
    return X[np.argmax(ei_points)]
X_next = X.copy()
gp = GaussianProcessRegressor(kernel=kernel)
gp.fit(X_next, y)
X_ = np.linspace(0, 5, 100)
y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True)
x_next = next_ei(X_, gp, y)
X_next = np.vstack([X_next, np.array([x_next])])
y_next = np.sin((X_next - 2.5) ** 2).reshape(-1,1)
# we can show what it looks like 1 standard deviation away
plt.plot(X_, y_mean, 'b', X.flatten(), y, 'ro')
plt.grid(True)
plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
alpha=0.5, color='k')
# refit the GP with the newly selected point and re-plot the 1-std band
gp.fit(X_next, y_next.flatten())
X_ = np.linspace(0, 5, 100)
y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True)
plt.plot(X_, y_mean, 'b', X_next, y_next, 'ro')
plt.grid(True)
plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
alpha=0.5, color='k')
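The same kind of loop works for EI; the only differences are that the acquisition is maximized and that the incumbent (best observed value) is recomputed from the observations each round. Again this is just an illustrative sketch on the toy problem, not part of the original notebook flow:
# Sketch: a few EI iterations (toy setup; incumbent = best observed y).
X_obs = X.copy()
y_obs = np.sin((X_obs[:, 0] - 2.5) ** 2)
for _ in range(5):
    gp_ei = GaussianProcessRegressor(kernel=Matern(nu=2.5))
    gp_ei.fit(X_obs, y_obs)
    x_new = next_ei(X_, gp_ei, y_obs)           # grid point maximizing EI
    X_obs = np.vstack([X_obs, [[x_new]]])
    y_obs = np.append(y_obs, np.sin((x_new - 2.5) ** 2))
print("best point so far:", X_obs[np.argmin(y_obs), 0], "f =", y_obs.min())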