import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import arviz as az
import pymc3 as pm
import matplotlib.gridspec as gridspec
import theano.tensor as tt
%matplotlib inline
plt.rcParams['font.size']=15
def plt_legend_out(frameon=True):
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0, frameon=frameon)
Kernelized linear regression with a Gaussian kernel: as in polynomial regression, the model is linear in a coefficient vector $\gamma$, but the basis functions are kernels $K(x, x')$, where $x'$ is a knot (centroid).
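Concretely, the mean is a linear combination of Gaussian basis functions centered at the knots (a sketch of the model built below; the width $w=2$ matches the `gauss_kernel` function):
\begin{eqnarray}
\mu &=& \sum_{j} \gamma_j K(x, x'_j)\\
K(x, x') &=& \exp\left(-\frac{(x-x')^2}{w}\right)
\end{eqnarray}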
np.random.seed(1)
x = np.random.uniform(0, 10, size=20)
y = np.random.normal(np.sin(x), 0.2)
plt.plot(x, y, 'o')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
def gauss_kernel(x, n_knots):
    """
    Simple Gaussian radial kernel
    """
    knots = np.linspace(x.min(), x.max(), n_knots)
    w = 2
    return np.array([np.exp(-(x - k)**2 / w) for k in knots])
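As a quick sanity check of the shape: the function returns one row per knot, evaluated at every data point, which is exactly what the dot product with $\gamma$ in the model below expects.
print(gauss_kernel(x, 5).shape)  # (5, 20): one Gaussian basis function per knot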
Shape of the exponential function used in the Gaussian kernel:
tmp = np.linspace(-5,1,100)
plt.plot(tmp,np.exp(tmp))
plt.xlabel('x')
plt.ylabel('$\exp{(x)}$')
plt.axvline(x=0,color='gray',lw=0.5)
plt.axhline(y=0,color='gray',lw=0.5)
plt.show()
n_knots = 5
gk = gauss_kernel(x,n_knots)
plt.scatter(range(n_knots),np.linspace(x.min(), x.max(), n_knots))
plt.xlabel('index')
plt.ylabel('knots')
plt.show()
plt.plot(x, y, 'o')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
plt.scatter(range(len(x)),x)
plt.xlabel('index')
plt.ylabel('x')
plt.axhline(y=np.min(x),color='gray',lw=0.5)
plt.axhline(y=np.max(x),color='gray',lw=0.5)
plt.show()
knots = np.linspace(x.min(), x.max(), n_knots)
for i in range(n_knots):
    # plt.scatter(range(len(x)), gk[i])
    plt.plot(gk[i], label='$x\'=$' + str(knots[i]))
plt.ylabel('$K(x,x\')$')
plt.xlabel('index')
plt_legend_out()
Draw $\color{red}{\gamma}$ from a Cauchy distribution and $\color{green}{\epsilon}$ from a uniform distribution.
with pm.Model() as kernel_model:
    gamma = pm.Cauchy('gamma', alpha=0, beta=1, shape=n_knots)
    sd = pm.Uniform('sd', 0, 10)
    mu = pm.math.dot(gamma, gauss_kernel(x, n_knots))
    yl = pm.Normal('yl', mu=mu, sd=sd, observed=y)
    kernel_trace = pm.sample(10000, step=pm.Metropolis())
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>Metropolis: [sd]
>Metropolis: [gamma]
Sampling 4 chains, 0 divergences: 100%|██████████| 42000/42000 [00:07<00:00, 5320.97draws/s]
The number of effective samples is smaller than 10% for some parameters.
chain = kernel_trace[5000:]
pm.traceplot(chain);
plt.show()
#ppc = pm.sample_ppc(chain, model=kernel_model, samples=100)
ppc = pm.sample_posterior_predictive(chain, model=kernel_model, samples=100)
plt.plot(x, ppc['yl'].T, 'ro', alpha=0.1)
plt.plot(x, y, 'bo')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
/opt/conda/lib/python3.7/site-packages/pymc3/sampling.py:1247: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
"samples parameter is smaller than nchains times ndraws, some draws "
100%|██████████| 100/100 [00:00<00:00, 574.95it/s]
new_x = np.linspace(x.min(), x.max(), 100)
k = gauss_kernel(new_x, n_knots)
gamma_pred = chain['gamma']
for i in range(100):
    idx = np.random.randint(0, len(gamma_pred))
    y_pred = np.dot(gamma_pred[idx], k)
    plt.plot(new_x, y_pred, 'r-', alpha=0.1)
plt.plot(x, y, 'bo')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
squared_distance = lambda x, y: np.array([[(x[i] - y[j])**2 for i in range(len(x))] for j in range(len(y))])
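Note the argument order: the comprehension loops over `y` in the outer list, so the result has shape `(len(y), len(x))`; this is what makes the cross-covariance `K_o` later directly usable in `np.dot(K_o, ...)`. A quick check with small hypothetical arrays:
print(squared_distance(np.arange(3), np.arange(5)).shape)  # (5, 3): rows indexed by the second argument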
np.random.seed(1)
test_points = np.linspace(0, 10, 100)
cov = np.exp(-squared_distance(test_points, test_points))
plt.plot(test_points, stats.multivariate_normal.rvs(cov=cov, size=6).T)
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
Noise is modeled by adjusting the diagonal elements of the covariance matrix so that they are no longer exactly 1.
\begin{eqnarray}
K_{i,j}=\left\{ \begin{array}{ll}
\eta\exp{(-\rho D_{i,j})} & (i\neq j) \\
\eta+\sigma & (i=j)
\end{array} \right.
\end{eqnarray}
np.random.seed(1)
eta = 1
rho = 0.5
sigma = 0.03
D = squared_distance(test_points, test_points)
cov = eta * np.exp(-rho * D)
diag = eta + sigma  # diagonal elements K_{i,i} = eta + sigma, as in the formula above
np.fill_diagonal(cov, diag)
for i in range(6):
    plt.plot(test_points, stats.multivariate_normal.rvs(cov=cov))
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
Posterior predictive density
\begin{eqnarray}
p(f(X_*)|X_*,X,y) &\sim& N(\mu_*,\Sigma_*)\\
\mu_* &=& \color{green}{K_*}^TK^{-1}y\\
\Sigma_* &=& \color{red}{K_{**}}-\color{green}{K_*}^TK^{-1}\color{green}{K_*}\\
K &=& K(X,X)\\
\color{red}{K_{**}} &=& K(X_*,X_*)\\
\color{green}{K_*} &=& K(X,X_*)
\end{eqnarray}
If $X$ and $X_\ast$ are far apart, $\color{green}{K_\ast}=K(X,X_\ast)$ approaches zero, so $\color{green}{K_\ast}^TK^{-1}\color{green}{K_\ast}$ also approaches zero, and $\Sigma_\ast=\color{red}{K_{\ast\ast}}-\color{green}{K_\ast}^TK^{-1}\color{green}{K_\ast}$ grows. In other words, the function far from the data points has large uncertainty.
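A quick numerical check of this statement, using a hypothetical test point `x_far` well outside the data range $[0, 10]$ and the `eta`, `rho`, `sigma`, and `squared_distance` defined above:
K_train = eta * np.exp(-rho * squared_distance(x, x))     # K(X, X)
np.fill_diagonal(K_train, eta + sigma)                    # noise on the diagonal
x_far = np.array([50.0])                                  # far from every observed x
k_far = eta * np.exp(-rho * squared_distance(x, x_far))   # the K_* row for x_far, shape (1, 20)
print(k_far.max())                                        # ~0: the data carry no information about x_far
print(eta - k_far @ np.linalg.inv(K_train) @ k_far.T)     # Sigma_* ~ eta, i.e. back to the prior variance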
np.random.seed(1)
K_oo = eta * np.exp(-rho * D)
D_x = squared_distance(x, x)
K = eta * np.exp(-rho * D_x)
diag_x = eta + sigma
np.fill_diagonal(K, diag_x)
D_off_diag = squared_distance(x, test_points)
K_o = eta * np.exp(-rho * D_off_diag)
# Posterior mean
mu_post = np.dot(np.dot(K_o, np.linalg.inv(K)), y)
# Posterior covariance
SIGMA_post = K_oo - np.dot(np.dot(K_o, np.linalg.inv(K)), K_o.T)
for i in range(100):
    fx = stats.multivariate_normal.rvs(mean=mu_post, cov=SIGMA_post)
    plt.plot(test_points, fx, 'r-', alpha=0.1)
plt.plot(x, y, 'o')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
Cholesky decomposition
Since computing the inverse of a matrix costs on the order of $O(n^3)$ operations, the calculation above is expensive. The Cholesky decomposition is therefore used instead.
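A minimal sketch of the idea, reusing the `K`, `K_o`, and `y` from the previous section (`np.linalg.solve` stands in here for a dedicated triangular solver):
L_chol = np.linalg.cholesky(K)                                 # K = L_chol @ L_chol.T
alpha = np.linalg.solve(L_chol.T, np.linalg.solve(L_chol, y))  # alpha = K^{-1} y via two solves
mu_post_chol = np.dot(K_o, alpha)                              # same posterior mean as np.dot(np.dot(K_o, np.linalg.inv(K)), y)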
np.random.seed(1)
eta = 1
rho = 0.5
sigma = 0.03
# This is the true unknown function we are trying to approximate
f = lambda x: np.sin(x).flatten()
# Define the kernel
def kernel(a, b):
    """ GP squared exponential kernel """
    D = np.sum(a**2, 1).reshape(-1, 1) + np.sum(b**2, 1) - 2 * np.dot(a, b.T)
    return eta * np.exp(-rho * D)
N = 20 # number of training points.
n = 100 # number of test points.
# Sample some input points and noisy versions of the function evaluated at
# these points.
X = np.random.uniform(0, 10, size=(N,1))
y = f(X) + sigma * np.random.randn(N)
K = kernel(X, X)
L = np.linalg.cholesky(K + sigma * np.eye(N))
# points we're going to make predictions at.
Xtest = np.linspace(0, 10, n).reshape(-1,1)
# compute the mean at our test points.
Lk = np.linalg.solve(L, kernel(X, Xtest))
mu = np.dot(Lk.T, np.linalg.solve(L, y))
# compute the variance at our test points.
K_ = kernel(Xtest, Xtest)
sd_pred = (np.diag(K_) - np.sum(Lk**2, axis=0))**0.5
plt.fill_between(Xtest.flat, mu - 2 * sd_pred, mu + 2 * sd_pred, color="r", alpha=0.2)
plt.plot(Xtest, mu, 'r', lw=2)
plt.plot(x, y, 'o')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
import warnings
warnings.filterwarnings('ignore')
with pm.Model() as GP:
    mu = np.zeros(N)
    eta = pm.HalfCauchy('eta', 5)
    rho = pm.HalfCauchy('rho', 5)
    sigma = pm.HalfCauchy('sigma', 5)
    D = squared_distance(x, x)
    K = tt.fill_diagonal(eta * pm.math.exp(-rho * D), eta + sigma)
    obs = pm.MvNormal('obs', mu, cov=K, observed=y)
    test_points = np.linspace(0, 10, 100)
    D_pred = squared_distance(test_points, test_points)
    D_off_diag = squared_distance(x, test_points)
    K_oo = eta * pm.math.exp(-rho * D_pred)
    K_o = eta * pm.math.exp(-rho * D_off_diag)
    mu_post = pm.Deterministic('mu_post', pm.math.dot(pm.math.dot(K_o, tt.nlinalg.matrix_inverse(K)), y))
    SIGMA_post = pm.Deterministic('SIGMA_post', K_oo - pm.math.dot(pm.math.dot(K_o, tt.nlinalg.matrix_inverse(K)), K_o.T))
    start = pm.find_MAP()
    trace = pm.sample(1000, start=start)
logp = 15.78, ||grad|| = 1.6532e-05: 100%|██████████| 22/22 [00:00<00:00, 647.40it/s]
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, rho, eta]
Sampling 4 chains, 0 divergences: 100%|██████████| 6000/6000 [01:37<00:00, 61.74draws/s]
varnames = ['eta', 'rho', 'sigma']
chain = trace[100:]
pm.traceplot(chain, varnames)
plt.show()
pm.summary(chain, varnames).round(4)
|       | mean  | sd    | hpd_3% | hpd_97% | mcse_mean | mcse_sd | ess_mean | ess_sd | ess_bulk | ess_tail | r_hat |
|-------|-------|-------|--------|---------|-----------|---------|----------|--------|----------|----------|-------|
| eta   | 2.729 | 3.053 | 0.234  | 7.215   | 0.082     | 0.060   | 1370.0   | 1303.0 | 1494.0   | 1730.0   | 1.0   |
| rho   | 0.132 | 0.056 | 0.055  | 0.233   | 0.001     | 0.001   | 1396.0   | 1354.0 | 1470.0   | 1622.0   | 1.0   |
| sigma | 0.001 | 0.000 | 0.000  | 0.001   | 0.000     | 0.000   | 1297.0   | 1254.0 | 1631.0   | 1790.0   | 1.0   |
y_pred = [np.random.multivariate_normal(m, S) for m,S in zip(chain['mu_post'][::5], chain['SIGMA_post'][::5])]
for yp in y_pred:
    plt.plot(test_points, yp, 'r-', alpha=0.1)
plt.plot(x, y, 'bo')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
Periodic Kernel
periodic = lambda x, y: np.array([[np.sin((x[i] - y[j])/2)**2 for i in range(len(x))] for j in range(len(y))])
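Written out, the periodic kernel replaces the squared distance with a squared sine of half the difference (matching the lambda above and the covariance construction in the model below, with $\eta+\sigma$ on the diagonal as before):
\begin{eqnarray}
K(x, x') = \eta \exp\left(-\rho \sin^2\left(\frac{x - x'}{2}\right)\right)
\end{eqnarray}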
with pm.Model() as GP_periodic:
    mu = np.zeros(N)
    eta = pm.HalfCauchy('eta', 5)
    rho = pm.HalfCauchy('rho', 5)
    sigma = pm.HalfCauchy('sigma', 5)
    P = periodic(x, x)
    K = tt.fill_diagonal(eta * pm.math.exp(-rho * P), eta + sigma)
    obs = pm.MvNormal('obs', mu, cov=K, observed=y)
    test_points = np.linspace(0, 10, 100)
    D_pred = periodic(test_points, test_points)
    D_off_diag = periodic(x, test_points)
    K_oo = eta * pm.math.exp(-rho * D_pred)
    K_o = eta * pm.math.exp(-rho * D_off_diag)
    mu_post = pm.Deterministic('mu_post', pm.math.dot(pm.math.dot(K_o, tt.nlinalg.matrix_inverse(K)), y))
    SIGMA_post = pm.Deterministic('SIGMA_post', K_oo - pm.math.dot(pm.math.dot(K_o, tt.nlinalg.matrix_inverse(K)), K_o.T))
    start = pm.find_MAP()
    trace = pm.sample(1000, start=start)
/opt/conda/lib/python3.7/site-packages/theano/tensor/basic.py:6611: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
result[diagonal_slice] = x
logp = 23.985, ||grad|| = 1.9188: 100%|██████████| 18/18 [00:00<00:00, 797.56it/s]
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, rho, eta]
Sampling 4 chains, 0 divergences: 100%|██████████| 6000/6000 [01:01<00:00, 98.19draws/s]
varnames = ['eta', 'rho', 'sigma']
chain = trace[100:]
pm.traceplot(chain, varnames);
y_pred = [np.random.multivariate_normal(m, S) for m,S in zip(chain['mu_post'][::5], chain['SIGMA_post'][::5])]
for yp in y_pred:
    plt.plot(test_points, yp, 'r-', alpha=0.1)
plt.plot(x, y, 'bo')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$f(x)$', fontsize=16, rotation=0)
plt.show()
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: RuntimeWarning: covariance is not positive-semidefinite.
"""Entry point for launching an IPython kernel.