import autograd
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.misc import flatten
import autograd.scipy.stats.norm as norm
from autograd.misc.optimizers import adam
import paragami
import time
In this notebook, I build autograd’s neural net example in paragami. In paragami, the code is more verbose but, I would argue, significantly more readable and less error-prone. It turns out that paragami is measurably faster than autograd.misc.flatten, though I’m not sure how well the speed improvement generalizes.
https://github.com/HIPS/autograd/blob/master/examples/neural_net_regression.py
def build_toy_dataset(n_data=80, noise_std=0.1):
    rs = npr.RandomState(0)
    inputs = np.concatenate([np.linspace(0, 3, num=n_data // 2),
                             np.linspace(6, 8, num=n_data // 2)])
    targets = np.cos(inputs) + rs.randn(n_data) * noise_std
    inputs = (inputs - 4.0) / 2.0
    inputs = inputs[:, np.newaxis]
    targets = targets[:, np.newaxis] / 2.0
    return inputs, targets

inputs, targets = build_toy_dataset()
First, let’s copy the example autograd implementation. Each layer has a set of weight and bias parameters, and the parameter set is represented as a list of (weights, biases) tuples; the example uses autograd.misc.flatten to convert this nested structure into a flat vector when needed.
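To make the role of autograd.misc.flatten concrete, here is a minimal sketch (not part of the original example; the toy_* names are mine): flatten maps an arbitrary nested container of arrays to a single 1-d vector and returns a function that undoes the mapping.

# A minimal illustration of autograd.misc.flatten (names are illustrative only).
toy_params = [(np.ones((2, 3)), np.zeros(3))]  # one (weights, biases) layer
toy_flat, toy_unflatten = flatten(toy_params)
print(toy_flat.shape)                          # (9,): 2 * 3 weights + 3 biases
print(toy_unflatten(toy_flat)[0][0].shape)     # (2, 3): the weight matrix again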
def init_random_params(scale, layer_sizes, rs=npr.RandomState(0)):
"""Build a list of (weights, biases) tuples, one for each layer."""
return [(rs.randn(insize, outsize) * scale, # weight matrix
* scale) # bias vector
rs.randn(outsize) for insize, outsize in zip(layer_sizes[:-1], layer_sizes[1:])]
def nn_predict(params, inputs, nonlinearity=np.tanh):
    for W, b in params:
        outputs = np.dot(inputs, W) + b
        inputs = nonlinearity(outputs)
    return outputs
# Note that this may silently do a nonsense thing if you later make params
# contain additional parameters that are not biases or weights. In this
# toy model it's probably fine, but in more complicated settings this
# is bad modeling practice in my opinion.
def log_gaussian(params, scale):
    flat_params, _ = flatten(params)
    return np.sum(norm.logpdf(flat_params, 0, scale))
def logprob(weights, inputs, targets, noise_scale=0.1):
    predictions = nn_predict(weights, inputs)
    return np.sum(norm.logpdf(predictions, targets, noise_scale))
def objective(weights, t):
    return -logprob(weights, inputs, targets) \
           -log_gaussian(weights, weight_prior_variance)
Now, let’s implement the same thing using paragami. Rather than store the parameters as a list of tuples, we store them as a dictionary of dictionaries, represented with a corresponding pattern. We’ll also represent the prior and model parameters with a dictionary. This will allow us to perform sensitivity analysis with vittles (which is beyond the scope of the present notebook).
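As a quick illustration of what a pattern provides (a sketch of my own, not from the notebook; toy_pattern and toy_par are made-up names), a pattern can flatten a "folded" parameter, here a dictionary of arrays, into a single vector, and fold that vector back into the original structure.

# A small fold/flatten round trip with a paragami pattern (illustrative names).
toy_pattern = paragami.PatternDict(free_default=True)
toy_pattern['a'] = paragami.NumericArrayPattern((2, 2))
toy_pattern['b'] = paragami.NumericVectorPattern(3)

toy_par = {'a': np.eye(2), 'b': np.arange(3.0)}  # a "folded" parameter
toy_flat = toy_pattern.flatten(toy_par)          # a single 1-d vector of length 7
toy_folded = toy_pattern.fold(toy_flat)          # back to the dictionary of arrays
print(np.max(np.abs(toy_folded['a'] - toy_par['a'])))  # 0.0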
# Each layer has its own pattern, with `w` for weights and `b` for biases.
def get_layer_pattern(input_size, output_size):
    layer_pattern = paragami.PatternDict()
    layer_pattern['w'] = paragami.NumericArrayPattern((input_size, output_size))
    layer_pattern['b'] = paragami.NumericVectorPattern(output_size)
    return layer_pattern
# The pattern for the whole network is a dictionary of the layer patterns.
def get_nn_pattern(layer_sizes):
    pattern = paragami.PatternDict(free_default=True)
    for l in range(len(layer_sizes) - 1):
        insize = layer_sizes[l]
        outsize = layer_sizes[l + 1]
        pattern[str(l)] = get_layer_pattern(insize, outsize)
    return pattern
prior_pattern = paragami.PatternDict(free_default=False)
prior_pattern['w_scale'] = paragami.NumericScalarPattern(lb=0)
prior_pattern['b_scale'] = paragami.NumericScalarPattern(lb=0)
prior_pattern['noise_scale'] = paragami.NumericScalarPattern(lb=0)
def init_random_params_paragami(scale, layer_sizes, rs=npr.RandomState(0)):
"""Build a list of (weights, biases) tuples, one for each layer."""
= dict()
params for l in range(len(layer_sizes) - 1):
= layer_sizes[l]
insize = layer_sizes[l + 1]
outsize str(l)] = dict()
params[str(l)]['w'] = rs.randn(insize, outsize) * scale
params[str(l)]['b'] = rs.randn(outsize) * scale
params[return params
def nn_predict_paragami(params, inputs, nonlinearity=np.tanh):
    num_layers = len(params)
    for l in range(num_layers):
        lpar = params[str(l)]
        outputs = np.dot(inputs, lpar['w']) + lpar['b']
        inputs = nonlinearity(outputs)
    return outputs
# I'll allow separate scales for the weights and biases and refer to them
# by name. If you later add parameters to the dictionary, this will
# still only add priors to the weights and biases.
def log_gaussian_paragami(params, prior_par):
    log_prior = 0.0
    num_layers = len(params)
    for l in range(num_layers):
        lpar = params[str(l)]
        log_prior += np.sum(norm.logpdf(lpar['w'], 0, prior_par['w_scale']))
        log_prior += np.sum(norm.logpdf(lpar['b'], 0, prior_par['b_scale']))
    return log_prior
def logprob_paragami(params, inputs, targets, prior_par):
    predictions = nn_predict_paragami(params, inputs)
    return np.sum(norm.logpdf(predictions, targets, prior_par['noise_scale']))
def folded_objective(params, prior_par):
    return -logprob_paragami(params, inputs, targets, prior_par) \
           -log_gaussian_paragami(params, prior_par)
Here are some functions for comparing the two methods.
def assert_arrays_almost_equal(a, b, tol=1e-12):
    if np.max(np.abs(a - b)) > tol:
        raise ValueError('Not equal')
def convert_params_to_paragami(params, nn_pattern):
    params_paragami = dict()
    for l in range(len(nn_pattern.keys())):
        params_paragami[str(l)] = dict()
        params_paragami[str(l)]['w'] = params[l][0]
        params_paragami[str(l)]['b'] = params[l][1]
    valid, msg = nn_pattern.validate_folded(params_paragami)
    if not valid:
        raise ValueError(msg)
    return params_paragami
Get the initial parameters and patterns.
init_scale = 0.1
weight_prior_variance = 10.0
layer_sizes = [1, 4, 4, 1]

init_params = init_random_params(init_scale, layer_sizes=layer_sizes)
Get the paragami patterns.
nn_pattern = get_nn_pattern(layer_sizes)
init_params_paragami = convert_params_to_paragami(init_params, nn_pattern)
print(nn_pattern)
prior_par = {'w_scale': weight_prior_variance,
             'b_scale': weight_prior_variance,
             'noise_scale': 0.1}
# A nice feature of paragami is that we can check whether parameters have valid
# shapes and values.
assert nn_pattern.validate_folded(init_params_paragami)[0]
assert prior_pattern.validate_folded(prior_par)[0]
OrderedDict:
[0] = OrderedDict:
[w] = NumericArrayPattern (1, 4) (lb=-inf, ub=inf)
[b] = NumericArrayPattern (4,) (lb=-inf, ub=inf)
[1] = OrderedDict:
[w] = NumericArrayPattern (4, 4) (lb=-inf, ub=inf)
[b] = NumericArrayPattern (4,) (lb=-inf, ub=inf)
[2] = OrderedDict:
[w] = NumericArrayPattern (4, 1) (lb=-inf, ub=inf)
[b] = NumericArrayPattern (1,) (lb=-inf, ub=inf)
For paragami, we need to define the flattened objective ourselves. For autograd, this is already done with the @unflatten_optimizer decorator on their implementation of adam. Of course, one could define a similar decorator for paragami. Personally, I like to be able to choose separately how I flatten and how I optimize.
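For instance, such a decorator-style wrapper might look like the following sketch (my own illustration, not part of paragami or autograd; folded_adam is a made-up name). It free-flattens the objective's parameter argument with the given pattern, runs adam on the flat vector, and folds the optimum back into the original structure.

# A sketch of a fold-aware wrapper around adam (illustrative only).
def folded_adam(folded_obj, init_folded, pattern, **adam_kwargs):
    # Flatten the first argument so autograd differentiates with respect to a
    # single unconstrained vector.  adam passes the iteration count as the
    # second argument, which is forwarded unchanged.
    flat_obj = paragami.FlattenFunctionInput(
        folded_obj, pattern, free=True, argnums=0)
    init_flat = pattern.flatten(init_folded, free=True)
    opt_flat = adam(grad(flat_obj), init_flat, **adam_kwargs)
    return pattern.fold(opt_flat, free=True)

With the patterns above, this could be called as folded_adam(lambda p, t: folded_objective(p, prior_par), init_params_paragami, nn_pattern, step_size=0.01, num_iters=1000).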
init_params_flat = nn_pattern.flatten(init_params_paragami)
prior_par_flat = prior_pattern.flatten(prior_par)
flat_objective = paragami.FlattenFunctionInput(
    folded_objective, [nn_pattern, prior_pattern],
    free=[True, False], argnums=[0, 1])
def objective_paragami(par, t):
    return flat_objective(par, prior_par_flat)
# Check that the two objectives are identical.
assert np.abs(folded_objective(init_params_paragami, prior_par) -
              objective(init_params, 0)) < 1e-12
assert np.abs(objective_paragami(init_params_flat, 0) -
              objective(init_params, 0)) < 1e-12
Perform optimization on the autograd version.
def callback(params, t, g, print_every=50):
    if t % print_every == 0:
        print("Iteration {} log likelihood {}".format(
            t, -objective(params, t)))
print("Optimizing network parameters...")
base_time = time.time()
optimized_params = adam(grad(objective), init_params,
                        step_size=0.01, num_iters=1000, callback=callback)
base_time = time.time() - base_time
print('Optimized in {} seconds.'.format(base_time))
Optimizing network parameters...
Iteration 0 log likelihood -632.7841008314174
Iteration 50 log likelihood -384.2830389865702
Iteration 100 log likelihood -134.87342397542312
Iteration 150 log likelihood -10.500006545543343
Iteration 200 log likelihood -8.58780338607312
Iteration 250 log likelihood -6.856004924106443
Iteration 300 log likelihood -5.904673818790883
Iteration 350 log likelihood -5.5546502379648075
Iteration 400 log likelihood -5.430400278290989
Iteration 450 log likelihood -5.465832705012744
Iteration 500 log likelihood -5.345904727294112
Iteration 550 log likelihood -5.3165477163621375
Iteration 600 log likelihood -5.293316778869155
Iteration 650 log likelihood -5.272816900458096
Iteration 700 log likelihood -5.239799393394577
Iteration 750 log likelihood -5.213782130592122
Iteration 800 log likelihood -5.274903712577299
Iteration 850 log likelihood -5.1663544276520526
Iteration 900 log likelihood -5.1411319281978365
Iteration 950 log likelihood -5.368775500962386
Optimized in 3.2963759899139404 seconds.
Perform optimization on the paragami version.
def callback_paragami(params_flat, t, g, print_every=50):
    if t % print_every == 0:
        print("Iteration {} log likelihood {}".format(
            t, -objective_paragami(params_flat, t)))
print("Optimizing network parameters...")
paragami_time = time.time()
optimized_params_flat = \
    adam(grad(objective_paragami), init_params_flat,
         step_size=0.01, num_iters=1000, callback=callback_paragami)
paragami_time = time.time() - paragami_time
print('Optimized in {} seconds.'.format(paragami_time))
Optimizing network parameters...
Iteration 0 log likelihood -632.7841008314174
Iteration 50 log likelihood -384.2830389865702
Iteration 100 log likelihood -134.87342397542312
Iteration 150 log likelihood -10.500006545543329
Iteration 200 log likelihood -8.58780338607312
Iteration 250 log likelihood -6.856004924106458
Iteration 300 log likelihood -5.904673818790883
Iteration 350 log likelihood -5.554650237964793
Iteration 400 log likelihood -5.430400278291003
Iteration 450 log likelihood -5.46583270501273
Iteration 500 log likelihood -5.345904727294112
Iteration 550 log likelihood -5.3165477163621375
Iteration 600 log likelihood -5.293316778869155
Iteration 650 log likelihood -5.2728169004581105
Iteration 700 log likelihood -5.239799393394577
Iteration 750 log likelihood -5.213782130592108
Iteration 800 log likelihood -5.274903712577299
Iteration 850 log likelihood -5.166354427652038
Iteration 900 log likelihood -5.1411319281978365
Iteration 950 log likelihood -5.368775500962386
Optimized in 3.0171921253204346 seconds.
print('autograd implementation optimized in {} seconds.'.format(base_time))
print('paragami implementation optimized in {} seconds.'.format(paragami_time))
autograd implementation optimized in 3.2963759899139404 seconds.
paragami implementation optimized in 3.0171921253204346 seconds.
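As a final usage note (a sketch not in the original timing comparison; the variable names below are mine), the optimized flat vector can be folded back into the dictionary-of-dictionaries structure and used for prediction.

# Fold the optimized flat parameters back and make predictions (sketch only).
optimized_params_paragami = nn_pattern.fold(optimized_params_flat, free=True)
fitted_outputs = nn_predict_paragami(optimized_params_paragami, inputs)
print(fitted_outputs.shape)  # (80, 1): one prediction per data point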