A paragami version of autograd's simple neural net example.
In the notebook below, I make a version of autograd's very simple neural network example using my paragami package as a more readable alternative to autograd's flatten function.
The notebook itself can be downloaded here.
In this notebook, I build autograd's neural net example in paragami. In paragami, the code is more verbose but, I would argue, significantly more readable and less error-prone. It turns out that paragami is measurably faster than autograd.misc.flatten, though I'm not sure how well the speed improvement generalizes.
The original autograd example is here: https://github.com/HIPS/autograd/blob/master/examples/neural_net_regression.py
import autograd
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.misc import flatten
import autograd.scipy.stats.norm as norm
from autograd.misc.optimizers import adam
import paragami
import time
def build_toy_dataset(n_data=80, noise_std=0.1):
rs = npr.RandomState(0)
    inputs = np.concatenate([np.linspace(0, 3, num=n_data // 2),
                             np.linspace(6, 8, num=n_data // 2)])
targets = np.cos(inputs) + rs.randn(n_data) * noise_std
inputs = (inputs - 4.0) / 2.0
inputs = inputs[:, np.newaxis]
targets = targets[:, np.newaxis] / 2.0
return inputs, targets
inputs, targets = build_toy_dataset()
First, let's copy the example autograd implementation. Each layer has a set of weight and bias parameters. The example stores this parameter set as a list of (weights, biases) tuples and uses autograd.misc.flatten to map it to a single flat vector.
def init_random_params(scale, layer_sizes, rs=npr.RandomState(0)):
"""Build a list of (weights, biases) tuples, one for each layer."""
return [(rs.randn(insize, outsize) * scale, # weight matrix
rs.randn(outsize) * scale) # bias vector
for insize, outsize in zip(layer_sizes[:-1], layer_sizes[1:])]
def nn_predict(params, inputs, nonlinearity=np.tanh):
for W, b in params:
outputs = np.dot(inputs, W) + b
inputs = nonlinearity(outputs)
return outputs
# Note that this may silently do a nonsense thing if you later make params
# contain additional parameters that are not biases or weights. In this
# toy model it's probably fine, but in more complicated settings this
# is bad modeling practice in my opinion.
def log_gaussian(params, scale):
flat_params, _ = flatten(params)
return np.sum(norm.logpdf(flat_params, 0, scale))
def logprob(weights, inputs, targets, noise_scale=0.1):
predictions = nn_predict(weights, inputs)
return np.sum(norm.logpdf(predictions, targets, noise_scale))
def objective(weights, t):
return -logprob(weights, inputs, targets)\
-log_gaussian(weights, weight_prior_variance)
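As a quick aside (my addition, not part of the original example), here is what autograd.misc.flatten does with this structure: it maps the nested list of (weights, biases) tuples to a single flat vector and returns an unflatten function that reverses the mapping.
# A small illustration of autograd.misc.flatten (my addition, not in the original example).
example_params = init_random_params(0.1, [1, 4, 1])
flat_example, unflatten = flatten(example_params)
print(flat_example.shape)           # all weights and biases concatenated into one vector
restored = unflatten(flat_example)  # same nested list-of-tuples structure as example_params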
Now, let's implement the same thing using paragami. Rather than store the parameters as a list of tuples, we store them as a dictionary of dictionaries, represented with a corresponding pattern.
We'll also represent the prior and model parameters with a dictionary. This will allow us to perform sensitivity analysis with vittles (which is beyond the scope of the present notebook).
# Each layer has its own pattern, with `w` for weights and `b` for biases.
def get_layer_pattern(input_size, output_size):
layer_pattern = paragami.PatternDict()
layer_pattern['w'] = paragami.NumericArrayPattern((input_size, output_size))
layer_pattern['b'] = paragami.NumericVectorPattern(output_size)
return layer_pattern
# The pattern for the whole network is a dictionary of the layer patterns.
def get_nn_pattern(layer_sizes):
pattern = paragami.PatternDict(free_default=True)
for l in range(len(layer_sizes) - 1):
insize = layer_sizes[l]
outsize = layer_sizes[l + 1]
pattern[str(l)] = get_layer_pattern(insize, outsize)
return pattern
prior_pattern = paragami.PatternDict(free_default=False)
prior_pattern['w_scale'] = paragami.NumericScalarPattern(lb=0)
prior_pattern['b_scale'] = paragami.NumericScalarPattern(lb=0)
prior_pattern['noise_scale'] = paragami.NumericScalarPattern(lb=0)
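As a small illustration (my addition, not from the original notebook), a pattern can flatten a folded value into a flat vector and fold it back. With free=False the flat vector simply contains the constrained values; with free=True, bound constraints like positivity are handled through an unconstrained parameterization.
# A sketch of flattening and folding with the prior pattern (my addition).
example_prior = {'w_scale': 10.0, 'b_scale': 10.0, 'noise_scale': 0.1}
prior_flat_example = prior_pattern.flatten(example_prior)  # free_default=False here
print(prior_flat_example)                      # a length-3 vector of the values themselves
print(prior_pattern.fold(prior_flat_example))  # back to the dictionary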
def init_random_params_paragami(scale, layer_sizes, rs=npr.RandomState(0)):
"""Build a list of (weights, biases) tuples, one for each layer."""
params = dict()
for l in range(len(layer_sizes) - 1):
insize = layer_sizes[l]
outsize = layer_sizes[l + 1]
params[str(l)] = dict()
params[str(l)]['w'] = rs.randn(insize, outsize) * scale
params[str(l)]['b'] = rs.randn(outsize) * scale
return params
def nn_predict_paragami(params, inputs, nonlinearity=np.tanh):
num_layers = len(params)
for l in range(num_layers):
lpar = params[str(l)]
outputs = np.dot(inputs, lpar['w']) + lpar['b']
inputs = nonlinearity(outputs)
return outputs
# I'll allow separate scales for the weights and biases and refer to them
# by name. If you later add parameters to the dictionary, this will
# still only add priors to the weights and biases.
def log_gaussian_paragami(params, prior_par):
log_prior = 0.0
num_layers = len(params)
for l in range(num_layers):
lpar = params[str(l)]
log_prior += np.sum(norm.logpdf(lpar['w'], 0, prior_par['w_scale']))
log_prior += np.sum(norm.logpdf(lpar['b'], 0, prior_par['b_scale']))
return log_prior
def logprob_paragami(params, inputs, targets, prior_par):
predictions = nn_predict_paragami(params, inputs)
return np.sum(norm.logpdf(predictions, targets, prior_par['noise_scale']))
def folded_objective(params, prior_par):
return -logprob_paragami(params, inputs, targets, prior_par) \
-log_gaussian_paragami(params, prior_par)
Here are some functions for comparing the two methods.
def assert_arrays_almost_equal(a, b, tol=1e-12):
if np.max(np.abs(a - b)) > tol:
raise ValueError('Not equal')
def convert_params_to_paragami(params, nn_pattern):
params_paragami = dict()
for l in range(len(nn_pattern.keys())):
params_paragami[str(l)] = dict()
params_paragami[str(l)]['w'] = params[l][0]
params_paragami[str(l)]['b'] = params[l][1]
valid, msg = nn_pattern.validate_folded(params_paragami)
if not valid:
raise ValueError(msg)
return params_paragami
Get the initial parameters.
init_scale = 0.1
weight_prior_variance = 10.0
layer_sizes = [1, 4, 4, 1]
init_params = init_random_params(init_scale, layer_sizes=layer_sizes)
Get the paragami patterns.
nn_pattern = get_nn_pattern(layer_sizes)
init_params_paragami = convert_params_to_paragami(init_params, nn_pattern)
print(nn_pattern)
prior_par = { 'w_scale': weight_prior_variance,
'b_scale': weight_prior_variance,
'noise_scale': 0.1}
# A nice feature of paragami is that we can check whether parameters have valid
# shapes and values.
assert nn_pattern.validate_folded(init_params_paragami)[0]
assert prior_pattern.validate_folded(prior_par)[0]
For paragami, we need to define the flattened objective ourselves. For autograd, this is already done with the @unflatten_optimizer decorator on their implementation of adam. Of course, one could define a similar decorator for paragami. Personally, I like to be able to choose separately how I flatten and how I optimize.
init_params_flat = nn_pattern.flatten(init_params_paragami)
prior_par_flat = prior_pattern.flatten(prior_par)
flat_objective = paragami.FlattenFunctionInput(
folded_objective, [ nn_pattern, prior_pattern ],
free=[True, False], argnums=[0, 1])
def objective_paragami(par, t):
return flat_objective(par, prior_par_flat)
# Check that the two objectives are identical.
assert np.abs(folded_objective(init_params_paragami, prior_par) -
objective(init_params, 0)) < 1e-12
assert np.abs(objective_paragami(init_params_flat, 0) -
objective(init_params, 0)) < 1e-12
Perform optimization on the autograd version.
def callback(params, t, g, print_every=50):
if t % print_every == 0:
print("Iteration {} log likelihood {}".format(
t, -objective(params, t)))
print("Optimizing network parameters...")
base_time = time.time()
optimized_params = adam(grad(objective), init_params,
step_size=0.01, num_iters=1000, callback=callback)
base_time = time.time() - base_time
print('Optimized in {} seconds.'.format(base_time))
Perform optimization on the paragami version.
def callback_paragami(params_flat, t, g, print_every=50):
if t % print_every == 0:
print("Iteration {} log likelihood {}".format(
t, -objective_paragami(params_flat, t)))
print("Optimizing network parameters...")
paragami_time = time.time()
optimized_params_flat = \
adam(grad(objective_paragami), init_params_flat,
step_size=0.01, num_iters=1000, callback=callback_paragami)
paragami_time = time.time() - paragami_time
print('Optimized in {} seconds.'.format(paragami_time))
print('autograd implementation optimized in {} seconds.'.format(base_time))
print('paragami implementation optimized in {} seconds.'.format(paragami_time))
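Finally, here is a short sketch (my addition) of folding the optimized flat vector back into the nested dictionary form so that it can be used with the folded functions above.
# Fold the optimized flat parameters back into their dictionary form (my addition).
optimized_params_paragami = nn_pattern.fold(optimized_params_flat)
print('Final log likelihood: {}'.format(
    -folded_objective(optimized_params_paragami, prior_par)))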