In this notebook, I build autograd’s neural net example in paragami.

In paragami, the code is more verbose but, I would argue, significantly more readable and less error-prone. It also turns out that paragami is measurably faster than autograd.misc.flatten, though I’m not sure how well the speed improvement generalizes.

https://github.com/HIPS/autograd/blob/master/examples/neural_net_regression.py

import autograd
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad
from autograd.misc import flatten
import autograd.scipy.stats.norm as norm
from autograd.misc.optimizers import adam

import paragami
import time

def build_toy_dataset(n_data=80, noise_std=0.1):
    rs = npr.RandomState(0)
    inputs  = np.concatenate([np.linspace(0, 3, num=n_data // 2),
                              np.linspace(6, 8, num=n_data // 2)])
    targets = np.cos(inputs) + rs.randn(n_data) * noise_std
    inputs = (inputs - 4.0) / 2.0
    inputs  = inputs[:, np.newaxis]
    targets = targets[:, np.newaxis] / 2.0
    return inputs, targets

inputs, targets = build_toy_dataset()

First, let’s copy the example autograd implementation. Each layer has a set of weight and bias parameters, and the whole parameter set is represented as a list of (weights, biases) tuples. The example uses autograd.misc.flatten to convert this nested structure into a single flat vector where needed (e.g., for the prior).
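
To make this concrete, here is a tiny illustration of my own (not part of the original example) of what autograd.misc.flatten returns: a flat vector together with a function that restores the original nesting.

toy_params = [(np.eye(2), np.zeros(2))]
flat_toy, unflatten_toy = flatten(toy_params)
# flat_toy is a length-6 vector (four weight entries plus two bias entries), and
# unflatten_toy(flat_toy) restores the original list of (weights, biases) tuples.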

def init_random_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Build a list of (weights, biases) tuples, one for each layer."""
    return [(rs.randn(insize, outsize) * scale,   # weight matrix
             rs.randn(outsize) * scale)           # bias vector
            for insize, outsize in zip(layer_sizes[:-1], layer_sizes[1:])]

def nn_predict(params, inputs, nonlinearity=np.tanh):
    for W, b in params:
        outputs = np.dot(inputs, W) + b
        inputs = nonlinearity(outputs)
    return outputs

# Note that this may silently do a nonsense thing if you later make params
# contain additional parameters that are not biases or weights.  In this
# toy model it's probably fine, but in more complicated settings this
# is bad modeling practice in my opinion.
def log_gaussian(params, scale):
    flat_params, _ = flatten(params)
    return np.sum(norm.logpdf(flat_params, 0, scale))

def logprob(weights, inputs, targets, noise_scale=0.1):
    predictions = nn_predict(weights, inputs)
    return np.sum(norm.logpdf(predictions, targets, noise_scale))

def objective(weights, t):
    return -logprob(weights, inputs, targets)\
           -log_gaussian(weights, weight_prior_variance)

Now, let’s implement the same thing using paragami. Rather than storing the parameters as a list of tuples, we store them as a dictionary of dictionaries, represented with a corresponding pattern.

We’ll also represent the prior and model parameters with a dictionary. This will allow us to perform sensitivity analysis with vittles (which is beyond the scope of the present notebook).
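
As a warm-up, here is a minimal sketch of my own (using a made-up two-entry pattern, not the network pattern defined below) of how a paragami pattern maps between a folded, structured value and a flat vector:

toy_pattern = paragami.PatternDict(free_default=True)
toy_pattern['a'] = paragami.NumericArrayPattern((2, 3))
toy_pattern['b'] = paragami.NumericScalarPattern(lb=0)
toy_val = {'a': np.ones((2, 3)), 'b': 2.0}
toy_flat = toy_pattern.flatten(toy_val)    # a flat, unconstrained vector
toy_refolded = toy_pattern.fold(toy_flat)  # back to the original dictionary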

# Each layer has its own pattern, with `w` for weights and `b` for biases.
def get_layer_pattern(input_size, output_size):
    layer_pattern = paragami.PatternDict()
    layer_pattern['w'] = paragami.NumericArrayPattern((input_size, output_size))
    layer_pattern['b'] = paragami.NumericVectorPattern(output_size)
    return layer_pattern

# The pattern for the whole network is a dictionary of the layer patterns.
def get_nn_pattern(layer_sizes):
    pattern = paragami.PatternDict(free_default=True)
    for l in range(len(layer_sizes) - 1):
        insize = layer_sizes[l]
        outsize = layer_sizes[l + 1]
        pattern[str(l)] = get_layer_pattern(insize, outsize)
    return pattern

prior_pattern = paragami.PatternDict(free_default=False)
prior_pattern['w_scale'] = paragami.NumericScalarPattern(lb=0)
prior_pattern['b_scale'] = paragami.NumericScalarPattern(lb=0)
prior_pattern['noise_scale'] = paragami.NumericScalarPattern(lb=0)

def init_random_params_paragami(scale, layer_sizes, rs=npr.RandomState(0)):
    """Build a dictionary of layer dictionaries, each with weights 'w' and biases 'b'."""
    params = dict()
    for l in range(len(layer_sizes) - 1):
        insize = layer_sizes[l]
        outsize = layer_sizes[l + 1]
        params[str(l)] = dict()
        params[str(l)]['w'] = rs.randn(insize, outsize) * scale
        params[str(l)]['b'] = rs.randn(outsize) * scale
    return params

def nn_predict_paragami(params, inputs, nonlinearity=np.tanh):
    num_layers = len(params)
    for l in range(num_layers):
        lpar = params[str(l)]
        outputs = np.dot(inputs, lpar['w']) + lpar['b']
        inputs = nonlinearity(outputs)
    return outputs

# I'll allow separate scales for the weights and biases and refer to them
# by name.  If you later add parameters to the dictionary, this will
# still only add priors to the weights and biases.
def log_gaussian_paragami(params, prior_par):
    log_prior = 0.0
    num_layers = len(params)
    for l in range(num_layers):
        lpar = params[str(l)]
        log_prior += np.sum(norm.logpdf(lpar['w'], 0, prior_par['w_scale']))
        log_prior += np.sum(norm.logpdf(lpar['b'], 0, prior_par['b_scale']))
    return log_prior

def logprob_paragami(params, inputs, targets, prior_par):
    predictions = nn_predict_paragami(params, inputs)
    return np.sum(norm.logpdf(predictions, targets, prior_par['noise_scale']))

def folded_objective(params, prior_par):
    return -logprob_paragami(params, inputs, targets, prior_par) \
           -log_gaussian_paragami(params, prior_par)

Here are some functions for comparing the two methods.

def assert_arrays_almost_equal(a, b, tol=1e-12):
    if np.max(np.abs(a - b)) > tol:
        raise ValueError('Arrays differ by more than {}'.format(tol))

def convert_params_to_paragami(params, nn_pattern):
    params_paragami = dict()
    for l in range(len(nn_pattern.keys())):
        params_paragami[str(l)] = dict()
        params_paragami[str(l)]['w'] = params[l][0]
        params_paragami[str(l)]['b'] = params[l][1]
    valid, msg = nn_pattern.validate_folded(params_paragami)
    if not valid:
        raise ValueError(msg)
    return params_paragami

Get the initial parameters and patterns.

init_scale = 0.1
weight_prior_variance = 10.0
layer_sizes = [1, 4, 4, 1]

init_params = init_random_params(init_scale, layer_sizes=layer_sizes)

Get the paragami patterns.

nn_pattern = get_nn_pattern(layer_sizes)
init_params_paragami = convert_params_to_paragami(init_params, nn_pattern)
print(nn_pattern)

prior_par = { 'w_scale': weight_prior_variance,
              'b_scale': weight_prior_variance,
              'noise_scale': 0.1}

# A nice feature of paragami is that we can check whether parameters have valid
# shapes and values.
assert nn_pattern.validate_folded(init_params_paragami)[0]
assert prior_pattern.validate_folded(prior_par)[0]
OrderedDict:
    [0] = OrderedDict:
    [w] = NumericArrayPattern (1, 4) (lb=-inf, ub=inf)
    [b] = NumericArrayPattern (4,) (lb=-inf, ub=inf)
    [1] = OrderedDict:
    [w] = NumericArrayPattern (4, 4) (lb=-inf, ub=inf)
    [b] = NumericArrayPattern (4,) (lb=-inf, ub=inf)
    [2] = OrderedDict:
    [w] = NumericArrayPattern (4, 1) (lb=-inf, ub=inf)
    [b] = NumericArrayPattern (1,) (lb=-inf, ub=inf)
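
As a quick sanity check of my own (not in the original example), the pattern can report the length of its flat representation. With layer sizes [1, 4, 4, 1] there are 1*4 + 4*4 + 4*1 = 24 weights and 4 + 4 + 1 = 9 biases, so the flat vector should have 33 entries:

# Every entry is an unconstrained array, so the free and non-free lengths agree.
print(nn_pattern.flat_length(free=True))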

For paragami, we need to define the flattened objective ourselves.

For autograd, this is already done by the @unflatten_optimizer decorator applied to their implementation of adam. Of course, one could define a similar decorator for paragami (a rough sketch follows below). Personally, I like to be able to choose separately how I flatten and how I optimize.
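
For example, a bare-bones version of such a decorator might look like the following sketch of my own; it only folds the first argument, and is essentially what paragami.FlattenFunctionInput does more generally below.

def flatten_objective_input(pattern, free=True):
    # Turn a function of a folded value into a function of a flat vector.
    def decorator(folded_fun):
        def flat_fun(flat_val, *args, **kwargs):
            return folded_fun(pattern.fold(flat_val, free=free), *args, **kwargs)
        return flat_fun
    return decorator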

init_params_flat = nn_pattern.flatten(init_params_paragami)
prior_par_flat = prior_pattern.flatten(prior_par)
flat_objective = paragami.FlattenFunctionInput(
    folded_objective, [nn_pattern, prior_pattern],
    free=[True, False], argnums=[0, 1])

def objective_paragami(par, t):
    return flat_objective(par, prior_par_flat)

# Check that the two objectives are identical.
assert np.abs(folded_objective(init_params_paragami, prior_par) -
              objective(init_params, 0)) < 1e-12

assert np.abs(objective_paragami(init_params_flat, 0) -
              objective(init_params, 0)) < 1e-12

Perform optimization on the autograd version.

def callback(params, t, g, print_every=50):
    if t % print_every == 0:
        print("Iteration {} log likelihood {}".format(
            t, -objective(params, t)))

print("Optimizing network parameters...")
base_time = time.time()
optimized_params = adam(grad(objective), init_params,
                        step_size=0.01, num_iters=1000, callback=callback)
base_time = time.time() - base_time
print('Optimized in {} seconds.'.format(base_time))
Optimizing network parameters...
Iteration 0 log likelihood -632.7841008314174
Iteration 50 log likelihood -384.2830389865702
Iteration 100 log likelihood -134.87342397542312
Iteration 150 log likelihood -10.500006545543343
Iteration 200 log likelihood -8.58780338607312
Iteration 250 log likelihood -6.856004924106443
Iteration 300 log likelihood -5.904673818790883
Iteration 350 log likelihood -5.5546502379648075
Iteration 400 log likelihood -5.430400278290989
Iteration 450 log likelihood -5.465832705012744
Iteration 500 log likelihood -5.345904727294112
Iteration 550 log likelihood -5.3165477163621375
Iteration 600 log likelihood -5.293316778869155
Iteration 650 log likelihood -5.272816900458096
Iteration 700 log likelihood -5.239799393394577
Iteration 750 log likelihood -5.213782130592122
Iteration 800 log likelihood -5.274903712577299
Iteration 850 log likelihood -5.1663544276520526
Iteration 900 log likelihood -5.1411319281978365
Iteration 950 log likelihood -5.368775500962386
Optimized in 3.2963759899139404 seconds.

Perform optimization on the paragami version.

def callback_paragami(params_flat, t, g, print_every=50):
    if t % print_every == 0:
        print("Iteration {} log likelihood {}".format(
            t, -objective_paragami(params_flat, t)))

print("Optimizing network parameters...")
paragami_time = time.time()
optimized_params_flat = \
    adam(grad(objective_paragami), init_params_flat,
         step_size=0.01, num_iters=1000, callback=callback_paragami)
paragami_time = time.time() - paragami_time
print('Optimized in {} seconds.'.format(paragami_time))
Optimizing network parameters...
Iteration 0 log likelihood -632.7841008314174
Iteration 50 log likelihood -384.2830389865702
Iteration 100 log likelihood -134.87342397542312
Iteration 150 log likelihood -10.500006545543329
Iteration 200 log likelihood -8.58780338607312
Iteration 250 log likelihood -6.856004924106458
Iteration 300 log likelihood -5.904673818790883
Iteration 350 log likelihood -5.554650237964793
Iteration 400 log likelihood -5.430400278291003
Iteration 450 log likelihood -5.46583270501273
Iteration 500 log likelihood -5.345904727294112
Iteration 550 log likelihood -5.3165477163621375
Iteration 600 log likelihood -5.293316778869155
Iteration 650 log likelihood -5.2728169004581105
Iteration 700 log likelihood -5.239799393394577
Iteration 750 log likelihood -5.213782130592108
Iteration 800 log likelihood -5.274903712577299
Iteration 850 log likelihood -5.166354427652038
Iteration 900 log likelihood -5.1411319281978365
Iteration 950 log likelihood -5.368775500962386
Optimized in 3.0171921253204346 seconds.

print('autograd implementation optimized in {} seconds.'.format(base_time))
print('paragami implementation optimized in {} seconds.'.format(paragami_time))
autograd implementation optimized in 3.2963759899139404 seconds.
paragami implementation optimized in 3.0171921253204346 seconds.
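
Finally, as an extra check of my own (not part of the original comparison), we can fold the autograd optimum into the paragami layout and compare the flat parameter vectors. Given the nearly identical objective traces above, the two optima should agree up to floating-point noise.

# A loose tolerance to allow for floating-point differences between the two runs.
assert_arrays_almost_equal(
    nn_pattern.flatten(convert_params_to_paragami(optimized_params, nn_pattern)),
    optimized_params_flat, tol=1e-6)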