Homework 09: Artificial Neural Networks

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
import os.path
import subprocess
import autograd.numpy as anp
import autograd

Helpers for Getting, Loading and Locating Data

def wget_data(url: str):
    local_path = './tmp_data'
    p = subprocess.Popen(["wget", "-nc", "-P", local_path, url], stderr=subprocess.PIPE, encoding='UTF-8')
    rc = None
    while rc is None:
      line = p.stderr.readline().strip('\n')
      if len(line) > 0:
        print(line)
      rc = p.poll()

def locate_data(name, check_exists=True):
    local_path='./tmp_data'
    path = os.path.join(local_path, name)
    if check_exists and not os.path.exists(path):
        raise RuntimeError('No such data file: {}'.format(path))
    return path

Problem 1

In class, we hand-tuned a network to assign 2D points to the correct cluster, then repeated the exercise using tensorflow to automatically learn network parameters from the training data. In this problem, you will construct and optimize the same neural network by hand (without tensorflow) to get a deeper understanding of what is involved (and why tensorflow saves a lot of work).

First, load the training data:

wget_data('https://raw.githubusercontent.com/illinois-ipaml/MachineLearningForPhysics/main/data/circles_data.hf5')
wget_data('https://raw.githubusercontent.com/illinois-ipaml/MachineLearningForPhysics/main/data/circles_targets.hf5')
X = pd.read_hdf(locate_data('circles_data.hf5')).values
y = pd.read_hdf(locate_data('circles_targets.hf5')).values.astype(float)
def plot_circles():
    cmap = sns.color_palette('colorblind', 2)
    colors = [cmap[int(c)] for c in y]
    plt.scatter(X[:, 0], X[:, 1], c=colors)
    plt.gca().set_aspect(1)
    
plot_circles()

Next, implement the function below to evaluate the network prediction given its parameter values and a 2D input point \((x_1, x_2)\). Recall that the network consists of:

  • 2 input nodes

  • 4 hidden nodes with sigmoid activation

  • 1 output node with sigmoid activation

The corresponding mathematical function is:

\[ \Large \begin{align} y^{out}(\mathbf{x},\Theta) &= \phi\left([u_1, u_2, u_3, u_4]\cdot \mathbf{G}(x_1, x_2) + a \right) \\ \mathbf{G}(x_1, x_2) &= \phi\left( \begin{bmatrix} v_{11} & v_{12} \\ v_{21} & v_{22} \\ v_{31} & v_{32} \\ v_{41} & v_{42} \end{bmatrix} \cdot \begin{bmatrix}x_1 \\ x_2\end{bmatrix} + \begin{bmatrix}b_1 \\ b_2 \\ b_3 \\ b_4\end{bmatrix} \right) \\ \phi(s) &= \frac{1}{1 + e^{-s}} \end{align} \]

In the function, we pack all the parameters into a single array to simplify the later steps.
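
Here is a minimal sketch of two helpers you might find convenient, written with autograd.numpy (imported above as anp) so they remain compatible with automatic differentiation later. The packing order follows the docstring below; treat this as one possible starting point, not a required structure:

def sigmoid(s):
    # Logistic activation phi(s) = 1 / (1 + exp(-s)).
    return 1.0 / (1.0 + anp.exp(-s))

def unpack(params):
    # Split the length-17 parameter vector following the order
    # u1..u4, a, v11, v12, ..., v41, v42, b1..b4.
    u = params[0:4]
    a = params[4]
    V = anp.reshape(params[5:13], (4, 2))
    b = params[13:17]
    return u, a, V, b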

def network(x, params):
    """Evaluate the network output at (x1,x2) with the specified parameters.
    
    Parameters
    ----------
    x : array
        Array of length 2 giving the 2D coordinates of a point to classify.
    params : array
        Array of length 17 containing values for the parameters: u1, u2, u3, u4, a,
        v11, v12, v21, v22, v31, v32, v41, v42, b1, b2, b3, b4.
        
    Returns
    -------
    float
        Value of the network's single output node.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
# A correct solution should pass the tests below.
initial = np.array([
    5,5,5,5,              # ui
    -18,                  # a
    8,0, -8,0, 0,8, 0,-8, # vij
    10,10,10,10           # bi
], dtype=float)
assert np.round(network([0, 0], initial), 3) == 0.881
assert np.round(network([0, 1], initial), 3) == 0.803
assert np.round(network([1, 0], initial), 3) == 0.803
assert np.round(network([1, -1], initial), 3) == 0.692
assert np.round(network([-1, 1], initial), 3) == 0.692

We can learn the parameters from the data by optimizing the cross-entropy loss function:

\[ \Large E(\Theta, \mathbf{X}^{train}, \mathbf{y}^{train}) = -\sum_{i=1}^N\, \left[ y^{train}_i \log y^{out}(\mathbf{X}^{train}_i, \Theta) + (1 - y^{train}_i) \log (1 - y^{out}(\mathbf{X}^{train}_i, \Theta)) \right] \]

(Refer to the Neural Network lecture notes for details on why this is a good loss function).
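
For reference, the sum above translates almost directly into code. This is a minimal sketch, assuming y_out is an array of network outputs evaluated at each training point (the helper name is illustrative, not required):

def cross_entropy(y_out, y_train):
    # E = -sum_i [ y_i log(y_out_i) + (1 - y_i) log(1 - y_out_i) ]
    return -anp.sum(y_train * anp.log(y_out) + (1 - y_train) * anp.log(1 - y_out))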

Implement the function below to evaluate this loss:

def loss(params, X_train, y_train):
    """Evaluate the cross-entropy loss.
    
    Parameters
    ----------
    params : array
        Array of length 17 containing values for the parameters: u1, u2, u3, u4, a,
        v11, v12, v21, v22, v31, v32, v41, v42, b1, b2, b3, b4.
    X_train : array
        Array of shape (N, 2) with training values.
    y_train : array
        Array of N target values which are each either 0. or 1.
        
    Returns
    -------
    float
        The cross-entropy loss value.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
# A correct solution should pass the tests below.
assert np.round(loss(initial, np.array([[0.,0.]]), np.array([0.])), 3) == 2.126
assert np.round(loss(initial, np.array([[0.,0.]]), np.array([1.])), 3) == 0.127
assert np.round(loss(initial, np.array([[2.,1.]]), np.array([0.])), 3) == 0.027
assert np.round(loss(initial, np.array([[2.,1.]]), np.array([1.])), 3) == 3.611
assert np.round(loss(initial, X, y), 1) == 41.4

We can check if the initial parameter values are at a local minimum of the loss by running 1D scans along each parameter axis. Scans are also useful for rough numerical estimates of the partial derivatives

\[ \Large \frac{\partial}{\partial \Theta_k} E(\Theta, \mathbf{X}^{train}, \mathbf{y}^{train}) \; . \]
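
Independently of the scan plots, a single partial derivative can be estimated with a central difference once loss is implemented. This is a minimal sketch you can use as a cross-check (the helper name and step size h are illustrative):

def numeric_partial(params, k, X_train, y_train, h=1e-3):
    # Central-difference estimate of dE/dTheta_k at the given parameter values.
    up, down = params.copy(), params.copy()
    up[k] += h
    down[k] -= h
    return (loss(up, X_train, y_train) - loss(down, X_train, y_train)) / (2 * h)
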
def plot_scans(params, X_train, y_train, step_size=0.5, n=3):
    names = 'u1,u2,u3,u4,a,v11,v12,v21,v22,v31,v32,v41,v42,b1,b2,b3,b4'.split(',')
    dp_grid = np.linspace(-step_size, +step_size, 2 * n + 1)
    L = np.empty_like(dp_grid)
    for i, p0 in enumerate(params):
        pscan = params.copy()
        for j, dp in enumerate(dp_grid):
            pscan[i] = p0 + dp
            L[j] = loss(pscan, X_train, y_train)
        plt.plot(dp_grid, L, label=names[i])
        print('numerical grad[{}] = {:.3f}'.format(names[i], np.gradient(L, dp_grid)[n]))
    plt.legend(ncol=5)
    plt.ylim(35., 50.)
    plt.xlabel('Change in parameter value')
    plt.ylabel('Cross-entropy loss')

plot_scans(initial, X, y)

Finally, implement the function below to calculate the partial derivatives of the loss function,

\[ \Large \frac{\partial}{\partial \Theta_k} E(\Theta, \mathbf{X}^{train}, \mathbf{y}^{train}) \; . \]

These should be calculated to high accuracy to allow stable optimization, so use either the autograd package for automatic differentiation (refer to the Optimization lecture) or derivative formulas that you calculate by hand.
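
If you choose autograd, the gradient function can be generated directly from your loss implementation, provided loss is written entirely with autograd.numpy (anp) operations. A minimal sketch:

# autograd.grad differentiates loss with respect to its first argument (params),
# so the resulting function has the same call signature as loss_gradient below.
auto_gradient = autograd.grad(loss)
# e.g. auto_gradient(initial, X, y) would return the length-17 gradient array.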

def loss_gradient(params, X_train, y_train):
    """Calculate the partial derivatives of the loss function.
    
    Parameters
    ----------
    params : array
        Array of length 17 containing values for the parameters: u1, u2, u3, u4, a,
        v11, v12, v21, v22, v31, v32, v41, v42, b1, b2, b3, b4.
    X_train : array
        Array of shape (N, 2) with training values.
    y_train : array
        Array of N target values which are each either 0. or 1.
        
    Returns
    -------
    array
        Array of length 17 containing the partial derivatives of the loss function
        with respect to each of the parameters.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
# A correct solution should pass the tests below.
assert np.allclose(
    np.round(loss_gradient(initial, np.array([[0.,0.]]), np.array([0.])), 3),
    [ 0.881, 0.881, 0.881, 0.881, 0.881, 0., 0., 0.,
      0., 0., 0., 0., 0., 0., 0., 0., 0.], atol=1e-3)
assert np.allclose(
    np.round(loss_gradient(initial, np.array([[0.,0.]]), np.array([1.])), 3),
    [ -0.119, -0.119, -0.119, -0.119, -0.119, 0., 0.,
      0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], atol=1e-3)
assert np.allclose(
    np.round(loss_gradient(initial, np.array([[2.,1.]]), np.array([0.])), 3),
    [ 0.027, 0., 0.027, 0.024, 0.027, 0., 0., 0.001,
      0., 0., 0., 0.028, 0.014, 0., 0., 0., 0.014], atol=1e-3)
assert np.allclose(
    np.round(loss_gradient(initial, np.array([[2.,1.]]), np.array([1.])), 3),
    [ -0.973, -0.002, -0.973, -0.857, -0.973, -0., -0., -0.024,
      -0.012, 0., 0., -1.022, -0.511, 0., -0.012, 0., -0.511], atol=1e-3)
assert np.allclose(
    np.round(loss_gradient(initial, X, y), 1),
    [ -24.1, -24.1, -24.1, -24.1, -21.9, -0.5, 0.0, 0.5, 0.0,
      0.0, -0.5, 0.0, 0.4, 0.3, 0.3, 0.3, 0.3 ], atol=1e-1)

You have now implemented all of the pieces necessary to optimize the 17 network parameters using the gradient descent update rule,

\[ \Large \Theta \rightarrow \Theta - \eta \nabla E(\Theta) \; , \]

where \(\eta\) is the learning rate. The large values of some of the derivatives we found above indicate that \(\eta\) will need to be small in order for the optimization to converge.

def optimize(initial, X_train, y_train, eta=0.001, n_steps=20):
    loss_history = [loss(initial, X_train, y_train)]
    params = initial.copy()
    for i in range(n_steps):
        params -= eta * loss_gradient(params, X_train, y_train)
        loss_history.append(loss(params, X_train, y_train))
        print('step {:02d}: loss={:.1f}'.format(i + 1, loss_history[-1]))
    plt.plot(loss_history, 'o-')
    plt.xlabel('Optimization step')
    plt.ylabel('Cross-entropy loss')

optimize(initial, X, y)