Homework 01: Numerical Python and Data Handling

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
import os.path
import subprocess
def wget_data(url):
    """Download ``url`` into ``./tmp_data`` by running the ``wget`` command.

    ``wget`` is invoked with ``-nc`` (its no-clobber option) and ``-P`` to
    select the target directory.

    Parameters
    ----------
    url : str
        Address of the file to download.

    Returns
    -------
    None
        The result of ``subprocess.run`` is discarded, so the exit status of
        ``wget`` is not checked here.
    """
    download_dir = './tmp_data'
    command = ["wget", "-nc", "-P", download_dir, url]
    subprocess.run(command)
def locate_data(name, check_exists=True):
    """Return the path of a data file inside ``./tmp_data``.

    Parameters
    ----------
    name : str
        File name relative to the ``./tmp_data`` directory.
    check_exists : bool
        When True (the default), verify that the file is present on disk.

    Returns
    -------
    str
        ``./tmp_data`` joined with ``name``.

    Raises
    ------
    RuntimeError
        If ``check_exists`` is True and no file exists at the resulting path.
    """
    local_path = './tmp_data'
    path = os.path.join(local_path, name)
    if check_exists and not os.path.exists(path):
        # Previously misspelled as ``RuxntimeError``, which surfaced as a
        # NameError instead of the intended error message.
        raise RuntimeError('No such data file: {}'.format(path))
    return path

Problem 1

Use np.einsum to evaluate the tensor expression \(g^{il} \Gamma^m_{ki} x^k\) which arises in contravariant derivatives in General Relativity. Note we are using the GR convention that repeated indices (i,k) are summed over, so the result carries the two free indices (l,m).

def tensor_expr(g, Gamma, x, D=4):
    """Evaluate the tensor expression g^{il} Gamma^m_{ki} x^k.

    The repeated indices i and k are summed over; l and m are the free
    indices of the result.

    Parameters
    ----------
    g : array
        Numpy array of shape (D, D), indexed as g[i, l].
    Gamma : array
        Numpy array of shape (D, D, D), indexed as Gamma[m, k, i].
    x : array
        Numpy array of shape (D,), indexed as x[k].
    D : int
        Dimension of input tensors.

    Returns
    -------
    array
        Numpy array of shape (D, D) whose element [l, m] is
        sum_{i,k} g[i, l] * Gamma[m, k, i] * x[k].
    """
    assert g.shape == (D, D)
    assert Gamma.shape == (D, D, D)
    assert x.shape == (D,)

    # Subscripts follow the index placement of the expression: the upper
    # index m of Gamma comes first, followed by its lower indices k, i.
    return np.einsum('il,mki,k->lm', g, Gamma, x)
# Self-check: a correct tensor_expr reproduces this reference result.
g = np.arange(16).reshape((4, 4))
Gamma = np.arange(64).reshape((4, 4, 4))
x = np.arange(4)
y = tensor_expr(g, Gamma, x)
assert np.array_equal(
    y,
    [
        [1680, 3984, 6288, 8592],
        [1940, 4628, 7316, 10004],
        [2200, 5272, 8344, 11416],
        [2460, 5916, 9372, 12828],
    ],
)

Problem 2

Use np.histogram to calculate the fraction of values in an arbitrary input data array that lie in each of the 10 intervals [0.0, 0.1), [0.1, 0.2), …, [0.9, 1.0), and divide each fraction by the width of its interval so that the result is a density that integrates to one. You can assume that all input values are in the range [0,1). This is a useful technique to estimate the probability density that the data was sampled from.

def estimate_probability_density(data, bins):
    """Estimate the probability density of arbitrary data.

    Parameters
    ----------
    data : array
        1D numpy array of random values.
    bins : array
        1D numpy array of N+1 bin edges to use. Must be increasing.

    Returns
    -------
    array
        1D numpy array of N probability densities, i.e. the fraction of
        values falling in each bin divided by that bin's width.
    """
    assert np.all(np.diff(bins) > 0)

    # density=True makes np.histogram normalize the counts so that the sum
    # of (density * bin width) over all bins equals one.
    rho, _ = np.histogram(data, bins=bins, density=True)
    return rho
# Self-check for estimate_probability_density using a fixed random seed.
generator = np.random.RandomState(seed=123)
data = generator.uniform(size=100)
bins = np.linspace(0.0, 1.0, num=11)
rho = estimate_probability_density(data, bins)
# Every bin is 0.1 wide, so the densities must integrate to one.
assert np.allclose(0.1 * rho.sum(), 1.0)
assert np.allclose(
    rho,
    [0.6, 0.8, 0.7, 1.7, 1.1, 1.3, 1.6, 0.9, 0.8, 0.5],
)

Problem 3

Define a function to calculate the entropy \(H(\rho)\) of a binned probability density, defined as: \[ H(\rho) \equiv -\sum_i \rho_i \log(\rho_i) \Delta w_i \; , \] where \(\rho_i\) is the binned density in bin \(i\) with width \(\Delta w_i\).

def binned_entropy(rho, bins):
    """Calculate the binned entropy -sum_i rho_i * log(rho_i) * dw_i.

    Parameters
    ----------
    rho : array
        1D numpy array of densities, e.g., calculated by the previous function.
    bins : array
        1D numpy array of N+1 bin edges to use. Must be increasing.

    Returns
    -------
    float
        Value of the binned entropy, using the natural logarithm.
    """
    assert np.all(np.diff(bins) > 0)

    rho = np.asarray(rho, dtype=float)
    widths = np.diff(bins)
    # Empty bins contribute zero (the limit of rho * log(rho) as rho -> 0);
    # skipping them avoids the NaN that 0 * log(0) would produce.
    occupied = rho > 0
    terms = rho[occupied] * np.log(rho[occupied]) * widths[occupied]
    return -np.sum(terms)
# Self-check for binned_entropy: compare a flat sample against a skewed one.
generator = np.random.RandomState(seed=123)
data1 = generator.uniform(size=10000)
data2 = generator.uniform(size=10000) ** 4
bins = np.linspace(0.0, 1.0, num=11)

# Uniform sample.
rho1 = estimate_probability_density(data1, bins)
H1 = binned_entropy(rho1, bins)

# Sample raised to the fourth power.
rho2 = estimate_probability_density(data2, bins)
H2 = binned_entropy(rho2, bins)

assert np.allclose(H1, -0.000801544)
assert np.allclose(H2, -0.699349908)

Problem 4

Define a function that reads pong_data.hf5 and returns a new subset DataFrame containing only the columns x5, y5, x7, y7 (in that order) and only the last 200 rows.

# Download pong_data.hf5 with wget_data() so the code below can read it.
wget_data('https://raw.githubusercontent.com/illinois-ipaml/MachineLearningForPhysics/main/data/pong_data.hf5')
def create_subset():
    """Read pong_data.hf5 and return a subset.

    Returns
    -------
    pandas.DataFrame
        New DataFrame holding only the columns x5, y5, x7, y7 (in that
        order) and only the last 200 rows of the file's table.
    """
    # NOTE(review): assumes the HDF5 file stores a single pandas object, so
    # read_hdf needs no explicit key -- confirm against the data file.
    full_table = pd.read_hdf(locate_data('pong_data.hf5'))
    selected_columns = ['x5', 'y5', 'x7', 'y7']
    return full_table[selected_columns].tail(200)
# Self-check for create_subset: column order, row count and column means.
subset = create_subset()
assert np.array_equal(subset.columns.values, ('x5', 'y5', 'x7', 'y7'))
assert subset.shape[0] == 200
summary = subset.describe()
assert np.allclose(
    summary.loc['mean'].values,
    [0.43564752, 0.30610958, 0.57520991, 0.21383226],
)

Acknowledgments

  • Initial version: Mark Neubauer

© Copyright 2024