Homework 08: Cross Validation
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
import os.path
import subprocess
from sklearn import model_selection, ensemble
Helpers for Getting, Loading and Locating Data
def wget_data(url: str) -> None:
    """Download a file with wget into ./tmp_data, echoing wget's messages.

    Runs ``wget -nc -P ./tmp_data <url>`` as a subprocess and prints every
    non-empty line that wget writes to stderr.

    The stderr stream is read until end-of-file so that messages written just
    before wget exits are not lost, and the pipe is closed (and the process
    waited for) when the ``with`` block ends.

    Parameters
    ----------
    url : str
        URL of the file to download.

    Notes
    -----
    The exit status of wget is not checked; a failed download is only visible
    through the printed messages.
    """
    local_path = './tmp_data'
    command = ["wget", "-nc", "-P", local_path, url]
    with subprocess.Popen(command, stderr=subprocess.PIPE, encoding='UTF-8') as proc:
        # Iterating the pipe yields lines until EOF, i.e. until wget has
        # closed stderr, so no trailing output is dropped.
        for raw_line in proc.stderr:
            line = raw_line.strip('\n')
            if line:
                print(line)
def locate_data(name, check_exists=True):
    """Return the path of a named file inside the local data directory.

    Parameters
    ----------
    name : str
        File name, joined onto the './tmp_data' directory.
    check_exists : bool
        When True (the default), verify that the resulting path exists.

    Returns
    -------
    str
        Path built from './tmp_data' and name.

    Raises
    ------
    RuntimeError
        If check_exists is True and nothing exists at the resulting path.
    """
    local_path = './tmp_data'
    path = os.path.join(local_path, name)
    if check_exists and not os.path.exists(path):
        raise RuntimeError(f'No such data file: {path}')
    return path
Problem 1
The default score function used by sklearn to evaluate how well a regression model predicts data is the coefficient of determination \(R^2\). Implement the function below to calculate \(R^2\):
def calculate_R2(y_data, y_pred):
    """Calculate the coefficient of determination R2 for two arrays.

    This is an exercise stub: after checking that the two inputs have the
    same shape it raises NotImplementedError until a solution replaces the
    placeholder.

    Parameters
    ----------
    y_data : array
        Array of data values, must have the same shape as y_pred.
    y_pred : array
        Array of predicted values, must have the same shape as y_data.

    Returns
    -------
    float
        Calculated coefficient of determination R2.

    Raises
    ------
    AssertionError
        If y_data and y_pred do not have the same shape.
    NotImplementedError
        Always, as long as the placeholder below has not been replaced.
    """
    # Both inputs must expose a ``shape`` attribute (e.g. numpy arrays).
    assert y_data.shape == y_pred.shape
    # YOUR CODE HERE
    raise NotImplementedError()
# A correct solution should pass the tests below.
# Seeded generator so the random draws, and hence the expected scores, are reproducible.
gen = np.random.RandomState(seed=123)
N = 100
x = gen.uniform(size=N)
# Noise-free linear predictions, and data scattered around them with
# Gaussian noise of scale 0.1.
y_pred = 2 * x - 1
y_data = y_pred + gen.normal(scale=0.1, size=N)
# Predictions that follow the data closely score just below 1.
assert np.round(calculate_R2(y_data, y_pred), 3) == 0.961
# Sign-flipped predictions score negative.
assert np.round(calculate_R2(y_data, -y_pred), 3) == -2.935
# Identical arrays give a score of 1.
assert np.round(calculate_R2(y_pred, y_pred), 3) == 1.000
assert np.round(calculate_R2(y_pred, -y_pred), 3) == -3.000
# A constant prediction equal to the mean of y_pred scores 0 to three decimals.
assert np.round(calculate_R2(y_data, np.full(N, np.mean(y_pred))), 3) == 0.000
Problem 2
Implement the function below to perform a grid-search cross validation of a random forest regression over the following grid:
- min_samples_leaf = 1, 10, 20
- n_estimators = 5, 10, 15
Hint: you will need to ensure reproducible “random” behavior in order to pass all the tests.
def cv_summary(cv):
    """Summarize the results from a GridSearchCV fit.

    Summarize a cross-validation grid search in a pandas DataFrame with the
    following transformations of the full results:

    - Remove the 'params' and 'rank_test_score' columns (the rank becomes
      the index, and the table is sorted by it).
    - Remove all columns with timing measurements ('*_time').
    - Remove the 'param_' prefix from hyperparameter column names.
    - Remove the '_score' suffix from score column names.
    - Round scores to 3 decimal places.

    Hyperparameter columns ('param_*') only lose their prefix: a parameter
    whose own name ends in '_score' or '_time' (e.g. 'oob_score') is kept
    and keeps its full name.

    If the parameter grid is 1D, then this function also plots the test
    R2 scores and, when the results contain them, the training R2 scores
    versus the parameter.

    Parameters
    ----------
    cv : sklearn.model_selection.GridSearchCV
        Instance of a GridSearchCV object that has been fit to some data.
        Only its ``cv_results_`` attribute is used.

    Returns
    -------
    pandas.DataFrame
        Summary table of cross-validation results.
    """
    results = cv.cv_results_
    # Look up the list of parameters used in the grid.
    param_names = list(results['params'][0].keys())

    def is_timing(column):
        # Timing columns end in '_time'; hyperparameter columns never count.
        return column.endswith('_time') and not column.startswith('param_')

    def short_name(column):
        # Strip exactly one decoration so parameter names stay intact.
        if column.startswith('param_'):
            return column[len('param_'):]
        if column.endswith('_score'):
            return column[:-len('_score')]
        return column

    # Index results by the test score rank.
    df = pd.DataFrame(results, index=results['rank_test_score'])
    df = df.drop(columns=['params', 'rank_test_score'])
    # Remove columns that measure running time.
    df = df.drop(columns=[name for name in df.columns if is_timing(name)])
    # Shorten the remaining column names.
    df = df.rename(columns=short_name)

    if len(param_names) == 1:
        # Plot the scores vs the grid parameter when there is only one.
        grid_values = df[param_names[0]]
        # Train scores are only present when the search was run with
        # return_train_score=True.
        if 'mean_train' in df.columns:
            plt.plot(grid_values, df['mean_train'], 'o:', label='train')
        plt.plot(grid_values, df['mean_test'], 'o-', label='test')
        plt.legend(fontsize='x-large')
        plt.xlabel('Hyperparameter value')
        plt.ylabel('Score $R^2$')
        # Clip very negative test scores so the plot stays readable.
        plt.ylim(max(-2, np.min(df['mean_test'])), 1)
    return df.sort_index().round(3)
def cross_validate(X, Y, gen):
    """Perform cross validation of a random forest regression.

    This is an exercise stub: after checking that X and Y have the same
    length it raises NotImplementedError until a solution replaces the
    placeholder.

    Parameters
    ----------
    X : array
        Array with shape (N, DX) of N samples with DX features.
    Y : array
        Array with shape (N, DY) of N samples with DY features.
    gen : np.random.RandomState
        Random state to use for reproducible random numbers.

    Returns
    -------
    pandas.DataFrame
        Cross-validation summary table produced by cv_summary().

    Raises
    ------
    AssertionError
        If X and Y do not contain the same number of samples.
    NotImplementedError
        Always, as long as the placeholder below has not been replaced.
    """
    # X and Y must be paired sample-by-sample.
    assert len(X) == len(Y)
    # YOUR CODE HERE
    raise NotImplementedError()
# Fetch the two HDF5 files used in this problem into the local data directory.
wget_data('https://raw.githubusercontent.com/illinois-ipaml/MachineLearningForPhysics/main/data/spectra_data.hf5')
wget_data('https://raw.githubusercontent.com/illinois-ipaml/MachineLearningForPhysics/main/data/spectra_targets.hf5')
# Load features and targets as plain arrays; locate_data() raises if a download is missing.
X = pd.read_hdf(locate_data('spectra_data.hf5')).values
Y = pd.read_hdf(locate_data('spectra_targets.hf5')).values
# A correct solution should pass the tests below.
# Fixed seed: the expected scores below are only reproducible with this random state.
gen = np.random.RandomState(seed=123)
cvs = cross_validate(X, Y, gen)
# Expected layout: both grid parameters, then test and train scores for two
# splits (split0, split1) with their mean and standard deviation.
assert np.all(cvs.columns.values == [
    'min_samples_leaf', 'n_estimators', 'split0_test', 'split1_test',
    'mean_test', 'std_test', 'split0_train', 'split1_train', 'mean_train',
    'std_train'])
# Rows are ordered by test-score rank (best first), as produced by cv_summary().
assert np.all(cvs['min_samples_leaf'].values == [1, 1, 1, 10, 10, 10, 20, 20, 20])
assert np.all(cvs['n_estimators'].values == [15, 10, 5, 15, 10, 5, 15, 10, 5])
# Mean scores must match the reference values to within 1e-3.
assert np.allclose(
    cvs['mean_test'].values,
    [0.961, 0.955, 0.942, 0.896, 0.891, 0.879, 0.496, 0.490, 0.480], atol=1e-3)
assert np.allclose(
    cvs['mean_train'].values,
    [0.993, 0.992, 0.990, 0.909, 0.908, 0.905, 0.512, 0.515, 0.507], atol=1e-3)
Acknowledgments
Initial version: Mark Neubauer
© Copyright 2024