Lecture 5 live coding#
2025-02-18
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from nhanes.load import load_NHANES_data
Pandas reassignment#
If we’re chaining together operations, we need to be careful about which dataframe we’re modifying.
Select respondents who:
are adults AND
report being in
Excellent
or Very good
health
# Load the NHANES survey data.
nhanes_df = load_NHANES_data()

# Keep only adults (18 or older).
sel_df = nhanes_df[nhanes_df['AgeInYearsAtScreening'] >= 18]

# BUG FIX: the second filter must start from sel_df, not nhanes_df —
# filtering nhanes_df again silently discards the adult filter above.
# This is exactly the reassignment pitfall this section warns about.
sel_df = sel_df[sel_df['GeneralHealthCondition'].isin(['Excellent', 'Very good'])]
.loc operations can be in-place:
# Look at the distribution of the smoking variable before any modification.
smoking_counts = nhanes_df['SmokedAtLeast100CigarettesInLife'].value_counts()
print(smoking_counts)
# An in-place .loc assignment — running it would modify nhanes_df directly
# (no reassignment needed), zeroing the flag for 'Excellent'-health rows:
#nhanes_df.loc[nhanes_df['GeneralHealthCondition'] == 'Excellent', 'SmokedAtLeast100CigarettesInLife'] = 0
SmokedAtLeast100CigarettesInLife
0.0 3301
1.0 2232
Name: count, dtype: int64
Tip
Restart the notebook and run from top-to-bottom to double check any reassignment issues.
If the data are not too large, can re-run the cell to re-load the data.
If there is a pandas operation that modifies portions of the dataframe, watch the inplace parameter!
# Count the missing values in the smoking variable.
n_missing = nhanes_df['SmokedAtLeast100CigarettesInLife'].isna().sum()
print(n_missing)
# NOTE: both calls below return a MODIFIED COPY — nhanes_df itself is
# unchanged unless the result is reassigned (or inplace=True is passed):
#nhanes_df.fillna(0)
#nhanes_df['GeneralHealthCondition'].replace({'Poor?': 'Poor', 'Fair or': 'Fair'})
2833
Tip
To be safe, I generally recommend explicitly re-assigning the dataframe whenever you’re modifying portions of it.
Bootstrap#
Using a collected sample as a substitute for the population#
Bootstrap sampling “works” because we assume our collected sample is representative of the population.
Suppose we have a distribution of class colors:
Where \(p=\{\frac{1}{4}, \frac{1}{4}, \frac{1}{4}, \frac{1}{4}\}\) is the probability of each class:
Blue Lion
Green Griffin
Red Pegasus
Yellow Sphinx
A sample of adequate size should be representative of the population.
# Seeded generator so the live-coding results are reproducible.
rng = np.random.RandomState(42)

# Four equally likely "class colors" (p = 1/4 each).
class_colors = ['Blue Lion', 'Green Griffin', 'Red Pegasus', 'Yellow Sphinx']
class_probs = [0.25, 0.25, 0.25, 0.25]
n_samples = 500

# Draw a sample and inspect the empirical proportions:
# class_sample = rng.choice(class_colors, size=n_samples, p=class_probs)
# pd.Series(class_sample).value_counts(normalize=True)
Tip
As long as the proportions are about right, the sample is representative, regardless of the size!
Why we care about uncertainty \(\to\) confidence intervals#
# Simulated population: goose weights are Normal(10, 2) in pounds.
n_samples = 100
true_goose_weight = 10
# Draw one observed sample of n_samples weights from the true distribution.
goose_weights = rng.normal(true_goose_weight, 2, n_samples)
def mean_estimator(sample):
    """
    Estimate the population expected value E[X] with the sample average.

    Parameters
    ----------
    sample : array-like
        Observations drawn from the population.

    Returns
    -------
    float
        The arithmetic mean of the sample.
    """
    return np.asarray(sample).mean()
# Bootstrap the sampling distribution of the mean estimator, then read a
# 95% CI off the 2.5th and 97.5th percentiles (percentile bootstrap).
bootstrap_means = []  # one mean estimate per bootstrap resample
n_bootstraps = 5000   # number of bootstrap resamples to draw
# for i in range(n_bootstraps):
#     # 1. draw a bootstrap sample
#     bootstrap_sample = rng.choice(goose_weights, size=100, replace=True)
#     bootstrap_sample
#     # 2. compute our mean again, with the bootstrap sample
#     bootstrap_mean = mean_estimator(bootstrap_sample)
#     # 3. add our bootstrap_mean to the list
#     bootstrap_means.append(bootstrap_mean)
# lower_ci = np.percentile(bootstrap_means, 2.5)
# upper_ci = np.percentile(bootstrap_means, 97.5)
# print(f"Our estimate of average goose weight: {mean_estimator(goose_weights):.1f} lbs")
# print(f"95% Confidence Interval: [{lower_ci:.1f}, {upper_ci:.1f}] lbs")
# sns.histplot(bootstrap_means)
# plt.axvline(true_goose_weight, color='black', linestyle='--', label='True average goose weight')
# plt.axvline(lower_ci, color='red', linestyle='--')
# plt.axvline(upper_ci, color='red', linestyle='--', label='95% CI')
# plt.xlabel('Bootstrap mean estimates of goose weight (lbs)')
# plt.gca().spines['top'].set_visible(False)
# plt.gca().spines['right'].set_visible(False)
# plt.legend()
#plt.show()
#plt.savefig('../lectures/s25/slidev/assets/lec05_bootstrap_mean_ci.png', transparent=True, dpi=300)
# Coverage check: build a bootstrap 95% CI from each of n_intervals fresh
# samples and count how often the interval misses the true mean.
# If the CIs are well calibrated, roughly 5% of them should miss.
n_bootstraps = 1000  # bootstrap resamples per interval
n_intervals = 500    # number of independent samples / CIs to build
count_missed = 0     # how many CIs fail to cover true_goose_weight
# for j in range(n_intervals):
#     # First draw a new sample from the true distribution
#     goose_sample = rng.normal(loc=true_goose_weight, scale=2, size=100)
#     bootstrap_means = []
#     for i in range(n_bootstraps):
#         # Draw bootstrap samples from this sample
#         bootstrap_sample = rng.choice(goose_sample, size=100, replace=True)
#         bootstrap_mean = mean_estimator(bootstrap_sample)
#         bootstrap_means.append(bootstrap_mean)
#     # Compute CI for this sample
#     lower_ci = np.percentile(bootstrap_means, 2.5)
#     upper_ci = np.percentile(bootstrap_means, 97.5)
#     if true_goose_weight < lower_ci or true_goose_weight > upper_ci:
#         count_missed += 1
# print(f"Number of intervals that missed the true mean: {count_missed}")
# print(f"Proportion of intervals that missed the true mean: {count_missed / n_intervals:.3f}")
Why we care about uncertainty \(\to\) comparing estimators#
def first_ten_estimator(sample):
    """
    A deliberately poor estimator of the population expected value E[X].

    It ignores everything but the first 10 observations, so it is much
    noisier than the full-sample mean.

    Parameters
    ----------
    sample : array-like
        Observations drawn from the population.

    Returns
    -------
    float
        The mean of the first 10 elements of the sample.
    """
    leading = sample[:10]
    return np.mean(leading)
# Compare the bootstrap distributions of two estimators of the mean:
# the full-sample mean vs. the (noisier) first-ten mean.
bootstrap_means = []       # full-sample mean, one per resample
bootstrap_first_tens = []  # first-ten mean, one per resample
n_bootstraps = 5000        # number of bootstrap resamples
# for i in range(n_bootstraps):
#     # 1. draw a bootstrap sample
#     bootstrap_sample = rng.choice(goose_weights, size=100, replace=True)
#     bootstrap_sample
#     # 2. compute our estimates, with the bootstrap sample
#     bootstrap_mean = mean_estimator(bootstrap_sample)
#     bootstrap_first_ten = first_ten_estimator(bootstrap_sample)
#     # 3. add our bootstrap_mean to the list
#     bootstrap_means.append(bootstrap_mean)
#     bootstrap_first_tens.append(bootstrap_first_ten)
# sns.kdeplot(bootstrap_means)#, label="Mean of whole sample")
# sns.kdeplot(bootstrap_first_tens)#, label="Mean of first 10")
# plt.axvline(true_goose_weight, color='black', linestyle='--', label='True average goose weight')
# plt.xlabel('Bootstrap mean estimates of goose weight (lbs)')
# plt.gca().spines['top'].set_visible(False)
# plt.gca().spines['right'].set_visible(False)
# plt.legend()
#plt.show()