[complete] Lecture 6 live coding#
2025-02-20
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
# 30,000 draws per simulation: a large sample keeps sampling noise small
n_samples = 30_000
# fixed seed so that every run reproduces the same random numbers
rng = np.random.RandomState(seed=42)
Chain simulation#
# Chain: T -> M -> Y
# T: binary, each of 0 and 1 equally likely
T = rng.choice([0, 1], size=n_samples)
# M: binary, caused by T (T -> M): Gaussian noise (sd 0.3) around T,
# set to 1 where the noisy value exceeds 0.5
M = (rng.normal(loc=T, scale=0.3) > 0.5).astype(int)
# Y: continuous, caused by M (M -> Y): Gaussian noise (sd 0.3) around M
Y = rng.normal(loc=M, scale=0.3)
# gather the three variables as the columns of chain_df
chain_df = pd.DataFrame(dict(T=T, M=M, Y=Y))
chain_df.shape
(30000, 3)
# COMPLETED CELL
# generate a binary T
# T = rng.choice([0, 1], size=n_samples)
# # generate a binary M that is caused by T
# M = rng.normal(loc=T, scale=0.3)
# M = (M > 0.5).astype(int)
# # generate a continuous Y that is caused by M
# Y = rng.normal(loc=M, scale=0.3)
# chain_df = pd.DataFrame({'T': T, 'M': M, 'Y': Y})
Note
Correlation measures the strength of the linear association between two variables and ranges over \([-1, 1]\). A correlation near 0 means there is no linear association, although a nonlinear relationship may still exist.
# pairwise Pearson correlation matrix over the columns T, M and Y
chain_df.corr(method="pearson")
| | T | M | Y |
---|---|---|---|
T | 1.000000 | 0.905467 | 0.776260 |
M | 0.905467 | 1.000000 | 0.857164 |
Y | 0.776260 | 0.857164 | 1.000000 |
# COMPLETED CELL
# compute the correlation between every pair of columns
# chain_df.corr()
Note
“Controlling for” a variable means conditioning on it: we hold the variable constant, for example by looking only at the rows where it takes one particular value.
# Examine the correlation between Y and T while holding M fixed.
#   chain as generated:   T -> M -> Y
#   conditioned on M:     T -| M -> Y
# Only the M == 1 rows are used here; repeat with M == 0 for the other level.
# M is constant within the selection, so every correlation involving M is NaN.
chain_df.loc[chain_df["M"] == 1].corr()
| | T | M | Y |
---|---|---|---|
T | 1.000000 | NaN | -0.004686 |
M | NaN | NaN | NaN |
Y | -0.004686 | NaN | 1.000000 |
# COMPLETED CELL
# sel_df = chain_df[M==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = chain_df[M==1].corr()
# print(sel_df.loc['Y', 'T'])
Fork simulation#
# Fork: T <- X -> Y
# X: binary, each of 0 and 1 equally likely
X = rng.choice([0, 1], size=n_samples)
# T: binary, caused by X (T <- X): Gaussian noise (sd 0.3) around X,
# set to 1 where the noisy value exceeds 0.5
T = (rng.normal(loc=X, scale=0.3) > 0.5).astype(int)
# Y: continuous, caused by X (Y <- X): Gaussian noise (sd 0.3) around X
Y = rng.normal(loc=X, scale=0.3)
# gather the three variables as the columns of fork_df
fork_df = pd.DataFrame(dict(T=T, X=X, Y=Y))
# COMPLETED CELL
# # generate a binary X
# X = rng.choice([0, 1], size=n_samples)
# # generate a binary T that is caused by X
# T = rng.normal(loc=X, scale=0.3)
# T = (T > T.mean()).astype(int)
# # generate a continuous Y that is caused by X
# Y = rng.normal(loc=X, scale=0.3)
# fork_df = pd.DataFrame({'X': X, 'T': T, 'Y': Y})
# pairwise Pearson correlation matrix over the columns T, X and Y
fork_df.corr(method="pearson")
| | T | X | Y |
---|---|---|---|
T | 1.000000 | 0.908865 | 0.777081 |
X | 0.908865 | 1.000000 | 0.857706 |
Y | 0.777081 | 0.857706 | 1.000000 |
# COMPLETED CELL
# compute the correlation between every pair of columns
# corr_df = fork_df.corr()
# corr_df.loc['Y', 'T']
# correlation between T and Y among the rows where X == 1 (conditioning on X)
fork_df.loc[fork_df["X"] == 1].corr().at["T", "Y"]
np.float64(-0.012969108942544294)
# COMPLETED CELL
# Examine the correlation between Y and T within each level of X
# sel_df = fork_df[X==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = fork_df[X==1].corr()
# print(sel_df.loc['Y', 'T'])
Collider simulation#
# Collider: T -> C <- Y
# T: binary, each of 0 and 1 equally likely
T = rng.choice([0, 1], size=n_samples)
# Y: continuous Gaussian noise centred on 0 (sd 0.3), drawn without reference to T
Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)
# C: caused by both T and Y: Gaussian noise (sd 0.3) around T + Y ...
C = rng.normal(loc=T + Y, scale=0.3)
# ... made binary: 1 where the value exceeds its own sample mean, else 0
C = np.greater(C, C.mean()).astype(int)
# gather the three variables as the columns of collider_df
collider_df = pd.DataFrame(dict(T=T, Y=Y, C=C))
# COMPLETED CELL
# # Generate a binary T
# T = rng.choice([0, 1], size=n_samples)
# # Generate a continuous Y
# Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)
# # Generate a binary C that is caused by T and Y
# C = rng.normal(loc=T + Y, scale=0.3)
# C = (C > C.mean()).astype(int)
# # add columns to collider_df
# collider_df = pd.DataFrame({'T': T, 'Y': Y, 'C': C})
# pairwise Pearson correlation matrix over the columns T, Y and C
collider_df.corr(method="pearson")
| | T | Y | C |
---|---|---|---|
T | 1.000000 | 0.001689 | 0.762485 |
Y | 0.001689 | 1.000000 | 0.278352 |
C | 0.762485 | 0.278352 | 1.000000 |
# Overall (unconditional) correlation between Y and T, for comparison:
#   collider_df.corr().loc['Y', 'T']
# Correlation between Y and T among the rows where C == 1, i.e. after
# conditioning on the collider. Repeat with C == 0 for the other level.
collider_df.loc[collider_df["C"] == 1].corr().at["T", "Y"]
np.float64(-0.3470879206577149)
# COMPLETED CELL
# Check the correlation between Y and T within each level of C
# sel_df = collider_df[C==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = collider_df[C==1].corr()
# print(sel_df.loc['Y', 'T'])