[complete] Lecture 6 live coding

# [complete] Lecture 6 live coding

2025-02-20

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets

# a large sample size keeps sampling noise in the estimates small
n_samples = 30_000
# fixed seed: every Restart & Run All draws the same random numbers
rng = np.random.RandomState(seed=42)

## Chain simulation

# Chain structure: T -> M -> Y

# T: binary, drawn from {0, 1}
T = rng.choice([0, 1], size=n_samples)

# M: binary, caused by T (T -> M).
# Gaussian noise centred on T, thresholded at 0.5 to get a 0/1 value.
M = (rng.normal(loc=T, scale=0.3) > 0.5).astype(int)

# Y: continuous, caused by M (M -> Y)
Y = rng.normal(loc=M, scale=0.3)

# gather the three variables as the columns of chain_df
chain_df = pd.DataFrame(dict(T=T, M=M, Y=Y))

chain_df.shape

(30000, 3)

# COMPLETED CELL
# generate a binary T
# T = rng.choice([0, 1], size=n_samples)

# # generate a binary M that is caused by T
# M = rng.normal(loc=T, scale=0.3)
# M = (M > 0.5).astype(int)

# # generate a continuous Y that is caused by M
# Y = rng.normal(loc=M, scale=0.3)

# chain_df = pd.DataFrame({'T': T, 'M': M, 'Y': Y})

Note

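Correlation (here the Pearson coefficient, which `DataFrame.corr()` computes by default) is a measure of linear association with a range of \([-1, 1]\). A correlation near 0 means little or no linear association; it does not by itself rule out a nonlinear relationship.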

# pairwise (Pearson) correlation matrix over all columns of chain_df
chain_df.corr(method='pearson')

	T	M	Y
T	1.000000	0.905467	0.776260
M	0.905467	1.000000	0.857164
Y	0.776260	0.857164	1.000000

# COMPLETED CELL
# compute the correlation between every pair of columns
# chain_df.corr()

Note

“Controlling for” a variable means that we’re conditioning on the variable – we’re holding the variable constant.

# Examine the correlation between Y and T while holding M fixed
# (this cell shows the M == 1 level).
# T ->  M -> Y
# T -|  M -> Y
# The mask is built from chain_df's own 'M' column rather than the loose
# global array M, so the selection cannot drift out of sync with the frame
# if a global is reassigned or cells are run out of order.
# M is constant inside the subset, so its row and column come out as NaN.
chain_df.loc[chain_df['M'] == 1].corr()

	T	M	Y
T	1.000000	NaN	-0.004686
M	NaN	NaN	NaN
Y	-0.004686	NaN	1.000000

# COMPLETED CELL
# sel_df = chain_df[M==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = chain_df[M==1].corr()
# print(sel_df.loc['Y', 'T'])

## Fork simulation

# Fork structure: T <- X -> Y

# X: binary, drawn from {0, 1}
X = rng.choice([0, 1], size=n_samples)

# T: binary, caused by X (T <- X).
# Gaussian noise centred on X, thresholded at 0.5 to get a 0/1 value.
T = (rng.normal(loc=X, scale=0.3) > 0.5).astype(int)

# Y: continuous, caused by X (Y <- X)
Y = rng.normal(loc=X, scale=0.3)

# gather the variables as columns of fork_df (column order T, X, Y)
fork_df = pd.DataFrame(dict(T=T, X=X, Y=Y))

# COMPLETED CELL
# # generate a binary X
# X = rng.choice([0, 1], size=n_samples)

# # generate a binary T that is caused by X
# T = rng.normal(loc=X, scale=0.3)
# T = (T > T.mean()).astype(int)

# # generate a continuous Y that is caused by X
# Y = rng.normal(loc=X, scale=0.3)

# fork_df = pd.DataFrame({'X': X, 'T': T, 'Y': Y})

# pairwise (Pearson) correlation matrix over all columns of fork_df
fork_df.corr(method='pearson')

	T	X	Y
T	1.000000	0.908865	0.777081
X	0.908865	1.000000	0.857706
Y	0.777081	0.857706	1.000000

# COMPLETED CELL
# compute the correlation between every pair of columns
# corr_df = fork_df.corr()
# corr_df.loc['Y', 'T']

# Correlation between T and Y while holding X fixed (this cell shows X == 1).
# The mask is built from fork_df's own 'X' column rather than the loose
# global array X, so the selection always matches the frame being analysed.
fork_df.loc[fork_df['X'] == 1].corr().loc['T', 'Y']

-0.012969108942544296

# COMPLETED CELL
# Examine the correlation between Y and T within each level of X
# sel_df = fork_df[X==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = fork_df[X==1].corr()
# print(sel_df.loc['Y', 'T'])

## Collider simulation

# Collider structure: T -> C <- Y

# T: binary, drawn from {0, 1}
T = rng.choice([0, 1], size=n_samples)

# Y: continuous noise centred on zero, generated without reference to T
Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)

# C: binary, caused by both T and Y.
# Gaussian noise centred on T + Y, then split at its own sample mean into 0/1.
C = rng.normal(loc=T + Y, scale=0.3)
C = np.greater(C, C.mean()).astype(int)

# gather the variables as columns of collider_df (column order T, Y, C)
collider_df = pd.DataFrame(dict(T=T, Y=Y, C=C))

# COMPLETED CELL
# # Generate a binary T
# T = rng.choice([0, 1], size=n_samples)

# # Generate a continuous Y
# Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)

# # Generate a binary C that is caused by T and Y
# C = rng.normal(loc=T + Y, scale=0.3)
# C = (C > C.mean()).astype(int)
# # add columns to collider_df
# collider_df = pd.DataFrame({'T': T, 'Y': Y, 'C': C})

# pairwise (Pearson) correlation matrix over all columns of collider_df
collider_df.corr(method='pearson')

	T	Y	C
T	1.000000	0.001689	0.762485
Y	0.001689	1.000000	0.278352
C	0.762485	0.278352	1.000000

# Check the overall correlation
# collider_df.corr().loc['Y', 'T']

# Check the correlation between Y and T while holding C fixed
# (this cell shows the C == 1 level).
# The mask is built from collider_df's own 'C' column rather than the loose
# global array C, so the selection always matches the frame being analysed.
collider_df.loc[collider_df['C'] == 1].corr().loc['T', 'Y']

-0.3470879206577148

# COMPLETED CELL
# Check the correlation between Y and T within each level of C
# sel_df = collider_df[C==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = collider_df[C==1].corr()
# print(sel_df.loc['Y', 'T'])