Lecture 6 live coding
2025-02-20
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
# Keep the sample size large so that sampling noise in the estimates stays small.
n_samples = 30_000
# Fixed seed: every run of the notebook draws the same pseudo-random values.
rng = np.random.RandomState(seed=42)
Chain simulation
# generate a binary T
# generate a binary M that is caused by T
# generate a continuous Y that is caused by M
# add columns to chain_df
# COMPLETED CELL
# generate a binary T
# T = rng.choice([0, 1], size=n_samples)
# # generate a binary M that is caused by T
# M = rng.normal(loc=T, scale=0.3)
# M = (M > 0.5).astype(int)
# # generate a continuous Y that is caused by M
# Y = rng.normal(loc=M, scale=0.3)
# chain_df = pd.DataFrame({'T': T, 'M': M, 'Y': Y})
Note
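Correlation is a measure of linear association that ranges over \([-1, 1]\): values near \(\pm 1\) indicate a strong positive or negative linear relationship, and a value near 0 indicates little or no linear association (the variables may still be related nonlinearly).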
# compute the correlation between every pair of columns
# COMPLETED CELL
# compute the correlation between every pair of columns
# chain_df.corr()
Note
“Controlling for” a variable means conditioning on it: we hold that variable constant and look at how the other variables relate within each of its levels.
# Examine the correlation between Y and T within each level of M
# COMPLETED CELL
# sel_df = chain_df[M==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = chain_df[M==1].corr()
# print(sel_df.loc['Y', 'T'])
Fork simulation
# generate a binary X
# generate a binary T that is caused by X
# generate a continuous Y that is caused by X
# add columns to fork_df
# COMPLETED CELL
# # generate a binary X
# X = rng.choice([0, 1], size=n_samples)
# # generate a binary T that is caused by X
# T = rng.normal(loc=X, scale=0.3)
# T = (T > T.mean()).astype(int)
# # generate a continuous Y that is caused by X
# Y = rng.normal(loc=X, scale=0.3)
# fork_df = pd.DataFrame({'X': X, 'T': T, 'Y': Y})
# compute the correlation between every pair of columns
# COMPLETED CELL
# compute the correlation between every pair of columns
# corr_df = fork_df.corr()
# corr_df.loc['Y', 'T']
# COMPLETED CELL
# Examine the correlation between Y and T within each level of X
# sel_df = fork_df[X==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = fork_df[X==1].corr()
# print(sel_df.loc['Y', 'T'])
Collider simulation
# Generate a binary T
# Generate a continuous Y
# Generate a binary C that is caused by T and Y
# add columns to collider_df
# COMPLETED CELL
# # Generate a binary T
# T = rng.choice([0, 1], size=n_samples)
# # Generate a continuous Y
# Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)
# # Generate a binary C that is caused by T and Y
# C = rng.normal(loc=T + Y, scale=0.3)
# C = (C > C.mean()).astype(int)
# # add columns to collider_df
# collider_df = pd.DataFrame({'T': T, 'Y': Y, 'C': C})
# Check the overall correlation
# collider_df.corr().loc['Y', 'T']
# Check the correlation between Y and T within each level of C
# COMPLETED CELL
# Check the correlation between Y and T within each level of C
# sel_df = collider_df[C==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = collider_df[C==1].corr()
# print(sel_df.loc['Y', 'T'])