Lecture 6 live coding

Lecture 6 live coding

2025-02-20

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets

# Large sample size keeps sampling noise in the simulated statistics low.
n_samples = 30_000

# Seeded legacy NumPy generator, so every run draws the same random stream.
rng = np.random.RandomState(seed=42)

Chain simulation

# generate a binary T

# generate a binary M that is caused by T

# generate a continuous Y that is caused by M

# add columns to chain_df

# COMPLETED CELL
# generate a binary T
# T = rng.choice([0, 1], size=n_samples)

# # generate a binary M that is caused by T
# M = rng.normal(loc=T, scale=0.3)
# M = (M > 0.5).astype(int)

# # generate a continuous Y that is caused by M
# Y = rng.normal(loc=M, scale=0.3)

# chain_df = pd.DataFrame({'T': T, 'M': M, 'Y': Y})

Note

Correlation is a measure of linear association that has a range of \([-1, 1]\). A correlation near 0 means no linear association.

# compute the correlation between every pair of columns

# COMPLETED CELL
# compute the correlation between every pair of columns
# chain_df.corr()

Note

“Controlling for” a variable means that we’re conditioning on the variable – we’re holding the variable constant.

# Examine the correlation between Y and T within each level of M

# COMPLETED CELL
# sel_df = chain_df[M==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = chain_df[M==1].corr()
# print(sel_df.loc['Y', 'T'])

Fork simulation

# generate a binary X

# generate a binary T that is caused by X

# generate a continuous Y that is caused by X

# add columns to fork_df

# COMPLETED CELL
# # generate a binary X
# X = rng.choice([0, 1], size=n_samples)

# # generate a binary T that is caused by X
# T = rng.normal(loc=X, scale=0.3)
# T = (T > T.mean()).astype(int)

# # generate a continuous Y that is caused by X
# Y = rng.normal(loc=X, scale=0.3)

# fork_df = pd.DataFrame({'X': X, 'T': T, 'Y': Y})

# compute the correlation between every pair of columns

# COMPLETED CELL
# compute the correlation between every pair of columns
# corr_df = fork_df.corr()
# corr_df.loc['Y', 'T']

# COMPLETED CELL
# Examine the correlation between Y and T within each level of X
# sel_df = fork_df[X==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = fork_df[X==1].corr()
# print(sel_df.loc['Y', 'T'])

Collider simulation

# Generate a binary T

# Generate a continuous Y

# Generate a binary C that is caused by T and Y

# add columns to collider_df

# COMPLETED CELL
# # Generate a binary T
# T = rng.choice([0, 1], size=n_samples)

# # Generate a continuous Y
# Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)

# # Generate a binary C that is caused by T and Y
# C = rng.normal(loc=T + Y, scale=0.3)
# C = (C > C.mean()).astype(int)
# # add columns to collider_df
# collider_df = pd.DataFrame({'T': T, 'Y': Y, 'C': C})

# Check the overall correlation
# collider_df.corr().loc['Y', 'T']

# Check the correlation between Y and T within each level of C

# COMPLETED CELL
# Check the correlation between Y and T within each level of C
# sel_df = collider_df[C==0].corr()
# print(sel_df.loc['Y', 'T'])
# sel_df = collider_df[C==1].corr()
# print(sel_df.loc['Y', 'T'])