{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"(lec6_live_complete)=\n",
"# [complete] Lecture 6 live coding\n",
"\n",
"**2025-02-20**\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import ipywidgets as widgets"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Shared settings for every simulation below.\n",
"# A large sample keeps sampling noise in the correlations small.\n",
"n_samples = 30_000\n",
"# Seeded legacy generator: every stochastic cell draws from this one stream.\n",
"rng = np.random.RandomState(42)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Chain simulation"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Chain structure: T -> M -> Y\n",
"\n",
"# binary T\n",
"T = rng.choice([0, 1], size=n_samples)\n",
"\n",
"# binary M caused by T: Gaussian noise centred on T, thresholded at 0.5\n",
"M = (rng.normal(loc=T, scale=0.3) > 0.5).astype(int)\n",
"\n",
"# continuous Y caused by M: Gaussian noise centred on M\n",
"Y = rng.normal(loc=M, scale=0.3)\n",
"\n",
"# collect the three variables as the columns of chain_df\n",
"chain_df = pd.DataFrame(dict(T=T, M=M, Y=Y))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(30000, 3)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sanity check: (rows, columns) of the simulated chain data\n",
"chain_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# COMPLETED CELL\n",
"# generate a binary T\n",
"# T = rng.choice([0, 1], size=n_samples)\n",
"\n",
"# # generate a binary M that is caused by T\n",
"# M = rng.normal(loc=T, scale=0.3)\n",
"# M = (M > 0.5).astype(int)\n",
"\n",
"# # generate a continuous Y that is caused by M\n",
"# Y = rng.normal(loc=M, scale=0.3)\n",
"\n",
"# chain_df = pd.DataFrame({'T': T, 'M': M, 'Y': Y})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
":::{note}\n",
"\n",
"Correlation is a measure of linear association that has a range of $[-1, 1]$. A correlation near 0 means no linear association.\n",
"\n",
":::"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
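"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>T</th>\n",
"      <th>M</th>\n",
"      <th>Y</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>T</th>\n",
"      <td>1.000000</td>\n",
"      <td>0.901533</td>\n",
"      <td>0.772763</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>M</th>\n",
"      <td>0.901533</td>\n",
"      <td>1.000000</td>\n",
"      <td>0.858280</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>Y</th>\n",
"      <td>0.772763</td>\n",
"      <td>0.858280</td>\n",
"      <td>1.000000</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"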
],
"text/plain": [
" T M Y\n",
"T 1.000000 0.901533 0.772763\n",
"M 0.901533 1.000000 0.858280\n",
"Y 0.772763 0.858280 1.000000"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute the correlation between every pair of columns\n",
"# (DataFrame.corr uses the Pearson coefficient unless another method is requested)\n",
"chain_df.corr()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# COMPLETED CELL\n",
"# compute the correlation between every pair of columns\n",
"# chain_df.corr()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
":::{note}\n",
"\n",
"\"Controlling for\" a variable means that we're conditioning on the variable -- we're holding the variable constant.\n",
"\n",
":::"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
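"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>T</th>\n",
"      <th>M</th>\n",
"      <th>Y</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>T</th>\n",
"      <td>1.000000</td>\n",
"      <td>NaN</td>\n",
"      <td>0.000686</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>M</th>\n",
"      <td>NaN</td>\n",
"      <td>NaN</td>\n",
"      <td>NaN</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>Y</th>\n",
"      <td>0.000686</td>\n",
"      <td>NaN</td>\n",
"      <td>1.000000</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"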
],
"text/plain": [
" T M Y\n",
"T 1.000000 NaN 0.000686\n",
"M NaN NaN NaN\n",
"Y 0.000686 NaN 1.000000"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Examine the correlation between Y and T while holding M fixed (here: M == 1)\n",
"# T -> M -> Y   chain: T is linked to Y through M\n",
"# T -| M -> Y   conditioning on M blocks that path\n",
"# Rows are selected with the frame's own M column rather than the free-standing\n",
"# M array, so the mask cannot drift out of sync with chain_df if the global M\n",
"# is reassigned or cells are run out of order.\n",
"# The M row/column of the result is NaN because M is constant within the subset.\n",
"chain_df[chain_df['M'] == 1].corr()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# COMPLETED CELL\n",
"# sel_df = chain_df[M==0].corr()\n",
"# print(sel_df.loc['Y', 'T'])\n",
"# sel_df = chain_df[M==1].corr()\n",
"# print(sel_df.loc['Y', 'T'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fork simulation"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Fork structure: T <- X -> Y\n",
"\n",
"# binary X, the common cause\n",
"X = rng.choice([0, 1], size=n_samples)\n",
"\n",
"# binary T caused by X: Gaussian noise centred on X, thresholded at 0.5\n",
"T = (rng.normal(loc=X, scale=0.3) > 0.5).astype(int)\n",
"\n",
"# continuous Y caused by X: Gaussian noise centred on X\n",
"Y = rng.normal(loc=X, scale=0.3)\n",
"\n",
"# collect the three variables as the columns of fork_df\n",
"fork_df = pd.DataFrame(dict(T=T, X=X, Y=Y))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# COMPLETED CELL\n",
"# # generate a binary X\n",
"# X = rng.choice([0, 1], size=n_samples)\n",
"\n",
"# # generate a binary T that is caused by X\n",
"# T = rng.normal(loc=X, scale=0.3)\n",
"# T = (T > T.mean()).astype(int)\n",
"\n",
"# # generate a continuous Y that is caused by X\n",
"# Y = rng.normal(loc=X, scale=0.3)\n",
"\n",
"# fork_df = pd.DataFrame({'X': X, 'T': T, 'Y': Y})"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
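"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>T</th>\n",
"      <th>X</th>\n",
"      <th>Y</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>T</th>\n",
"      <td>1.00000</td>\n",
"      <td>0.904210</td>\n",
"      <td>0.775750</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>X</th>\n",
"      <td>0.90421</td>\n",
"      <td>1.000000</td>\n",
"      <td>0.857782</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>Y</th>\n",
"      <td>0.77575</td>\n",
"      <td>0.857782</td>\n",
"      <td>1.000000</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"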
],
"text/plain": [
" T X Y\n",
"T 1.00000 0.904210 0.775750\n",
"X 0.90421 1.000000 0.857782\n",
"Y 0.77575 0.857782 1.000000"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute the correlation between every pair of columns\n",
"# (DataFrame.corr uses the Pearson coefficient unless another method is requested)\n",
"fork_df.corr()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# COMPLETED CELL\n",
"# compute the correlation between every pair of columns\n",
"# corr_df = fork_df.corr()\n",
"# corr_df.loc['Y', 'T']"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.float64(0.002639686410825742)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Correlation between T and Y while holding X fixed (here: X == 1)\n",
"# T <- X -> Y   fork: X is the common cause of T and Y\n",
"# Rows are selected with the frame's own X column rather than the free-standing\n",
"# X array, so the mask cannot drift out of sync with fork_df if the global X\n",
"# is reassigned or cells are run out of order.\n",
"fork_df[fork_df['X'] == 1].corr().loc['T', 'Y']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# COMPLETED CELL\n",
"# Examine the correlation between Y and T within each level of X\n",
"# sel_df = fork_df[X==0].corr()\n",
"# print(sel_df.loc['Y', 'T'])\n",
"# sel_df = fork_df[X==1].corr()\n",
"# print(sel_df.loc['Y', 'T'])\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collider simulation\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Collider structure: T -> C <- Y\n",
"\n",
"# binary T\n",
"T = rng.choice([0, 1], size=n_samples)\n",
"\n",
"# continuous Y, generated without any reference to T\n",
"Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)\n",
"\n",
"# binary C caused by both T and Y: Gaussian noise centred on T + Y,\n",
"# then thresholded at its own sample mean\n",
"C = rng.normal(loc=T + Y, scale=0.3)\n",
"C = (C > C.mean()).astype(int)\n",
"\n",
"# collect the three variables as the columns of collider_df\n",
"collider_df = pd.DataFrame(dict(T=T, Y=Y, C=C))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# COMPLETED CELL\n",
"# # Generate a binary T\n",
"# T = rng.choice([0, 1], size=n_samples)\n",
"\n",
"# # Generate a continuous Y\n",
"# Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)\n",
"\n",
"# # Generate a binary C that is caused by T and Y\n",
"# C = rng.normal(loc=T + Y, scale=0.3)\n",
"# C = (C > C.mean()).astype(int)\n",
"# # add columns to collider_df\n",
"# collider_df = pd.DataFrame({'T': T, 'Y': Y, 'C': C})"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
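"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>T</th>\n",
"      <th>Y</th>\n",
"      <th>C</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>T</th>\n",
"      <td>1.000000</td>\n",
"      <td>-0.005946</td>\n",
"      <td>0.753668</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>Y</th>\n",
"      <td>-0.005946</td>\n",
"      <td>1.000000</td>\n",
"      <td>0.281741</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>C</th>\n",
"      <td>0.753668</td>\n",
"      <td>0.281741</td>\n",
"      <td>1.000000</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"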
],
"text/plain": [
" T Y C\n",
"T 1.000000 -0.005946 0.753668\n",
"Y -0.005946 1.000000 0.281741\n",
"C 0.753668 0.281741 1.000000"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# overall (unconditional) correlation between every pair of columns\n",
"collider_df.corr()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Check the overall correlation\n",
"# collider_df.corr().loc['Y', 'T']\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.float64(-0.34709013628348634)"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check the correlation between Y and T while holding C fixed (here: C == 1)\n",
"# T -> C <- Y   collider: C is caused by both T and Y\n",
"# Rows are selected with the frame's own C column rather than the free-standing\n",
"# C array, so the mask cannot drift out of sync with collider_df if the global C\n",
"# is reassigned or cells are run out of order.\n",
"collider_df[collider_df['C'] == 1].corr().loc['T', 'Y']"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# COMPLETED CELL\n",
"# Check the correlation between Y and T within each level of C\n",
"# sel_df = collider_df[C==0].corr()\n",
"# print(sel_df.loc['Y', 'T'])\n",
"# sel_df = collider_df[C==1].corr()\n",
"# print(sel_df.loc['Y', 'T'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}