{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "(lec6_live_complete)=\n", "# [complete] Lecture 6 live coding\n", "\n", "**2025-02-20**\n", "\n", "---" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import ipywidgets as widgets" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# high number of samples to reduce sampling noise\n", "n_samples = 30_000\n", "\n", "# fixed seed: a fresh Restart & Run All draws the same data every time\n", "rng = np.random.RandomState(42)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Chain simulation" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# T: binary variable, 0 or 1 drawn with equal probability\n", "T = rng.choice([0, 1], size=n_samples)\n", "\n", "# T -> M: binary M is T plus Gaussian noise, thresholded at 0.5\n", "M = (rng.normal(loc=T, scale=0.3) > 0.5).astype(int)\n", "\n", "# M -> Y: continuous Y is centred on M\n", "Y = rng.normal(loc=M, scale=0.3)\n", "\n", "# collect the three variables into chain_df\n", "chain_df = pd.DataFrame({'T': T, 'M': M, 'Y': Y})" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(30000, 3)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chain_df.shape" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# COMPLETED CELL\n", "# generate a binary T\n", "# T = rng.choice([0, 1], size=n_samples)\n", "\n", "# # generate a binary M that is caused by T\n", "# M = rng.normal(loc=T, scale=0.3)\n", "# M = (M > 0.5).astype(int)\n", "\n", "# # generate a continuous Y that is caused by M\n", "# Y = rng.normal(loc=M, scale=0.3)\n", "\n", "# chain_df = pd.DataFrame({'T': T, 'M': M, 'Y': Y})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [
":::{note}\n", "\n", "Correlation is a measure of linear association that has a range of $[-1, 1]$. A correlation near 0 means no linear association.\n", "\n", ":::" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TMY
T1.0000000.9015330.772763
M0.9015331.0000000.858280
Y0.7727630.8582801.000000
\n", "
" ], "text/plain": [ " T M Y\n", "T 1.000000 0.901533 0.772763\n", "M 0.901533 1.000000 0.858280\n", "Y 0.772763 0.858280 1.000000" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# compute the correlation between every pair of columns\n", "chain_df.corr()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# COMPLETED CELL\n", "# compute the correlation between every pair of columns\n", "# chain_df.corr()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ ":::{note}\n", "\n", "\"Controlling for\" a variable means conditioning on it: we hold the variable constant by looking only at observations that share the same value of it.\n", "\n", ":::" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TMY
T1.000000NaN0.000686
MNaNNaNNaN
Y0.000686NaN1.000000
\n", "
" ], "text/plain": [ " T M Y\n", "T 1.000000 NaN 0.000686\n", "M NaN NaN NaN\n", "Y 0.000686 NaN 1.000000" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# T -> M -> Y: condition on the mediator M by keeping only the rows with M == 1\n", "# (use 0 instead to inspect the other level of M).\n", "# The mask is read from chain_df itself rather than the loose array M, so this\n", "# cell depends only on chain_df.\n", "# M is constant inside the slice, so every correlation involving M is NaN.\n", "chain_df[chain_df['M'] == 1].corr()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# COMPLETED CELL\n", "# sel_df = chain_df[M==0].corr()\n", "# print(sel_df.loc['Y', 'T'])\n", "# sel_df = chain_df[M==1].corr()\n", "# print(sel_df.loc['Y', 'T'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fork simulation" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# X: binary common cause, 0 or 1 drawn with equal probability\n", "X = rng.choice([0, 1], size=n_samples)\n", "\n", "# T <- X: binary T is X plus Gaussian noise, thresholded at 0.5\n", "T = (rng.normal(loc=X, scale=0.3) > 0.5).astype(int)\n", "\n", "# Y <- X: continuous Y is centred on X\n", "Y = rng.normal(loc=X, scale=0.3)\n", "\n", "# collect the three variables into fork_df\n", "fork_df = pd.DataFrame({'T': T, 'X': X, 'Y': Y})" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# COMPLETED CELL\n", "# # generate a binary X\n", "# X = rng.choice([0, 1], size=n_samples)\n", "\n", "# # generate a binary T that is caused by X\n", "# T = rng.normal(loc=X, scale=0.3)\n", "# T = (T > T.mean()).astype(int)\n", "\n", "# # generate a continuous Y that is caused by X\n", "# Y = rng.normal(loc=X, scale=0.3)\n", "\n", "# fork_df = pd.DataFrame({'X': X, 'T': T, 'Y': Y})" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TXY
T1.000000.9042100.775750
X0.904211.0000000.857782
Y0.775750.8577821.000000
\n", "
" ], "text/plain": [ " T X Y\n", "T 1.00000 0.904210 0.775750\n", "X 0.90421 1.000000 0.857782\n", "Y 0.77575 0.857782 1.000000" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# compute the correlation between every pair of columns\n", "fork_df.corr()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# COMPLETED CELL\n", "# compute the correlation between every pair of columns\n", "# corr_df = fork_df.corr()\n", "# corr_df.loc['Y', 'T']" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "np.float64(0.002639686410825742)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# T <- X -> Y: condition on the common cause X by keeping only the rows with\n", "# X == 1 (use 0 instead to inspect the other level of X), then read off the\n", "# T-Y correlation. The mask is read from fork_df itself rather than the loose\n", "# array X, so this cell depends only on fork_df.\n", "fork_df[fork_df['X'] == 1].corr().loc['T', 'Y']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# COMPLETED CELL\n", "# Examine the correlation between Y and T within each level of X\n", "# sel_df = fork_df[X==0].corr()\n", "# print(sel_df.loc['Y', 'T'])\n", "# sel_df = fork_df[X==1].corr()\n", "# print(sel_df.loc['Y', 'T'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Collider simulation\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# T: binary variable, 0 or 1 drawn with equal probability\n", "T = rng.choice([0, 1], size=n_samples)\n", "\n", "# Y: continuous variable centred on 0, generated without reference to T\n", "Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)\n", "\n", "# T -> C <- Y: binary C is T + Y plus Gaussian noise, thresholded at the\n", "# sample mean of that noisy sum\n", "C_noisy = rng.normal(loc=T + Y, scale=0.3)\n", "C = (C_noisy > C_noisy.mean()).astype(int)\n", "\n", "# collect the three variables into collider_df\n", "collider_df = pd.DataFrame({'T': T, 'Y': Y, 'C': C})" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# COMPLETED CELL\n", "# # Generate a binary T\n", "# T = rng.choice([0, 1], size=n_samples)\n", "\n", "# # Generate a continuous Y\n", "# Y = 
rng.normal(loc=np.zeros(n_samples), scale=0.3)\n", "\n", "# # Generate a binary C that is caused by T and Y\n", "# C = rng.normal(loc=T + Y, scale=0.3)\n", "# C = (C > C.mean()).astype(int)\n", "# # add columns to collider_df\n", "# collider_df = pd.DataFrame({'T': T, 'Y': Y, 'C': C})" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TYC
T1.000000-0.0059460.753668
Y-0.0059461.0000000.281741
C0.7536680.2817411.000000
\n", "
" ], "text/plain": [ " T Y C\n", "T 1.000000 -0.005946 0.753668\n", "Y -0.005946 1.000000 0.281741\n", "C 0.753668 0.281741 1.000000" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check the overall correlation between every pair of columns\n", "collider_df.corr()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# COMPLETED CELL\n", "# Check the overall correlation\n", "# collider_df.corr().loc['Y', 'T']\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "np.float64(-0.34709013628348634)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# T -> C <- Y: condition on the collider C by keeping only the rows with\n", "# C == 1 (use 0 instead to inspect the other level of C), then read off the\n", "# T-Y correlation. The mask is read from collider_df itself rather than the\n", "# loose array C, so this cell depends only on collider_df.\n", "collider_df[collider_df['C'] == 1].corr().loc['T', 'Y']" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# COMPLETED CELL\n", "# Check the correlation between Y and T within each level of C\n", "# sel_df = collider_df[C==0].corr()\n", "# print(sel_df.loc['Y', 'T'])\n", "# sel_df = collider_df[C==1].corr()\n", "# print(sel_df.loc['Y', 'T'])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.10" } }, "nbformat": 4, "nbformat_minor": 4 }