{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(lec6_live_complete)=\n",
    "# [complete] Lecture 6 live coding\n",
    "\n",
    "**2025-02-20**\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import ipywidgets as widgets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "rng = np.random.RandomState(42)\n",
    "# high number of samples to reduce sampling noise\n",
    "n_samples = 30000\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chain simulation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate a binary T\n",
    "T = rng.choice([0, 1], size=n_samples)\n",
    "\n",
    "# generate a binary M that is caused by T\n",
    "# T -> M\n",
    "M = rng.normal(loc=T, scale=0.3)\n",
    "M = (M > 0.5).astype(int)\n",
    "\n",
    "# generate a continuous Y that is caused by M\n",
    "# M -> Y\n",
    "Y = rng.normal(loc=M, scale=0.3)\n",
    "\n",
    "# add columns to chain_df\n",
    "chain_df = pd.DataFrame({\n",
    "    'T': T,\n",
    "    'M': M,\n",
    "    'Y': Y\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(30000, 3)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chain_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# COMPLETED CELL\n",
    "# generate a binary T\n",
    "# T = rng.choice([0, 1], size=n_samples)\n",
    "\n",
    "# # generate a binary M that is caused by T\n",
    "# M = rng.normal(loc=T, scale=0.3)\n",
    "# M = (M > 0.5).astype(int)\n",
    "\n",
    "# # generate a continuous Y that is caused by M\n",
    "# Y = rng.normal(loc=M, scale=0.3)\n",
    "\n",
    "# chain_df = pd.DataFrame({'T': T, 'M': M, 'Y': Y})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    ":::{note}\n",
    "\n",
    "Correlation is a measure of linear association that has a range of $[-1, 1]$. A correlation near 0 means no linear association.\n",
    "\n",
    ":::"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>T</th>\n",
       "      <th>M</th>\n",
       "      <th>Y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>T</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.901533</td>\n",
       "      <td>0.772763</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M</th>\n",
       "      <td>0.901533</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.858280</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Y</th>\n",
       "      <td>0.772763</td>\n",
       "      <td>0.858280</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          T         M         Y\n",
       "T  1.000000  0.901533  0.772763\n",
       "M  0.901533  1.000000  0.858280\n",
       "Y  0.772763  0.858280  1.000000"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# compute the correlation between every pair of columns\n",
    "chain_df.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# COMPLETED CELL\n",
    "# compute the correlation between every pair of columns\n",
    "# chain_df.corr()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    ":::{note}\n",
    "\n",
    "\"Controlling for\" a variable means that we're conditioning on the variable -- we're holding the variable constant.\n",
    "\n",
    ":::"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>T</th>\n",
       "      <th>M</th>\n",
       "      <th>Y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>T</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000686</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>M</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Y</th>\n",
       "      <td>0.000686</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          T   M         Y\n",
       "T  1.000000 NaN  0.000686\n",
       "M       NaN NaN       NaN\n",
       "Y  0.000686 NaN  1.000000"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Examine the correlation between Y and T within each level of M\n",
    "# T ->  M -> Y\n",
    "# T -|  M -> Y\n",
    "chain_df[M==1].corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# COMPLETED CELL\n",
    "# sel_df = chain_df[M==0].corr()\n",
    "# print(sel_df.loc['Y', 'T'])\n",
    "# sel_df = chain_df[M==1].corr()\n",
    "# print(sel_df.loc['Y', 'T'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fork simulation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate a binary X\n",
    "X = rng.choice([0, 1], size=n_samples)\n",
    "\n",
    "# generate a binary T that is caused by X\n",
    "# T <- X\n",
    "T = rng.normal(loc=X, scale=0.3)\n",
    "T = (T > 0.5).astype(int)\n",
    "\n",
    "# generate a continuous Y that is caused by X\n",
    "# Y <- X\n",
    "Y = rng.normal(loc=X, scale=0.3)\n",
    "\n",
    "# add columns to fork_df\n",
    "fork_df = pd.DataFrame({'T':T, 'X':X, 'Y':Y})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# COMPLETED CELL\n",
    "# # generate a binary X\n",
    "# X = rng.choice([0, 1], size=n_samples)\n",
    "\n",
    "# # generate a binary T that is caused by X\n",
    "# T = rng.normal(loc=X, scale=0.3)\n",
    "# T = (T > T.mean()).astype(int)\n",
    "\n",
    "# # generate a continuous Y that is caused by X\n",
    "# Y = rng.normal(loc=X, scale=0.3)\n",
    "\n",
    "# fork_df = pd.DataFrame({'X': X, 'T': T, 'Y': Y})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>T</th>\n",
       "      <th>X</th>\n",
       "      <th>Y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>T</th>\n",
       "      <td>1.00000</td>\n",
       "      <td>0.904210</td>\n",
       "      <td>0.775750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>X</th>\n",
       "      <td>0.90421</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.857782</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Y</th>\n",
       "      <td>0.77575</td>\n",
       "      <td>0.857782</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         T         X         Y\n",
       "T  1.00000  0.904210  0.775750\n",
       "X  0.90421  1.000000  0.857782\n",
       "Y  0.77575  0.857782  1.000000"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# compute the correlation between every pair of columns\n",
    "fork_df.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# COMPLETED CELL\n",
    "# compute the correlation between every pair of columns\n",
    "# corr_df = fork_df.corr()\n",
    "# corr_df.loc['Y', 'T']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.float64(0.002639686410825742)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fork_df[X==1].corr().loc['T', 'Y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# COMPLETED CELL\n",
    "# Examine the correlation between Y and T within each level of X\n",
    "# sel_df = fork_df[X==0].corr()\n",
    "# print(sel_df.loc['Y', 'T'])\n",
    "# sel_df = fork_df[X==1].corr()\n",
    "# print(sel_df.loc['Y', 'T'])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collider simulation\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate a binary T\n",
    "T = rng.choice([0, 1], size=n_samples)\n",
    "\n",
    "# Generate a continuous Y\n",
    "Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)\n",
    "\n",
    "# Generate a binary C that is caused by T and Y\n",
    "# T -> C <- Y\n",
    "C = rng.normal(loc=T+Y, scale=0.3)\n",
    "C = (C > C.mean()).astype(int)\n",
    "\n",
    "# add columns to collider_df\n",
    "collider_df = pd.DataFrame({'T':T, 'Y':Y, 'C':C})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# COMPLETED CELL\n",
    "# # Generate a binary T\n",
    "# T = rng.choice([0, 1], size=n_samples)\n",
    "\n",
    "# # Generate a continuous Y\n",
    "# Y = rng.normal(loc=np.zeros(n_samples), scale=0.3)\n",
    "\n",
    "# # Generate a binary C that is caused by T and Y\n",
    "# C = rng.normal(loc=T + Y, scale=0.3)\n",
    "# C = (C > C.mean()).astype(int)\n",
    "# # add columns to collider_df\n",
    "# collider_df = pd.DataFrame({'T': T, 'Y': Y, 'C': C})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>T</th>\n",
       "      <th>Y</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>T</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.005946</td>\n",
       "      <td>0.753668</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Y</th>\n",
       "      <td>-0.005946</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.281741</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C</th>\n",
       "      <td>0.753668</td>\n",
       "      <td>0.281741</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          T         Y         C\n",
       "T  1.000000 -0.005946  0.753668\n",
       "Y -0.005946  1.000000  0.281741\n",
       "C  0.753668  0.281741  1.000000"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "collider_df.corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check the overall correlation\n",
    "# collider_df.corr().loc['Y', 'T']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.float64(-0.34709013628348634)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the correlation between Y and T within each level of C\n",
    "collider_df[C==1].corr().loc['T', 'Y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# COMPLETED CELL\n",
    "# Check the correlation between Y and T within each level of C\n",
    "# sel_df = collider_df[C==0].corr()\n",
    "# print(sel_df.loc['Y', 'T'])\n",
    "# sel_df = collider_df[C==1].corr()\n",
    "# print(sel_df.loc['Y', 'T'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}