diff --git a/.gitignore b/.gitignore
index 295d151a..564210b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,10 +24,3 @@ bld/
 *.pdf
 *.run.xml
 *.synctex.gz
-*.synctex.gz.sum.synctex
-*.toc
-*.snm
-*.nav
-
-# Jupyter notebooks
-*.ipynb
diff --git a/inst/WORDLIST.txt b/inst/WORDLIST.txt
index 50c0cad2..83d8f199 100644
--- a/inst/WORDLIST.txt
+++ b/inst/WORDLIST.txt
@@ -99,10 +99,6 @@ Koepke
 Kopka
 KopkaDaly
 Langtangen
-late
-lates
-LATE
-LATEs
 LeClere
 linux
 Loong
diff --git a/paper/mte_papers.md b/paper/mte_papers.md
deleted file mode 100644
index 96281171..00000000
--- a/paper/mte_papers.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# MTE Papers
-
-A collection of papers related to the MTE framework.
-In particular, applied papers that employ the framework of Mogstad et al. ECMA.
-
-## Applied Papers using Mogstad et al. ECMA Framework
-
-### Rose and Shem-Tov 2021 JPE
-
-Title: *How Does Incarceration Affect Reoffending? Estimating the Dose-Response Function*
-
-They use and extend the MTE model for extrapolation. They don't provide inference.
-For example, see Figure 5 and Table 6.
-
-### Ito, Ida, and Tanaka AER
-
-Title: *Selection on Welfare Gains: Experimental Evidence from Electricity Plan Choice*
-
-They don't seem to primarily use the MST method, but rather the parametric method of Brinch et al.
-
-
-### Shea WP
-
-https://jkcshea.github.io/
-
-## To Check
-
-- https://direct.mit.edu/rest/article-abstract/doi/10.1162/rest_a_01483/124131/The-Value-of-Piped-Water-and-Sewers-Evidence-from?redirectedFrom=fulltext
-- https://direct.mit.edu/rest/article-abstract/105/3/646/102834/Reconciling-Seemingly-Contradictory-Results-from
-- https://academic.oup.com/ej/article-abstract/132/646/2231/6548192
-- https://academic.oup.com/restud/article-abstract/90/1/432/6582594
-- https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4405043
-- https://www.aeaweb.org/articles?id=10.1257/aer.20181756
-- Appendix of https://academic.oup.com/qje/article-abstract/133/4/1885/5025665
-
-## Misc
-
-- Mismeasured treatment: https://www.sciencedirect.com/science/article/pii/S0304407623002725
diff --git a/paper/presentation.tex b/paper/presentation.tex
deleted file mode 100644
index d7494c2c..00000000
--- a/paper/presentation.tex
+++ /dev/null
@@ -1,287 +0,0 @@
-\documentclass[11pt, aspectratio=169]{beamer}
-% \documentclass[11pt,handout]{beamer}
-\usepackage[T1]{fontenc}
-\usepackage[utf8]{inputenc}
-\usepackage{textcomp}
-\usepackage{float, afterpage, rotating, graphicx}
-\usepackage{epstopdf}
-\usepackage{longtable, booktabs, tabularx}
-\usepackage{fancyvrb, moreverb, relsize}
-\usepackage{eurosym, calc}
-\usepackage{amsmath, amssymb, amsfonts, amsthm, bm}
-\usepackage[
-    natbib=true,
-    bibencoding=inputenc,
-    bibstyle=authoryear-ibid,
-    citestyle=authoryear-comp,
-    maxcitenames=3,
-    maxbibnames=10,
-    useprefix=false,
-    sortcites=true,
-    backend=biber
-]{biblatex}
-\AtBeginDocument{\toggletrue{blx@useprefix}}
-\AtBeginBibliography{\togglefalse{blx@useprefix}}
-\setlength{\bibitemsep}{1.5ex}
-\addbibresource{refs.bib}
-
-\hypersetup{colorlinks=true, linkcolor=black, anchorcolor=black, citecolor=black,
-filecolor=black, menucolor=black, runcolor=black, urlcolor=black}
-
-\setbeamertemplate{footline}[frame number]
-\setbeamertemplate{navigation symbols}{}
-\setbeamertemplate{frametitle}{\centering\vspace{1ex}\insertframetitle\par}
-
-\newcommand{\indep}{\perp\!\!\!\!\perp}
-
-
-\begin{document}
-
-\title{Thesis}
-
-% \author[Julian Budde]
-% {
-% {\bf Julian Budde}\\
-% {\small University of Bonn}\\[1ex]
-% }
-
-\begin{frame}
-    \titlepage
-    \note{~}
-\end{frame}
-
-\begin{frame}
-    \frametitle{Marginal Treatment Effect Model: Notation}
-
-    Based on~\cite{mogstad2018using}.
-
-    \vspace{0.5cm}
-
-    Program evaluation setting:
-    \begin{itemize}
-        \item Outcome $Y$
-        \item Binary Treatment $D$
-        \item Potential Outcomes: $Y = D Y_1 + (1-D) Y_0$
-        \item Binary Instrument $Z$
-    \end{itemize}
-
-    \vspace{0.5cm}
-
-    \textbf{Treatment Choice Equation}:
-    \begin{equation}
-        D = I\{p(Z) - U \geq 0\}
-    \end{equation}
-    where $p(z)$ is the propensity score and $U\sim Unif[0,1]$.
-
-    \vspace{0.5cm}
-
-    $U$ is ``resistance'' to treatment: Small $u$ $\rightarrow$ always take treatment.
-
-\end{frame}
-
-\begin{frame}
-    \frametitle{MTE Model: Assumptions}
-    $(Y,D,Z)$ are observables, $(Y_1, Y_0, U)$ unobservables.
-
-
-    \begin{itemize}
-        \item $U \indep Z$
-        \item $E[Y_d|Z,U] = E[Y_d|U]$ and $E[Y_d^2] < \infty$ for $d \in \{0,1\}$
-        \item $U$ is uniform on $[0,1]$ conditional on $Z$.
-    \end{itemize}
-
-\end{frame}
-
-\begin{frame}
-    \frametitle{MTR Functions}
-
-    Key to the paper: Define everything in terms of (unobservable) MTR functions.
-
-    For $d\in\{0,1\}$:
-    \begin{equation}
-        m_d(u) \equiv E[Y_d | U=u].
-    \end{equation}
-
-    \vspace{0.5cm}
-
-    \textbf{Extrapolation}: Combine
-    \begin{itemize}
-        \item \textit{Information} on point-identified estimands, with
-        \item \textit{Assumptions} on MTR functions.
-    \end{itemize}
-
-    \vspace{0.5cm}
-
-    $\Rightarrow$ Linear program to get identified set.
-
-    \vspace{0.5cm}
-
-    Question: How to perform inference?
-\end{frame}
-
-\section{Simple Example}
-
-\begin{frame}
-    \frametitle{Setup}
-
-    \begin{itemize}
-        \item Outcome $Y \in [0,1]$.
-        \item Binary Treatment $D$
-        \item Binary Instrument $Z$
-        \item Propensity score: $p(0) = 0.4$ and $p(1) = 0.6$.
-    \end{itemize}
-
-    \vspace{0.5cm}
-
-    \textbf{Identified LATE}: $\beta_0^s = E[Y_1 - Y_0 | u \in (0.4, 0.6]]$.
-
-    \vspace{0.5cm}
-
-    \textbf{Target Parameter}: $\beta^* = E[Y_1 - Y_0 | u \in (0.4, 0.8]]$.
-
-\end{frame}
-
-\begin{frame}
-    \frametitle{Solution}
-    The linear programs have an explicit solution:
-    \begin{equation}
-        \beta^* \in [\omega \beta_0^s - (1 - \omega), \omega \beta_0^s + (1-\omega)],
-    \end{equation}
-    where $\omega = \frac{p(1) - p(0)}{\overline{u} + p(1) - p(0)}$ is the relative complier share.
-\end{frame}
-
-\begin{frame}
-    \frametitle{Solution with Constraints}
-
-    We can assume that MTR functions must be \textit{increasing} in $u$.
-    This introduces a \textit{kink} in the solution.
-
-    \begin{equation}
-        \overline{\beta^*}(\beta_s)=
-        \begin{cases}
-            \omega \beta_s + (1 - \omega),& \quad \text{if } \beta_s \geq 0\\
-            \beta_s + (1 - \omega), & \quad \text{if } \beta_s < 0,
-        \end{cases}
-    \end{equation}
-    and
-    \begin{equation}
-        \underline{\beta^*}(\beta_s)=
-        \begin{cases}
-            \beta_s - (1 - \omega),& \quad \text{if } \beta_s \geq 0\\
-            \omega \beta_s - (1 - \omega), & \quad \text{if } \beta_s < 0.
-        \end{cases}
-    \end{equation}
-
-\end{frame}
-
-\begin{frame}
-    \frametitle{Inference}
-
-    From~\cite{fang2019inference} we know the standard \textit{bootstrap fails} when $\phi(\theta)$ is not fully differentiable.
-
-    \vspace{0.5cm}
-
-    \textbf{Alternatives}: Based on \textit{directional} differentiability of $\phi$ we can use adjusted delta methods.
-    \begin{itemize}
-        \item \textit{Analytical} delta method, e.g.~\cite{fang2019inference}.
-        \item \textit{Numerical} delta method, e.g.~\cite{hong2018numerical}.
-    \end{itemize}
-
-
-\end{frame}
-
-\begin{frame}
-    \frametitle{Analytical delta bootstrap}
-    \begin{itemize}
-        \item[1.] Bootstrap approximation to distribution of $\sqrt{n}(\hat{\beta^s} - \beta_0^s)$.
-        \item[2.] Suitable estimator $\hat{\phi'_n}$ for directional derivative.
-        \item[3.] Delta method:
-            \begin{equation*}
-                \hat{\phi'_n}(\sqrt{n}(\hat{\beta^s}^*_n - \hat{\beta^s}_n))
-            \end{equation*}
-    \end{itemize}
-
-    In our case: Simple pre-test
-    \begin{equation*}
-        \hat{\phi'_n}(h)=
-        \begin{cases}
-            h \omega,& \quad \text{if } \sqrt{n}\hat{\beta^s}/\hat{\sigma^s} > \kappa_n\\
-            I(h < 0) h + I(h > 0) h \omega,& \quad \text{if } |\sqrt{n}\hat{\beta^s}/\hat{\sigma^s}| \leq \kappa_n\\
-            h,& \quad \text{if } \sqrt{n}\hat{\beta^s}/\hat{\sigma^s} < -\kappa_n\\
-        \end{cases}
-    \end{equation*}
-    where we require $\kappa_n \to \infty$ but more slowly than $\sqrt{n}$, i.e. $\kappa_n / \sqrt{n} \to 0$.
-\end{frame}
-
-\begin{frame}
-    \frametitle{Numerical delta bootstrap}
-
-    If we don't know the analytical structure of $\phi'$, we can use a numerical approximation:
-    \begin{equation*}
-        \hat{\phi'}_{n, s_n}(h) \equiv \frac{1}{s_n}\{\phi(\hat{\beta^s} + s_n h) - \phi(\hat{\beta^s})\}
-    \end{equation*}
-
-    Combining this with a bootstrap approximation $\sqrt{n}(\hat{\beta^s}^* - \hat{\beta^s})$ to the distribution of $\sqrt{n}(\hat{\beta^s} - \beta_0^s)$ we get
-    \begin{equation*}
-        \hat{\phi'}_{n, s_n}(\sqrt{n}(\hat{\beta^s}^* - \hat{\beta^s})) \equiv \frac{1}{s_n}\{\phi(\hat{\beta^s} + s_n \sqrt{n}(\hat{\beta^s}^* - \hat{\beta^s})) - \phi(\hat{\beta^s})\},
-    \end{equation*}
-    the distribution of which we can use to construct confidence intervals.
-    We require $s_n\to0$ but $s_n\sqrt{n} \to \infty$.
-\end{frame}
-
-\begin{frame}
-    \frametitle{Monte Carlo Simulations}
-
-    \begin{itemize}
-        \item True parameter $\beta^*$ at upper bound of identified set.
-        \item Compare: Standard bootstrap, analytical and numerical delta method.
-    \end{itemize}
-
-    \vspace{0.5cm}
-
-    To construct $1-\alpha = 0.95$ confidence interval for \textit{true parameter}:
-    \begin{itemize}
-        \item Use asymptotic approximations from above.
-        \item Construct one-sided $\alpha/2$ intervals for the upper and lower bound.
-        \item By~\cite{imbens2004confidence} logic these should be \textit{conservative}.
-    \end{itemize}
-
-    \vspace{0.5cm}
-
-    Setting: $N_{boot} = N_{sims} = 250$, $s_n = 1 / \sqrt{n}$, $\kappa_n = \sqrt{n}$.
-
-\end{frame}
-
-\begin{frame}
-    \frametitle{Results: Coverage}
-
-    \begin{figure}
-        \includegraphics[width=\textheight]{../bld/boot/figures/coverage.png}
-        \caption{Coverage by Method and True Parameter}
-    \end{figure}
-
-\end{frame}
-
-\begin{frame}
-    \frametitle{Outlook}
-    \begin{itemize}
-        \item[1.] Get inference to work: CI construction, tuning parameters, check analytical delta.
-        \item[2.] Understand solutions under different constraints: Shape restrictions, parametric restrictions (wrote a package for this)
-        \item[3.] Simulate bootstrap and numerical delta method in
-            \begin{itemize}
-                \item[(a)] More complicated example $\Rightarrow$ original paper example
-                \item[(b)] Empirically relevant example $\Rightarrow$ literature
-            \end{itemize}
-        \item[4.]
Justify numerical delta bootstrap theoretically by studying characteristics of LP solutions
-    \end{itemize}
-
-\end{frame}
-
-\begin{frame}[allowframebreaks]
-    \frametitle{References}
-    \renewcommand{\bibfont}{\normalfont\footnotesize}
-    \printbibliography
-\end{frame}
-
-\end{document}
diff --git a/paper/refs.bib b/paper/refs.bib
index eab15c16..9f5f1c1c 100644
--- a/paper/refs.bib
+++ b/paper/refs.bib
@@ -115,25 +115,3 @@ @article{poirier2024quantifying
   journal={arXiv preprint arXiv:2404.14603},
   year={2024}
 }
-
-@article{hong2018numerical,
-  title={The numerical delta method},
-  author={Hong, Han and Li, Jessie},
-  journal={Journal of Econometrics},
-  volume={206},
-  number={2},
-  pages={379--394},
-  year={2018},
-  publisher={Elsevier}
-}
-
-@article{fang2019inference,
-  title={Inference on directionally differentiable functions},
-  author={Fang, Zheng and Santos, Andres},
-  journal={The Review of Economic Studies},
-  volume={86},
-  number={1},
-  pages={377--412},
-  year={2019},
-  publisher={Oxford University Press}
-}
diff --git a/paper/thesis.tex b/paper/thesis.tex
index c0110c88..c820bf80 100644
--- a/paper/thesis.tex
+++ b/paper/thesis.tex
@@ -292,7 +292,7 @@ \subsubsection{Monotone MTR Functions}
     \caption{Upper Bound with Monotone MTR Functions}\label{fig:sm_upper_incr.tex}
 \end{figure}
 
-Figure~\ref{fig:sm_sol_upper} displays the solution to the upper bound $\overline{\beta^*}$ and lower bound $\underline{\beta^*}$ as a function of $\beta_s$ under different restrictions.
+Figure~\ref{fig:sm_sol_upper} displays the solution to the upper bound $\overline{\beta^*}(\beta_s)$ and lower bound $\underline{\beta^*}(\beta_s)$ as a function of $\beta_s$ under different restrictions.
 \begin{figure}
     \centering
     \begin{subfigure}{0.45\textwidth}
@@ -335,76 +335,6 @@ \subsection{Estimation and Inference}
     .
 \end{equation}
 
-\section{Inference Approaches}
-
-The target is to make inference on the parameter $\beta^*$, which for simulation purposes I set to $LATE(0.4, 0.8)$.
-\footnote{Inference for the interval-identified parameter $\beta^*$ and its identified set are not necessarily the same as pointed out by \cite{imbens2004confidence} in simple examples. I will return to this point below.}
-That is, we want to extrapolate the LATE for the instrument-complier subpopulation with $u\in[0.4, 0.6]$ to a broader sub-population.
-Inference is complicated by the fact that the optimal solution $\beta^* = \beta^*(\beta_s)$ exhibits a kink at $\beta_s=0$ and hence is not fully differentiable at that point.
-However, as pointed out by~\cite{fang2019inference} and~\cite{hong2018numerical}, a delta method can be used that only requires \textit{directional} differentiability of the functional.
-\cite{fang2019inference} propose a method that combines a bootstrap approximation to the underlying parameter with an estimate of the directional derivative of the functional $\phi$ that exploits its analytical structure.
-\cite{hong2018numerical} on the other hand propose to use a numerical approximation to the directional derivative.
-Henceforth, I will call these methods the \textit{analytical} and \textit{numerical} delta bootstrap, respectively.
-
-\subsection*{Notation}
-In the following I will use the notation of~\cite{fang2019inference}.
-Our goal is to perform inference on the real-valued parameter $\beta^*$, which is only set-identified.
-As shown above, the set is given by $[\underline{\beta^*}(\beta_s), \overline{\beta^*}(\beta_s)]$,
-with the following explicit solutions:
-\begin{equation*}
-    \overline{\beta^*}(\beta_s)=
-    \begin{cases}
-        \omega \beta_s + (1 - \omega),& \quad \text{if } \beta_s \geq 0\\
-        \beta_s + (1 - \omega), & \quad \text{if } \beta_s < 0,
-    \end{cases}
-\end{equation*}
-and
-\begin{equation*}
-    \underline{\beta^*}(\beta_s)=
-    \begin{cases}
-        \beta_s - (1 - \omega),& \quad \text{if } \beta_s \geq 0\\
-        \omega \beta_s - (1 - \omega), & \quad \text{if } \beta_s < 0.
-    \end{cases}
-\end{equation*}
-In the notation of~\cite{fang2019inference} we have an underlying parameter $\theta_0 \equiv \beta_0^s$ which in our case is real-valued.
-The mapping $\phi: \mathbb{R} \to \mathbb{R}$ is the solution to the upper or lower bound, respectively.
-That is, we have $\overline{\phi} \equiv \overline{\beta^*}$ with the mapping given above.
-As is evident, this function has a kink at $\beta_s = 0$ and hence is only directionally differentiable at that point.
-
-If we can find a method to construct confidence intervals for $\overline{\phi}(\beta_0^s)$ and $\underline{\phi}(\beta_0^s)$, we can also construct a confidence interval for the identified set and hence for $\beta^*$.
-
-
-Note that our setting is the same as Example 2.2 of~\cite{hong2018numerical}, which itself is a more general version of Example 2 in~\cite{fang2019inference}.
-
-\section{Simulation Results}
-
-\begin{figure}
-    \centering
-    \begin{subfigure}{0.45\textwidth}
-        \centering
-        \includegraphics[width=\textwidth]{../bld/boot/figures/coverage.png}
-        \caption{Coverage for True Parameter}
-    \end{subfigure}
-    \hfill
-    \begin{subfigure}{0.45\textwidth}
-        \centering
-        \includegraphics[width=\textwidth]{../bld/boot/figures/length.png}
-        \caption{Length of CIs}
-    \end{subfigure}
-
-    \begin{subfigure}{0.45\textwidth}
-        \centering
-        \includegraphics[width=\textwidth]{../bld/boot/figures/means.png}
-        \caption{Means of Upper and Lower Bound}
-    \end{subfigure}
-    \hfill
-    \begin{subfigure}{0.45\textwidth}
-        \centering
-        \includegraphics[width=\textwidth]{../bld/boot/figures/coverage_eps_fun.png}
-        \caption{Coverage by Step Size for Numerical Delta Method}
-    \end{subfigure}
-    \caption{Simulation: CI Properties for Standard and Numerical Delta Bootstrap}\label{fig:simulation_ci_bootstrap}
-\end{figure}
 
 \bibliographystyle{plain}
 \bibliography{refs.bib}
diff --git a/pyproject.toml b/pyproject.toml
index 26699b1c..f320ccd9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,9 +12,7 @@ pdbcls = "pdbp:Pdb"
 target-version = "py311"
 select = ["ALL"]
 fix = true
-# Might want to reduce this at some point, currently just set to whatever number
-# is required.
-pylint.max-args = 13
+pylint.max-args = 9
 fix-only = false  # No linting errors will be reported
 extend-ignore = [
     "S101",  # Use of `assert` detected.
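The numerical delta bootstrap that runs through the material above is compact enough to sketch end to end. The following is a minimal, self-contained illustration only: the value omega = 0.5, the standard-normal sample centered at the kink, the step size n^(-1/6), and all function names are assumptions made for this example, not taken from the thesis code. The kinked mapping phi_upper follows the explicit upper-bound solution for increasing MTRs stated in the thesis, whose directional derivative at the kink is omega * h for h >= 0 and h for h < 0.

"""Sketch: numerical delta bootstrap (Hong and Li, 2018) at a kinked bound."""

import numpy as np

OMEGA = 0.5  # relative complier share; assumed value for illustration


def phi_upper(beta_s: float) -> float:
    """Upper bound under increasing MTRs: slope 1 left of the kink, OMEGA right."""
    slope = OMEGA if beta_s >= 0 else 1.0
    return slope * beta_s + (1 - OMEGA)


def numerical_delta_ci(
    x: np.ndarray,
    alpha: float,
    n_boot: int,
    rng: np.random.Generator,
) -> tuple[float, float]:
    """Two-sided CI for phi_upper(E[X]) via the numerical delta bootstrap."""
    n_obs = x.shape[0]
    rn = np.sqrt(n_obs)
    eps = n_obs ** (-1 / 6)  # step size: eps -> 0 while rn * eps -> infinity

    beta_hat = x.mean()
    phi_hat = phi_upper(beta_hat)

    boot_deriv = np.zeros(n_boot)
    for i in range(n_boot):
        # Bootstrap draw of rn * (beta_hat_star - beta_hat) ...
        boot_mean = x[rng.choice(n_obs, size=n_obs, replace=True)].mean()
        z_s = rn * (boot_mean - beta_hat)
        # ... plugged into the numerical directional derivative.
        boot_deriv[i] = (phi_upper(beta_hat + eps * z_s) - phi_hat) / eps

    lo = phi_hat - np.quantile(boot_deriv, 1 - alpha / 2) / rn
    hi = phi_hat - np.quantile(boot_deriv, alpha / 2) / rn
    return lo, hi


rng = np.random.default_rng(0)
x = rng.normal(loc=0.0, scale=1.0, size=1_000)  # beta_s = 0: exactly at the kink
print(numerical_delta_ci(x, alpha=0.05, n_boot=500, rng=rng))

At the kink the limiting distribution of the plug-in estimator is non-normal, which is exactly why the standard bootstrap percentile interval fails there while the derivative-based construction above remains valid.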
diff --git a/src/thesis/simple_model/__init__.py b/src/thesis/bootstrap_fail/__init__.py
similarity index 100%
rename from src/thesis/simple_model/__init__.py
rename to src/thesis/bootstrap_fail/__init__.py
diff --git a/src/thesis/bootstrap_fail/bootstrap.ipynb b/src/thesis/bootstrap_fail/bootstrap.ipynb
new file mode 100644
index 00000000..174abe69
--- /dev/null
+++ b/src/thesis/bootstrap_fail/bootstrap.ipynb
@@ -0,0 +1,98 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Explore bootstrap simulation functions.\"\"\"\n",
+    "\n",
+    "import numpy as np\n",
+    "import plotly.graph_objects as go\n",
+    "\n",
+    "from thesis.bootstrap_fail.funcs import (\n",
+    "    _bootstrap_ci,\n",
+    "    _draw_data,\n",
+    ")\n",
+    "from thesis.classes import Instrument\n",
+    "from thesis.config import RNG\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_obs = 10_000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# _draw_data requires a param_pos and an instrument; the pscores here\n",
+    "# mirror the simple-model example (0.4, 0.6).\n",
+    "instrument = Instrument(\n",
+    "    support=np.array([0, 1]),\n",
+    "    pmf=np.array([0.5, 0.5]),\n",
+    "    pscores=np.array([0.4, 0.6]),\n",
+    ")\n",
+    "\n",
+    "data = _draw_data(\n",
+    "    n_obs=n_obs,\n",
+    "    rng=RNG,\n",
+    "    param_pos=\"boundary\",\n",
+    "    instrument=instrument,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lo, hi, late = _bootstrap_ci(\n",
+    "    alpha=0.05,\n",
+    "    u_hi=0.7,\n",
+    "    data=data,\n",
+    "    n_boot=1_000,\n",
+    "    rng=RNG,\n",
+    "    return_distr=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Make histogram of lo, hi, and late in one figure\n",
+    "fig = go.Figure()\n",
+    "fig.add_trace(go.Histogram(x=lo, histnorm=\"probability\", name=\"lo\"))\n",
+    "fig.add_trace(go.Histogram(x=hi, histnorm=\"probability\", name=\"hi\"))\n",
+    "fig.add_trace(go.Histogram(x=late, histnorm=\"probability\", name=\"late\"))\n",
+    "fig.update_layout(barmode=\"overlay\")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/thesis/bootstrap_fail/bootstrap_plot.ipynb b/src/thesis/bootstrap_fail/bootstrap_plot.ipynb
new file mode 100644
index 00000000..90d97003
--- /dev/null
+++ b/src/thesis/bootstrap_fail/bootstrap_plot.ipynb
@@ -0,0 +1,91 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Plot results from the bootstrap simulations.\"\"\"\n",
+    "\n",
+    "\n",
+    "import pandas as pd  # type: ignore[import-untyped]\n",
+    "import plotly.graph_objects as go  # type: ignore[import-untyped]\n",
+    "\n",
+    "from thesis.bootstrap_fail.task_bootstrap_sims import ID_TO_KWARGS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "coverage = []\n",
+    "\n",
+    "for kwargs in ID_TO_KWARGS.values():\n",
+    "    res = pd.read_pickle(kwargs.path_to_data)  # noqa: S301\n",
+    "    res[\"ci_covers\"] = (res[\"lo\"] < res[\"true\"]) & (res[\"hi\"] > res[\"true\"])\n",
+    "    res[\"ci_length\"] = res[\"hi\"] - res[\"lo\"]\n",
+    "    coverage.append(\n",
+    "        (\n",
+    "            kwargs.u_hi,\n",
+    "            kwargs.n_obs,\n",
+    "            res[\"ci_covers\"].mean(),\n",
+    "            res[\"ci_length\"].mean(),\n",
+    "        
),\n", + " )\n", + "\n", + "data = pd.DataFrame(coverage, columns=[\"u_hi\", \"n_obs\", \"coverage\", \"length\"])\n", + "\n", + "fig = go.Figure()\n", + "for n_obs in data.n_obs.unique():\n", + " data_sub = data[data.n_obs == n_obs]\n", + " fig.add_trace(\n", + " go.Scatter(x=data_sub.u_hi, y=data_sub.coverage, name=f\"n_obs={n_obs}\"),\n", + " )\n", + "\n", + "fig.update_layout(\n", + " title=\"Coverage probability of confidence interval\",\n", + " xaxis_title=\"u_hi\",\n", + " yaxis_title=\"Coverage probability\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = go.Figure()\n", + "for n_obs in data.n_obs.unique():\n", + " data_sub = data[data.n_obs == n_obs]\n", + " fig.add_trace(\n", + " go.Scatter(x=data_sub.u_hi, y=data_sub.length, name=f\"n_obs={n_obs}\"),\n", + " )\n", + "\n", + "fig.update_layout(\n", + " title=\"Average length of confidence interval\",\n", + " xaxis_title=\"u_hi\",\n", + " yaxis_title=\"Average length\",\n", + ")" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/thesis/bootstrap_fail/funcs.py b/src/thesis/bootstrap_fail/funcs.py new file mode 100644 index 00000000..3a60d283 --- /dev/null +++ b/src/thesis/bootstrap_fail/funcs.py @@ -0,0 +1,155 @@ +"""Functions for bootstrap sampling experiment.""" + +from math import comb + +import numpy as np +import pandas as pd # type: ignore[import-untyped] +from numpy.typing import ArrayLike +from scipy import integrate # type: ignore[import-untyped] + +from thesis.classes import Instrument + + +def simulation_bootstrap( + n_sims: int, + n_obs: int, + n_boot: int, + u_hi: float, + alpha: float, + instrument: Instrument, + rng: np.random.Generator, + param_pos: str = "boundary", +) -> pd.DataFrame: + """Simulate the bootstrap experiment. + + Args: + n_sims: Number of simulations. + n_obs: Number of observations. + n_boot: Number of bootstrap samples. + u_hi: Upper bound of target parameter. + alpha: Bootstrap percentile for confidence interval (1-alpha for upper). + instrument: Instrument object containing properties of the instrument. + rng: Random number generator. + param_pos: Position of the parameter in the identified set. + + Returns: + DataFrame with the results of the simulation. 
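+        Columns "lo" and "hi" hold the bootstrap CI bounds for each
+        simulation; "u_hi" and "true" record the target parameter cutoff
+        and its true value.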
+
+    """
+    results = np.zeros((n_sims, 2))
+
+    for i in range(n_sims):
+        data = _draw_data(n_obs, rng, param_pos=param_pos, instrument=instrument)
+
+        results[i] = _bootstrap_ci(alpha, u_hi, data, n_boot, rng)
+
+    out = pd.DataFrame(results, columns=["lo", "hi"])
+    out["u_hi"] = u_hi
+    out["true"] = _true_late(u_hi, instrument=instrument, param_pos=param_pos)
+
+    return out
+
+
+def _bootstrap_ci(
+    alpha: float,
+    u_hi: float,
+    data: np.ndarray,
+    n_boot: int,
+    rng: np.random.Generator,
+    return_distr: bool = False,  # noqa: FBT001, FBT002
+) -> tuple[ArrayLike, ...]:
+    boot_lo = np.zeros(n_boot)
+    boot_hi = np.zeros(n_boot)
+    boot_late = np.zeros(n_boot)
+
+    n_obs = data.shape[0]
+
+    for i in range(n_boot):
+        boot_data = data[rng.choice(n_obs, size=n_obs, replace=True)]
+        late = _late(boot_data)
+        pscores_hat = _estimate_pscores(boot_data)
+        lo, hi = _idset(late, u_hi, pscores_hat)
+
+        boot_late[i] = late
+        boot_lo[i] = lo
+        boot_hi[i] = hi
+
+    if return_distr:
+        return boot_lo, boot_hi, boot_late
+
+    # Take the alpha quantile of boot_lo and 1 - alpha quantile of boot_hi.
+    return np.quantile(boot_lo, alpha), np.quantile(boot_hi, 1 - alpha)
+
+
+def _idset(
+    b_late: float,
+    u_hi: float,
+    pscores_hat: tuple[float, float],
+) -> tuple[float, float]:
+    w = (pscores_hat[1] - pscores_hat[0]) / (u_hi + pscores_hat[1] - pscores_hat[0])
+
+    lo = w * b_late - (1 - w)
+    hi = w * b_late + (1 - w)
+
+    return lo, hi
+
+
+def _late(data: np.ndarray) -> float:
+    """Estimate the LATE using the Wald estimator."""
+    yz1 = data[data[:, 2] == 1, 0].mean()
+    yz0 = data[data[:, 2] == 0, 0].mean()
+
+    dz1 = data[data[:, 2] == 1, 1].mean()
+    dz0 = data[data[:, 2] == 0, 1].mean()
+
+    return (yz1 - yz0) / (dz1 - dz0)
+
+
+def _draw_data(
+    n_obs: int,
+    rng: np.random.Generator,
+    param_pos: str,
+    instrument: Instrument,
+) -> np.ndarray:
+    z = rng.choice(instrument.support, size=n_obs, p=instrument.pmf)
+
+    u = rng.uniform(low=0, high=1, size=n_obs)
+
+    d = np.where(z == 1, u <= instrument.pscores[1], u <= instrument.pscores[0])
+
+    # Situation where the parameter is at the boundary of the identified set.
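+    # At the boundary, compliers (u <= pscores[1]) have zero treatment effect
+    # while the never-takers added by the extrapolation have effect one, so the
+    # true extrapolated LATE sits exactly at the upper bound of the identified
+    # set (see _true_late below).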
+    if param_pos == "boundary":
+        y0 = 0
+        y1 = np.where(u <= instrument.pscores[1], 0, 1)
+    else:
+        # Only the boundary design is implemented so far; fail loudly instead
+        # of silently using undefined outcomes.
+        msg = f"param_pos '{param_pos}' is not implemented."
+        raise NotImplementedError(msg)
+
+    y = d * y1 + (1 - d) * y0 + rng.normal(scale=0.1, size=n_obs)
+
+    return np.column_stack((y, d, z, u))
+
+
+def _m0(u: float | np.ndarray) -> float | np.ndarray:
+    """Compute the m0 function."""
+    return 0.6 * _bern(0, 2, u) + 0.4 * _bern(1, 2, u) + 0.3 * _bern(2, 2, u)
+
+
+def _m1(u: float | np.ndarray) -> float | np.ndarray:
+    """Compute the m1 function."""
+    return 0.75 * _bern(0, 2, u) + 0.5 * _bern(1, 2, u) + 0.25 * _bern(2, 2, u)
+
+
+def _bern(v: int, n: int, x: float | np.ndarray) -> float | np.ndarray:
+    """Compute the Bernstein polynomial of degree n and index v at x."""
+    return comb(n, v) * x**v * (1 - x) ** (n - v)
+
+
+def _true_late(u_hi: float, instrument: Instrument, param_pos: str) -> float:
+    if param_pos == "boundary":
+        return 0 + 1 * (u_hi) / (u_hi + instrument.pscores[1] - instrument.pscores[0])
+
+    # Normalize by the mass of the target population to obtain an average effect.
+    return (
+        integrate.quad(_m1, instrument.pscores[0], instrument.pscores[1] + u_hi)[0]
+        - integrate.quad(_m0, instrument.pscores[0], instrument.pscores[1] + u_hi)[0]
+    ) / (u_hi + instrument.pscores[1] - instrument.pscores[0])
+
+
+def _estimate_pscores(data: np.ndarray) -> tuple[float, float]:
+    """Estimate the propensity score."""
+    return (data[data[:, 2] == 0, 1].mean(), data[data[:, 2] == 1, 1].mean())
diff --git a/src/thesis/bootstrap_fail/task_bootstrap_sims.py b/src/thesis/bootstrap_fail/task_bootstrap_sims.py
new file mode 100644
index 00000000..f156b21b
--- /dev/null
+++ b/src/thesis/bootstrap_fail/task_bootstrap_sims.py
@@ -0,0 +1,80 @@
+"""Task for running bootstrap simulations."""
+
+from pathlib import Path
+from typing import Annotated, NamedTuple
+
+import numpy as np
+from pytask import Product, task
+
+from thesis.bootstrap_fail.funcs import simulation_bootstrap
+from thesis.classes import Instrument
+from thesis.config import BLD, RNG
+
+
+class _Arguments(NamedTuple):
+    u_hi: float
+    path_to_data: Path
+    pscore_low: float
+    n_obs: int
+    pscore_hi: float = 0.6
+    alpha: float = 0.05
+    n_boot: int = 1_000
+    n_sims: int = 1_000
+    rng: np.random.Generator = RNG
+
+
+U_HI = np.linspace(0, 0.05, num=10)
+N_OBS = [100, 250]
+PSCORES_LOW = np.linspace(0.55, 0.6, num=10)
+
+ID_TO_KWARGS = {
+    f"bootstrap_sims_{u_hi}_n_obs_{n_obs}_pscore_low_{pscore_low}": _Arguments(
+        u_hi=u_hi,
+        n_obs=n_obs,
+        pscore_low=pscore_low,
+        path_to_data=Path(
+            BLD
+            / "boot"
+            / "results"
+            / f"data_{u_hi}_n_obs_{n_obs}_pscore_low_{pscore_low}.pkl",
+        ),
+    )
+    for u_hi in U_HI
+    for n_obs in N_OBS
+    for pscore_low in PSCORES_LOW
+}
+
+
+for id_, kwargs in ID_TO_KWARGS.items():
+
+    @task(id=id_, kwargs=kwargs)  # type: ignore[arg-type]
+    def task_bootstrap_sims(
+        n_sims: int,
+        n_obs: int,
+        n_boot: int,
+        u_hi: float,
+        alpha: float,
+        pscore_low: float,
+        pscore_hi: float,
+        rng: np.random.Generator,
+        path_to_data: Annotated[Path, Product],
+    ) -> None:
+        """Task for running bootstrap simulations."""
+        instrument = Instrument(
+            support=np.array([0, 1]),
+            pmf=np.array([0.5, 0.5]),
+            pscores=np.array([pscore_low, pscore_hi]),
+        )
+
+        res = simulation_bootstrap(
+            n_sims=n_sims,
+            n_obs=n_obs,
+            n_boot=n_boot,
+            u_hi=u_hi,
+            alpha=alpha,
+            instrument=instrument,
+            rng=rng,
+            param_pos="boundary",
+        )
+
+        res.to_pickle(path_to_data)
diff --git a/src/thesis/bootstrap_fail/task_plot_bootstrap_sims.py b/src/thesis/bootstrap_fail/task_plot_bootstrap_sims.py
new file mode 100644
index 00000000..0425c3cd
--- /dev/null
+++ b/src/thesis/bootstrap_fail/task_plot_bootstrap_sims.py
@@ -0,0 +1,71 @@
+"""Task for plotting bootstrap simulation results."""
+
+from
pathlib import Path
+from typing import Annotated
+
+import pandas as pd  # type: ignore[import-untyped]
+import plotly.graph_objects as go  # type: ignore[import-untyped]
+from pytask import Product
+
+from thesis.bootstrap_fail.task_bootstrap_sims import ID_TO_KWARGS, _Arguments
+from thesis.config import BLD
+
+
+def task_plot_bootstrap_sims(
+    id_to_kwargs: dict[str, _Arguments] = ID_TO_KWARGS,
+    path_to_plot_coverage: Annotated[Path, Product] = Path(
+        BLD / "boot" / "figures" / "coverage.png",
+    ),
+    path_to_plot_length: Annotated[Path, Product] = Path(
+        BLD / "boot" / "figures" / "length.png",
+    ),
+) -> None:
+    """Plot the coverage probability of the confidence interval."""
+    # Read all the bootstrap results and for each calculate the coverage probability
+    # of the confidence interval. Then plot against u_hi.
+    coverage = []
+
+    for kwargs in id_to_kwargs.values():
+        res = pd.read_pickle(kwargs.path_to_data)  # noqa: S301
+        res["ci_covers"] = (res["lo"] < res["true"]) & (res["hi"] > res["true"])
+        res["ci_length"] = res["hi"] - res["lo"]
+        coverage.append(
+            (
+                kwargs.u_hi,
+                kwargs.n_obs,
+                res["ci_covers"].mean(),
+                res["ci_length"].mean(),
+            ),
+        )
+
+    data = pd.DataFrame(coverage, columns=["u_hi", "n_obs", "coverage", "length"])
+
+    fig = go.Figure()
+    for n_obs in data.n_obs.unique():
+        data_sub = data[data.n_obs == n_obs]
+        fig.add_trace(
+            go.Scatter(x=data_sub.u_hi, y=data_sub.coverage, name=f"n_obs={n_obs}"),
+        )
+
+    fig.update_layout(
+        title="Coverage probability of confidence interval",
+        xaxis_title="u_hi",
+        yaxis_title="Coverage probability",
+    )
+
+    fig.write_image(path_to_plot_coverage)
+
+    fig = go.Figure()
+    for n_obs in data.n_obs.unique():
+        data_sub = data[data.n_obs == n_obs]
+        fig.add_trace(
+            go.Scatter(x=data_sub.u_hi, y=data_sub.length, name=f"n_obs={n_obs}"),
+        )
+
+    fig.update_layout(
+        title="Average length of confidence interval",
+        xaxis_title="u_hi",
+        yaxis_title="Average length",
+    )
+
+    fig.write_image(path_to_plot_length)
diff --git a/src/thesis/bootstrap_kink/bootstrap_max.ipynb b/src/thesis/bootstrap_kink/bootstrap_max.ipynb
new file mode 100644
index 00000000..0c0a60bb
--- /dev/null
+++ b/src/thesis/bootstrap_kink/bootstrap_max.ipynb
@@ -0,0 +1,212 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Investigate performance of bootstrap for phi(theta) = max(theta, 0).\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import plotly.graph_objs as go\n",
+    "from joblib import Parallel, delayed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _phi_max(theta):\n",
+    "    return np.maximum(theta, 0)\n",
+    "\n",
+    "\n",
+    "def _phi_kink(theta, a=0.5):\n",
+    "    return theta * (theta < 0) + a * theta * (theta >= 0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _experiment(n_obs, n_boot, theta_0, phi, alpha, rng, return_boot=False):\n",
+    "    x = rng.normal(theta_0, 1, n_obs)\n",
+    "\n",
+    "    mle = phi(x.mean())\n",
+    "\n",
+    "    boot_mle = np.zeros(n_boot)\n",
+    "\n",
+    "    for i in range(n_boot):\n",
+    "        boot_idx = rng.choice(n_obs, n_obs, replace=True)\n",
+    "        boot_x = x[boot_idx]\n",
+    "        boot_mle[i] = phi(boot_x.mean())\n",
+    "\n",
+    "    boot_distr = np.sqrt(n_obs) * (boot_mle - mle)\n",
+    "\n",
+    "    boot_cval_lo = 
np.percentile(boot_distr, 100 * alpha / 2)\n",
+    "    boot_cval_hi = np.percentile(boot_distr, 100 * (1 - alpha / 2))\n",
+    "\n",
+    "    ci_lo = mle - boot_cval_hi / np.sqrt(n_obs)\n",
+    "    ci_hi = mle - boot_cval_lo / np.sqrt(n_obs)\n",
+    "\n",
+    "    if return_boot:\n",
+    "        return boot_distr\n",
+    "\n",
+    "    return pd.Series(\n",
+    "        {\n",
+    "            \"ci_lo\": ci_lo,\n",
+    "            \"ci_hi\": ci_hi,\n",
+    "            \"theta_0\": theta_0,\n",
+    "            \"alpha\": alpha,\n",
+    "            \"n_obs\": n_obs,\n",
+    "            \"n_boot\": n_boot,\n",
+    "        },\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def simulation(n_sim, n_obs, n_boot, theta_0, phi, alpha, rng):\n",
+    "    \"\"\"Simulation.\"\"\"\n",
+    "    return pd.concat(\n",
+    "        [_experiment(n_obs, n_boot, theta_0, phi, alpha, rng) for _ in range(n_sim)],\n",
+    "        axis=1,\n",
+    "    ).T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Repeat simulation for different values of theta_0 and add 0\n",
+    "theta_0_vals = np.linspace(-0.2, 0.2, 20)\n",
+    "n_obs_vals = [250, 1_000]\n",
+    "\n",
+    "N_SIM = 2_000\n",
+    "N_BOOT = 2_000\n",
+    "\n",
+    "ALPHA = 0.05"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res = Parallel(n_jobs=10)(\n",
+    "    delayed(simulation)(\n",
+    "        n_sim=N_SIM,\n",
+    "        n_obs=n_obs,\n",
+    "        n_boot=N_BOOT,\n",
+    "        theta_0=theta_0,\n",
+    "        phi=_phi_kink,\n",
+    "        alpha=ALPHA,\n",
+    "        rng=np.random.default_rng(),\n",
+    "    )\n",
+    "    for theta_0 in theta_0_vals\n",
+    "    for n_obs in n_obs_vals\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res = pd.concat(res, axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "res[\"phi_0\"] = _phi_kink(res[\"theta_0\"])\n",
+    "res[\"covers\"] = (res[\"ci_lo\"] <= res[\"phi_0\"]) & (res[\"ci_hi\"] >= res[\"phi_0\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot mean coverage by theta_0 and n_obs\n",
+    "\n",
+    "res_grouped = res.groupby([\"theta_0\", \"n_obs\"])[\"covers\"].mean().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = go.Figure()\n",
+    "\n",
+    "for n_obs, group in res_grouped.groupby(\"n_obs\"):\n",
+    "    fig.add_trace(\n",
+    "        go.Scatter(\n",
+    "            x=group[\"theta_0\"],\n",
+    "            y=group[\"covers\"],\n",
+    "            mode=\"lines\",\n",
+    "            name=f\"n_obs={n_obs}\",\n",
+    "        ),\n",
+    "    )\n",
+    "\n",
+    "fig.update_layout(\n",
+    "    title=f\"Coverage of {1 - ALPHA} confidence intervals by theta_0\",\n",
+    "    xaxis_title=\"theta_0\",\n",
+    "    yaxis_title=\"Coverage\",\n",
+    "    yaxis={\"tickformat\": \".2%\"},\n",
+    ")\n",
+    "\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/thesis/bootstrap_kink/funcs_kink.py b/src/thesis/bootstrap_kink/funcs_kink.py
deleted file mode 100644
index 09690f06..00000000
--- 
a/src/thesis/bootstrap_kink/funcs_kink.py +++ /dev/null @@ -1,63 +0,0 @@ -"""Functions for analyzing inference for simple kink model.""" - -import numpy as np -import pandas as pd # type: ignore[import-untyped] - - -def simulation(n_sim, n_obs, n_boot, theta_0, phi, alpha, rng): - """Simulation.""" - return pd.concat( - [ - _experiment(n_obs, n_boot, theta_0, phi, alpha, rng, return_boot=False) - for _ in range(n_sim) - ], - axis=1, - ).T - - -def _experiment(n_obs, n_boot, theta_0, phi, alpha, rng, return_boot): - x = rng.normal(theta_0, 1, n_obs) - - mle = phi(x.mean()) - - boot_mle = np.zeros(n_boot) - - for i in range(n_boot): - boot_idx = rng.choice(n_obs, n_obs, replace=True) - boot_x = x[boot_idx] - boot_mle[i] = phi(boot_x.mean()) - - boot_distr = np.sqrt(n_obs) * (boot_mle - mle) - - boot_cval_lo = np.percentile(boot_distr, 100 * alpha / 2) - boot_cval_hi = np.percentile(boot_distr, 100 * (1 - alpha / 2)) - - ci_lo = mle - boot_cval_hi / np.sqrt(n_obs) - ci_hi = mle - boot_cval_lo / np.sqrt(n_obs) - - if return_boot: - return boot_distr - - return pd.Series( - { - "ci_lo": ci_lo, - "ci_hi": ci_hi, - "theta_0": theta_0, - "alpha": alpha, - "n_obs": n_obs, - "n_boot": n_boot, - }, - ) - - -def _phi_max(theta: float, cutoff: float = 0): - return np.maximum(theta, cutoff) - - -def _phi_kink( - theta: float, - kink: float, - slope_left: float = 0.5, - slope_right: float = 1, -): - return slope_left * theta * (theta < kink) + slope_right * theta * (theta >= kink) diff --git a/src/thesis/classes.py b/src/thesis/classes.py index d0eec08d..dc33a1ee 100644 --- a/src/thesis/classes.py +++ b/src/thesis/classes.py @@ -1,5 +1,4 @@ """Custom classes for thesis.""" - from collections.abc import Callable from dataclasses import dataclass from typing import NamedTuple @@ -72,11 +71,3 @@ class MonteCarloSetup(NamedTuple): sample_size: int repetitions: int u_hi_range: np.ndarray | None = None - - -class LocalATEs(NamedTuple): - """LATEs for subpopulations: never-taker, complier, always-taker.""" - - always_taker: float - complier: float - never_taker: float diff --git a/src/thesis/imbens_manski/im_funcs.py b/src/thesis/imbens_manski/funcs.py similarity index 100% rename from src/thesis/imbens_manski/im_funcs.py rename to src/thesis/imbens_manski/funcs.py diff --git a/src/thesis/imbens_manski/imbens_manski.ipynb b/src/thesis/imbens_manski/imbens_manski.ipynb new file mode 100644 index 00000000..31468180 --- /dev/null +++ b/src/thesis/imbens_manski/imbens_manski.ipynb @@ -0,0 +1,44 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Try out Imbens Manski simulation functions.\"\"\"\n", + "from thesis.config import RNG\n", + "from thesis.imbens_manski.funcs import experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "experiment(\n", + " n_obs=1000,\n", + " p=0.75,\n", + " alpha=0.95,\n", + " rng=RNG,\n", + ")" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/thesis/imbens_manski/_task_plot_imbens_manski.py b/src/thesis/imbens_manski/task_plot_imbens_manski.py similarity index 98% rename from src/thesis/imbens_manski/_task_plot_imbens_manski.py rename to 
src/thesis/imbens_manski/task_plot_imbens_manski.py index d32cacd4..bd16af8e 100644 --- a/src/thesis/imbens_manski/_task_plot_imbens_manski.py +++ b/src/thesis/imbens_manski/task_plot_imbens_manski.py @@ -8,7 +8,7 @@ from pytask import Product from thesis.config import BLD -from thesis.imbens_manski._task_sim_imbens_manski import ( +from thesis.imbens_manski.task_sim_imbens_manski import ( ID_TO_KWARGS, N_BOOT, N_SIMS, diff --git a/src/thesis/imbens_manski/_task_sim_imbens_manski.py b/src/thesis/imbens_manski/task_sim_imbens_manski.py similarity index 94% rename from src/thesis/imbens_manski/_task_sim_imbens_manski.py rename to src/thesis/imbens_manski/task_sim_imbens_manski.py index 6b195540..b206c04c 100644 --- a/src/thesis/imbens_manski/_task_sim_imbens_manski.py +++ b/src/thesis/imbens_manski/task_sim_imbens_manski.py @@ -4,11 +4,10 @@ from typing import Annotated, NamedTuple import numpy as np -import pytask from pytask import Product, task from thesis.config import BLD, RNG -from thesis.imbens_manski.im_funcs import simulation +from thesis.imbens_manski.funcs import simulation class _Arguments(NamedTuple): @@ -48,7 +47,6 @@ class _Arguments(NamedTuple): for id_, kwargs in ID_TO_KWARGS.items(): - @pytask.mark.skip() @task(id=id_, kwargs=kwargs) # type: ignore[arg-type] def task_manski_imbens_sims( p: float, diff --git a/src/thesis/misc/__init__.py b/src/thesis/misc/__init__.py deleted file mode 100644 index cdaaa2e9..00000000 --- a/src/thesis/misc/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Miscaellaneaous scripts and notebooks for the thesis.""" diff --git a/src/thesis/misc/estimate_avars.py b/src/thesis/misc/estimate_avars.py deleted file mode 100644 index d180047c..00000000 --- a/src/thesis/misc/estimate_avars.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Estimate asymptotic variances for some key estimators.""" - -import numpy as np - -from thesis.classes import Instrument, LocalATEs -from thesis.config import RNG -from thesis.simple_model.funcs import ( - _estimate_pscores, - _late, -) -from thesis.utilities import draw_data - -# -------------------------------------------------------------------------------------- -# Parameters -# -------------------------------------------------------------------------------------- - -n_obs = 100_000 -n_sim = 10_000 -late_complier = -0.5 -u_hi = 0.2 -rn = np.sqrt(n_obs) - -# -------------------------------------------------------------------------------------- -# Preliminary calculations -# -------------------------------------------------------------------------------------- - -local_ates = LocalATEs( - always_taker=0, - complier=late_complier, - never_taker=np.min((1, 1 + late_complier)), -) - -instrument = Instrument( - support=np.array([0, 1]), - pmf=np.array([0.5, 0.5]), - pscores=np.array([0.4, 0.6]), -) - - -true_w = (instrument.pscores[1] - instrument.pscores[0]) / ( - instrument.pscores[1] + u_hi - instrument.pscores[0] -) - -true_late = late_complier -true_idset = true_w * late_complier + (1 - true_w) * local_ates.always_taker - -# -------------------------------------------------------------------------------------- -# Asymptotic variance of LATE, w * LATE, w_hat * LATE and id_set -# -------------------------------------------------------------------------------------- - -sim_late = np.zeros(n_sim) -sim_hat_w = np.zeros(n_sim) -sim_late_true_w = np.zeros(n_sim) -sim_late_hat_w = np.zeros(n_sim) -sim_id_set = np.zeros(n_sim) -sim_id_set_slope_1 = np.zeros(n_sim) - -for i in range(n_sim): - data = draw_data( - n_obs=n_obs, - 
local_ates=local_ates, - instrument=instrument, - rng=RNG, - ) - _pscores = _estimate_pscores(data) - hat_w = (_pscores[1] - _pscores[0]) / (_pscores[1] + u_hi - _pscores[0]) - sim_hat_w[i] = hat_w - - sim_late[i] = _late(data) - sim_late_true_w[i] = _late(data) * true_w - sim_late_hat_w[i] = _late(data) * hat_w - sim_id_set[i] = _late(data) * hat_w + (1 - hat_w) - sim_id_set_slope_1[i] = _late(data) + (1 - hat_w) - -avar_late = (rn * (sim_late - true_late)).var() -avar_late_true_w = (rn * (sim_late_true_w - true_late * true_w)).var() -avar_late_hat_w = (rn * (sim_late_hat_w - true_late * true_w)).var() -avar_id_set = (rn * (sim_id_set - (true_late * true_w + (1 - true_w)))).var() -avar_id_set_slope_1 = (rn * (sim_id_set_slope_1)).var() - -avar_hat_w = (rn * sim_hat_w).var() - -cov_hat_w_late = np.cov(rn * sim_hat_w, rn * sim_late_hat_w)[0, 1] - -avar_id_set_2 = avar_late_hat_w + avar_hat_w - 2 * cov_hat_w_late - -assert np.isclose(avar_id_set, avar_id_set_2, rtol=1e-3) diff --git a/src/thesis/r_ivmte/.Rhistory b/src/thesis/r_ivmte/.Rhistory deleted file mode 100644 index d209f554..00000000 --- a/src/thesis/r_ivmte/.Rhistory +++ /dev/null @@ -1,96 +0,0 @@ -library("AER") -library("ivmte") -knitr::kable(head(AE, n = 10)) -lm(data = AE, worked ~ morekids) -lm(data = AE, morekids ~ samesex) -ivreg(data = AE, worked ~ morekids | samesex ) -# Simple bounds for ATT using moment approach -results <- ivmte(data = AE, -target = "att", -m0 = ~ u + yob, -m1 = ~ u + yob, -ivlike = worked ~ morekids + samesex + morekids*samesex, -propensity = morekids ~ samesex + yob, -noisy = TRUE) -# Simple bounds for ATT using moment approach -results <- ivmte(data = AE, -target = "att", -m0 = ~ u + yob, -m1 = ~ u + yob, -ivlike = worked ~ morekids + samesex + morekids*samesex, -propensity = morekids ~ samesex + yob, -noisy = TRUE) -library("AER") -library("ivmte") -# install.packages("ivmte") -# install.packages("splines2 ") -install.packages("lpSolveAPI ") -library("AER") -library("ivmte") -knitr::kable(head(AE, n = 10)) -lm(data = AE, worked ~ morekids) -lm(data = AE, morekids ~ samesex) -ivreg(data = AE, worked ~ morekids | samesex ) -# Simple bounds for ATT using moment approach -results <- ivmte(data = AE, -target = "att", -m0 = ~ u + yob, -m1 = ~ u + yob, -ivlike = worked ~ morekids + samesex + morekids*samesex, -propensity = morekids ~ samesex + yob, -noisy = TRUE) -library(lpSolveAPI) -# install.packages("ivmte") -# install.packages("splines2 ") -install.packages("lpSolveAPI ") -# install.packages("ivmte") -# install.packages("splines2 ") -install.packages("lpSolveAPI") -# install.packages("ivmte") -install.packages("splines2") -library("AER") -library("ivmte") -knitr::kable(head(AE, n = 10)) -lm(data = AE, worked ~ morekids) -lm(data = AE, morekids ~ samesex) -ivreg(data = AE, worked ~ morekids | samesex ) -# Simple bounds for ATT using moment approach -results <- ivmte(data = AE, -target = "att", -m0 = ~ u + yob, -m1 = ~ u + yob, -ivlike = worked ~ morekids + samesex + morekids*samesex, -propensity = morekids ~ samesex + yob, -noisy = TRUE) -library(lpSolveAPI) -# Simple bounds for ATT using moment approach -results <- ivmte(data = AE, -target = "att", -m0 = ~ u + yob, -m1 = ~ u + yob, -ivlike = worked ~ morekids + samesex + morekids*samesex, -propensity = morekids ~ samesex + yob, -noisy = TRUE) -# Parametric restrictions on m0, m1 -args <- list(data = AE, -ivlike = worked ~ morekids + samesex + morekids*samesex, -target = "att", -m0 = ~ u + I(u^2) + yob + u*yob, -m1 = ~ u + I(u^2) + I(u^3) + yob + 
u*yob, -propensity = morekids ~ samesex + yob) -r <- do.call(ivmte, args) -r -AE -length(AE) -len(AE) -shape(AE) -a = AE -# Parametric restrictions on m0, m1 -args <- list(data = AE, -ivlike = worked ~ morekids + samesex + morekids*samesex, -target = "att", -m0 = ~ u + I(u^2) + yob + u*yob, -m1 = ~ u + I(u^2) + I(u^3) + yob + u*yob, -propensity = morekids ~ samesex + yob) -r <- do.call(ivmte, args) -r diff --git a/src/thesis/r_ivmte/__init__.py b/src/thesis/r_ivmte/__init__.py deleted file mode 100644 index bbdbc89b..00000000 --- a/src/thesis/r_ivmte/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Explore R ivmte package: https://github.com/jkcshea/ivmte/.""" diff --git a/src/thesis/r_ivmte/ivmte_simple_model.r b/src/thesis/r_ivmte/ivmte_simple_model.r deleted file mode 100644 index e3ab010f..00000000 --- a/src/thesis/r_ivmte/ivmte_simple_model.r +++ /dev/null @@ -1,34 +0,0 @@ -# install.packages("ivmte") -# install.packages("splines2") -# install.packages("lpSolveAPI") -# install.packages("lsei ") -# install.packages("AER") - -library("AER") -library("ivmte") -knitr::kable(head(AE, n = 10)) - -lm(data = AE, worked ~ morekids) - -lm(data = AE, morekids ~ samesex) - -ivreg(data = AE, worked ~ morekids | samesex ) - -# Simple bounds for ATT using moment approach -results <- ivmte(data = AE, - target = "att", - m0 = ~ u + yob, - m1 = ~ u + yob, - ivlike = worked ~ morekids + samesex + morekids*samesex, - propensity = morekids ~ samesex + yob, - noisy = TRUE) - -# Parametric restrictions on m0, m1 -args <- list(data = AE, - ivlike = worked ~ morekids + samesex + morekids*samesex, - target = "att", - m0 = ~ u + I(u^2) + yob + u*yob, - m1 = ~ u + I(u^2) + I(u^3) + yob + u*yob, - propensity = morekids ~ samesex + yob) -r <- do.call(ivmte, args) -r diff --git a/src/thesis/simple_model/funcs.py b/src/thesis/simple_model/funcs.py deleted file mode 100644 index 2088088f..00000000 --- a/src/thesis/simple_model/funcs.py +++ /dev/null @@ -1,581 +0,0 @@ -"""Functions for bootstrap sampling experiment.""" - -from collections.abc import Callable -from functools import partial -from math import comb - -import numpy as np -import pandas as pd # type: ignore[import-untyped] -from numpy.typing import ArrayLike -from statsmodels.api import add_constant # type: ignore[import-untyped] -from statsmodels.sandbox.regression.gmm import IV2SLS # type: ignore[import-untyped] - -from thesis.classes import Instrument, LocalATEs -from thesis.utilities import draw_data - - -def simulation_bootstrap( - n_sims: int, - n_obs: int, - n_boot: int, - u_hi: float, - local_ates: LocalATEs, - instrument: Instrument, - alpha: float, - constraint_mtr: str, - bootstrap_method: str, - rng: np.random.Generator, - bootstrap_params: dict[str, Callable] | None = None, -) -> pd.DataFrame: - """Simulate the bootstrap experiment. - - Args: - n_sims: Number of simulations. - n_obs: Number of observations. - n_boot: Number of bootstrap samples. - local_ates: Local average treatment effects by complier type. - u_hi: Upper bound of target parameter. - instrument: Instrument object containing properties of the instrument. - alpha: Bootstrap percentile for confidence interval (1-alpha for upper). - constraint_mtr: Constraint on the marginal treatment response functions. - rng: Random number generator. - bootstrap_method: Method to compute the bootstrap confidence interval. - bootstrap_params: Additional parameters for the bootstrap method depending on - the bootstrap_method. - - Returns: - DataFrame with the results of the simulation. 
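-        Columns "lo" and "hi" hold the bootstrap CI bounds, "true" the true
-        parameter, and "beta_lo"/"beta_hi" the estimated identified set.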
- - """ - if bootstrap_params is None: - bootstrap_params = {} - _check_constraint_supported(constraint_mtr) - _check_bootsrap_method_supported(bootstrap_method) - _check_instrument_and_u_hi_consistent(u_hi, instrument) - _check_complier_never_taker_consistent(local_ates, constraint_mtr) - _check_bootstrap_params_supplied(bootstrap_method, bootstrap_params) - - results = np.zeros((n_sims, 2)) - - # TODO(@buddejul): Remove after debugging. - beta_lo = np.zeros(n_sims) - beta_hi = np.zeros(n_sims) - - for i in range(n_sims): - data = draw_data(n_obs, local_ates=local_ates, instrument=instrument, rng=rng) - - beta_lo[i], beta_hi[i] = _idset( - b_late=_late(data), - u_hi=u_hi, - pscores_hat=_estimate_pscores(data), - constraint_mtr=constraint_mtr, - ) - - results[i] = _bootstrap_ci( - n_boot=n_boot, - bootstrap_method=bootstrap_method, - bootstrap_params=bootstrap_params, - alpha=alpha, - u_hi=u_hi, - data=data, - constraint_mtr=constraint_mtr, - rng=rng, - ) - - out = pd.DataFrame(results, columns=["lo", "hi"]) - out["true"] = _true_late(u_hi, instrument=instrument, local_ates=local_ates) - out["beta_lo"] = beta_lo - out["beta_hi"] = beta_hi - - return out - - -def _bootstrap_ci( - bootstrap_method: str, - alpha: float, - u_hi: float, - data: np.ndarray, - n_boot: int, - constraint_mtr: str, - rng: np.random.Generator, - bootstrap_params: dict[str, Callable] | None = None, -) -> tuple[ArrayLike, ...]: - """Compute bootstrap confidence interval. - - The function allows for different type of bootstrap confidence intervals: - - standard: Calculates phi(beta_s) for every bootstrap drawn. Then it returns the - quantiles of the bootstrap distribution. - - numerical delta: Separately bootstraps the distribution of beta_s and estimates a - a derivative phi'(beta_s) using a numerical approximation. - Based on Hong and Li (2018). - - analytical delta: Separately bootstraps the distribution of beta_s and estimates a - a derivative phi'(beta_s) utilizing the analytical structure of the derivative. - Based on Fang and Santos (2017). - - """ - # Note that we currently implement this by separately bootstrapping the data for - # each method. This is not the most efficient way to do this, but keeps the code - # easy to organize. If it turns we out we spend a significant time on resampling - # we might want to consider a more efficient implementation. - - if bootstrap_params is None: - bootstrap_params = {} - if bootstrap_method == "standard": - return _ci_standard_bootstrap( - n_boot=n_boot, - data=data, - alpha=alpha, - u_hi=u_hi, - constraint_mtr=constraint_mtr, - rng=rng, - ) - if bootstrap_method == "numerical_delta": - return _ci_numerical_delta_bootstrap( - n_boot=n_boot, - data=data, - alpha=alpha, - u_hi=u_hi, - constraint_mtr=constraint_mtr, - rng=rng, - eps_fun=bootstrap_params["eps_fun"], - ) - if bootstrap_method == "analytical_delta": - return _ci_analytical_delta_bootstrap( - n_boot=n_boot, - data=data, - alpha=alpha, - u_hi=u_hi, - constraint_mtr=constraint_mtr, - rng=rng, - kappa_fun=bootstrap_params["kappa_fun"], - ) - - # In principle, this should not be needed because we check the supported methods - # after input. However, mypy does not seem to recognize this. Or maybe this function - # is called from somewhere else. - msg = f"Bootstrap method '{bootstrap_method}' not supported." 
- raise ValueError(msg) - - -def _ci_standard_bootstrap( - n_boot: int, - data: np.ndarray, - alpha: float, - u_hi: float, - constraint_mtr: str, - rng: np.random.Generator, -) -> tuple[float, float]: - """Compute the standard bootstrap confidence interval. - - The standard, non-parametric approach calculates phi(X_hat) for each bootstrap. In - our case this amounts to computing the identified set based on the bootstrap sample. - Then quantiles of this bootstrap distribution are used to construct CIs. - - """ - boot_lo = np.zeros(n_boot) - boot_hi = np.zeros(n_boot) - - n_obs = data.shape[0] - - for i in range(n_boot): - boot_data, pscores_boot = _draw_bootstrap_data(data=data, n_obs=n_obs, rng=rng) - - # TODO(@buddejul): This might be more efficiently implemented by vectorizing the - # _idset call on the _late and _estimate_pscores bootstrap estimates. - boot_lo[i], boot_hi[i] = _idset( - b_late=_late(boot_data), - u_hi=u_hi, - pscores_hat=pscores_boot, - constraint_mtr=constraint_mtr, - ) - - # Explicitly make this a float to avoid static typing issues. np.quantile returns a - # np.float64 type, which is not compatible with the float type hint, although this - # seems to be addressed in https://github.com/numpy/numpy/pull/27334. - # Note we also take (1 - ) alpha/2 quantiles. This is to keep to code consistent. - # Following the usual Imbens and Manski argument, when covering the parameter we - # would only use (1 - ) alpha quantiles. - return float(np.quantile(boot_lo, alpha / 2)), float( - np.quantile(boot_hi, 1 - alpha / 2), - ) - - -def _ci_numerical_delta_bootstrap( - n_boot: int, - data: np.ndarray, - alpha: float, - u_hi: float, - constraint_mtr: str, - rng: np.random.Generator, - eps_fun: Callable = lambda n: n ** (-1 / 6), -) -> tuple[float, float]: - """Compute the numerical delta bootstrap confidence interval. - - Based on Hong and Li (2018), for details see p. 382. - - The stepsize is set to n^(-1/6) by default, which is at least theoretically - consistent with eps -> zero and rn * eps -> infty, i.e. eps converges more slowly. - - """ - n_obs = data.shape[0] - - eps = eps_fun(n_obs) - - rn = np.sqrt(n_obs) - - late = _late(data) - _id_data = _idset(late, u_hi, _estimate_pscores(data), constraint_mtr) - - boot_delta_lo = np.zeros(n_boot) - boot_delta_hi = np.zeros(n_boot) - - for i in range(n_boot): - # Step 1: Draw Z_s from the bootstrap distribution. - boot_data, pscores_boot = _draw_bootstrap_data(data=data, n_obs=n_obs, rng=rng) - - z_s = rn * (_late(boot_data) - late) - - # Step 2: Compute the numerical derivative. - # Note that in our case we need to do this for both the lower and upper bound. - - _id_plus_eps_boot = _idset( - b_late=late + eps * z_s, - u_hi=u_hi, - pscores_hat=pscores_boot, - constraint_mtr=constraint_mtr, - ) - - boot_delta_lo[i], boot_delta_hi[i] = (1 / eps) * (_id_plus_eps_boot - _id_data) - - # Construct 1 - alpha two-sided equal-tailed confidence interval for phi(beta_s). - # Note: Based on the Imbens and Manski argument for covering the parameter - instead - # of the identified set - we might only need to take "alpha" critical values here, - # instead of alpha. To achieve this, the user may supply alpha * 2 as the alpha. - - # Compute the lower CI bound for the lower bound of the identified set. - ci_lo = _id_data[0] - (1 / rn) * np.quantile(boot_delta_lo, 1 - alpha / 2) - - # Compute the upper CI bound for the upper bound of the identified set. 
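-    # As with percentile-bootstrap intervals, the quantiles flip: subtracting
-    # the alpha/2 quantile of the derivative distribution (scaled by 1/rn)
-    # yields the upper confidence limit.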
- ci_hi = _id_data[1] - (1 / rn) * np.quantile(boot_delta_hi, alpha / 2) - - return ci_lo, ci_hi - - -def _ci_analytical_delta_bootstrap( - n_boot: int, - data: np.ndarray, - alpha: float, - u_hi: float, - constraint_mtr: str, - rng: np.random.Generator, - kappa_fun: Callable, -) -> tuple[float, float]: - """Compute the analytical delta bootstrap confidence interval. - - Based on Fang and Santos (2017), adapted from Example 2.1 equation (26). kappa_fun - is the tuning parameter used for the pretest to estimate the derivative. - - """ - n_obs = data.shape[0] - - kappa_n = kappa_fun(n_obs) - - rn = np.sqrt(n_obs) - - # Estimate late using 2SLS to get the standard error for estimating the derivative. - late, se_late = _late_2sls(data) - - # Note se_late is the standard error of the LATE estimate, i.e. the estimate of the - # asymptotic variance divided by rn. Hence we rescale to the asymptotic variance. - sigma_hat = rn * se_late - - pscores = _estimate_pscores(data) - - boot_late_scaled = np.zeros(n_boot) - boot_w_scaled = np.zeros(n_boot) - - w = (pscores[1] - pscores[0]) / (u_hi + pscores[1] - pscores[0]) - - # Step 1: Bootstrap quantiles for the identified parameter beta_s. - for i in range(n_boot): - # Step 1: Draw Z_s from the bootstrap distribution of beta_s. - boot_data, _ = _draw_bootstrap_data(data=data, n_obs=n_obs, rng=rng) - - boot_pscores = _estimate_pscores(boot_data) - - boot_late_scaled[i] = rn * (_late(boot_data) - late) - boot_w = (boot_pscores[1] - boot_pscores[0]) / ( - u_hi + boot_pscores[1] - boot_pscores[0] - ) - - boot_w_scaled[i] = rn * (boot_w - w) - - # Step 2: Estimate the derivative. - # We need two separate derivatives, since the upper and lower bound have - # different solutions. - # We apply the derivative also to the bootstrapped w, since this is a nuisance - # parameter. Otherwise, we understate the variance. - _kwargs = { - "b": boot_late_scaled, - "w": boot_w_scaled, - "beta_late": late, - "sigma_hat": sigma_hat, - "kappa_n": kappa_n, - "rn": rn, - } - - _d_phi = partial(_d_phi_kink, **_kwargs) - - # Note the solution for the upper bound has a "+(1-w)" and for the lower bound a - # "-(1-w)" term. Hence the slopes are -1 and 1, respectively. - - if constraint_mtr == "none": - boot_d_phi_upper = _d_phi( - slope_left=boot_w, - slope_right=boot_w, - slope_w=-1, - ) - boot_d_phi_lower = _d_phi( - slope_left=boot_w, - slope_right=boot_w, - slope_w=1, - ) - - elif constraint_mtr == "increasing": - boot_d_phi_upper = _d_phi(slope_left=1, slope_right=boot_w, slope_w=-1) - boot_d_phi_lower = _d_phi(slope_left=boot_w, slope_right=1, slope_w=1) - - # Step 3: Construct confidence intervals - id_lo, id_hi = _idset( - b_late=late, - u_hi=u_hi, - pscores_hat=pscores, - constraint_mtr=constraint_mtr, - ) - - _c_1_minus_alpha_half = np.quantile(boot_d_phi_lower, 1 - alpha / 2) - boot_ci_lo = id_lo - _c_1_minus_alpha_half / rn - - _c_alpha_half = np.quantile(boot_d_phi_upper, alpha / 2) - boot_ci_hi = id_hi - _c_alpha_half / rn - - return boot_ci_lo, boot_ci_hi - - -def _idset( - b_late: float, - u_hi: float, - pscores_hat: tuple[float, float], - constraint_mtr: str, -) -> np.ndarray: - w = (pscores_hat[1] - pscores_hat[0]) / (u_hi + pscores_hat[1] - pscores_hat[0]) - - if constraint_mtr == "none": - lo = w * b_late - (1 - w) - hi = w * b_late + (1 - w) - - # Restricting the MTRs to be increasing changes the solution to have a kink when - # viewed as a function of the identified parameter. 
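-    # Upper bound: slope 1 left of the kink and w to the right of it; the
-    # lower bound reverses the two slopes, matching the explicit solutions
-    # in the thesis text.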
-    elif constraint_mtr == "increasing":
-        lo = _phi_kink(theta=b_late, kink=0, slope_left=w, slope_right=1) - (1 - w)
-        hi = _phi_kink(theta=b_late, kink=0, slope_left=1, slope_right=w) + (1 - w)
-
-    return np.array([lo, hi])
-
-
-def _late(data: np.ndarray) -> float:
-    """Estimate the LATE using the Wald estimator."""
-    yz1 = data[data[:, 2] == 1, 0].mean()
-    yz0 = data[data[:, 2] == 0, 0].mean()
-
-    dz1 = data[data[:, 2] == 1, 1].mean()
-    dz0 = data[data[:, 2] == 0, 1].mean()
-
-    return (yz1 - yz0) / (dz1 - dz0)
-
-
-def _late_2sls(data: np.ndarray) -> tuple[float, float]:
-    """Estimate the LATE using 2SLS; also returns the standard error."""
-    # Data order is y, d, z, u.
-
-    # In statsmodels, endog refers to the dependent variable, while exog refers to the
-    # explanatory variables, including all controls and the - endogenous - treatment
-    # variable.
-    model = IV2SLS(
-        endog=data[:, 0],
-        exog=add_constant(data[:, 1]),
-        instrument=add_constant(data[:, 2]),
-    )
-
-    res = model.fit()
-
-    return res.params[1], res.bse[1]
-
-
-def _draw_bootstrap_data(
-    data: np.ndarray,
-    n_obs: int,
-    rng: np.random.Generator,
-    max_iter: int = 100,
-) -> tuple[np.ndarray, tuple[float, float]]:
-    """Draw data for the bootstrap; redraw if propensity scores are equal."""
-    boot_data = data[rng.choice(n_obs, size=n_obs, replace=True)]
-
-    pscores_boot = _estimate_pscores(boot_data)
-
-    if pscores_boot[0] != pscores_boot[1]:
-        return boot_data, pscores_boot
-
-    # If the pscores are equal we redraw the data. We set a maximum number of
-    # iterations to avoid infinite loops.
-    i = 0
-    while pscores_boot[0] == pscores_boot[1]:
-        boot_data = data[rng.choice(n_obs, size=n_obs, replace=True)]
-        pscores_boot = _estimate_pscores(boot_data)
-        i = i + 1
-
-        if i == max_iter:
-            msg = (
-                f"Bootstrap failed to generate different propensity scores "
-                f"({max_iter} draws)."
-            )
-            raise ValueError(msg)
-
-    return boot_data, pscores_boot
-
-
-def _m0(u: float | np.ndarray) -> float | np.ndarray:
-    """Compute the m0 function."""
-    return 0.6 * _bern(0, 2, u) + 0.4 * _bern(1, 2, u) + 0.3 * _bern(2, 2, u)
-
-
-def _m1(u: float | np.ndarray) -> float | np.ndarray:
-    """Compute the m1 function."""
-    return 0.75 * _bern(0, 2, u) + 0.5 * _bern(1, 2, u) + 0.25 * _bern(2, 2, u)
-
-
-def _bern(v: int, n: int, x: float | np.ndarray) -> float | np.ndarray:
-    """Compute the Bernstein polynomial of degree n and index v at x."""
-    return comb(n, v) * x**v * (1 - x) ** (n - v)
-
-
-def _true_late(u_hi: float, instrument: Instrument, local_ates: LocalATEs) -> float:
-    _target_pop_size = u_hi + instrument.pscores[1] - instrument.pscores[0]
-    _complier_share = (instrument.pscores[1] - instrument.pscores[0]) / _target_pop_size
-
-    return (
-        _complier_share * local_ates.complier
-        + (1 - _complier_share) * local_ates.never_taker
-    )
-
-
-def _estimate_pscores(data: np.ndarray) -> tuple[float, float]:
-    """Estimate the propensity score."""
-    return (data[data[:, 2] == 0, 1].mean(), data[data[:, 2] == 1, 1].mean())
-
-
-def _phi_max(theta: float, cutoff: float = 0):
-    return np.maximum(theta, cutoff)
-
-
-def _phi_kink(
-    theta: float,
-    kink: float,
-    slope_left: float,
-    slope_right: float,
-) -> float:
-    return slope_left * theta * (theta < kink) + slope_right * theta * (theta >= kink)
-
-
-def _d_phi_kink(
-    b: float,
-    w: float,
-    beta_late: float,
-    sigma_hat: float,
-    kappa_n: float,
-    rn: float,
-    slope_left: float,
-    slope_right: float,
-    slope_w: float,
-    kink: float = 0,
-) -> float:
-    """Estimator for the derivative of the identified set."""
-    # TODO(@buddejul): Write a more general version allowing for a different kink
-    # point. Currently we do this for a kink at zero; this should show up in the
-    # pre-test.
-
-    cond_right = rn * (beta_late - kink) / sigma_hat > kappa_n
-    cond_left = rn * (beta_late - kink) / sigma_hat < -kappa_n
-    cond_mid = (not cond_right) & (not cond_left)
-
-    # Note we need to add the variability resulting from estimating the propensity
-    # score in the derivative of the function: w * b +- (1 - w).
-    # TODO(@buddejul): This should probably also include a product-rule term.
-    return (
-        cond_right * b * slope_right
-        + cond_left * b * slope_left
-        + cond_mid * ((b < 0) * b * slope_left + (b >= 0) * b * slope_right)
-        + w * slope_w
-        + beta_late * w * (b >= 0)
-    )
-
-
-def _check_constraint_supported(constraint_mtr: str) -> None:
-    supported = ["none", "increasing"]
-    if constraint_mtr not in supported:
-        msg = (
-            f"Constraint '{constraint_mtr}' not supported.\n"
-            f"Supported constraints: {supported}"
-        )
-        raise ValueError(msg)
-
-
-def _check_bootsrap_method_supported(bootstrap_method: str) -> None:
-    supported = ["standard", "numerical_delta", "analytical_delta"]
-    if bootstrap_method not in supported:
-        msg = (
-            f"Bootstrap method '{bootstrap_method}' not supported.\n"
-            f"Supported bootstrap methods: {supported}"
-        )
-        raise ValueError(msg)
-
-
-def _check_instrument_and_u_hi_consistent(u_hi: float, instrument: Instrument) -> None:
-    if u_hi + instrument.pscores[1] > 1:
-        msg = (
-            f"Upper bound u_hi + pscores[1] = {u_hi + instrument.pscores[1]} "
-            f"exceeds 1. This is not allowed."
-        )
-        raise ValueError(msg)
-
-
-def _check_complier_never_taker_consistent(local_ates, constraint_mtr):
-    if constraint_mtr == "increasing" and (
-        local_ates.complier < 0 and local_ates.never_taker >= 1
-    ):
-        # To be consistent with increasing MTR funcs, the never-taker ATE can be at
-        # most min(1, 1 + complier ate). Hence, if the complier ATE is negative, the
-        # never-taker ATE needs to be smaller than 1.
-        msg = (
-            "Whenever late_complier < 0, the largest possible never-taker ATE is "
-            "1 + late_complier < 1."
-        )
-        raise ValueError(msg)
-
-
-def _check_bootstrap_params_supplied(
-    bootstrap_method: str,
-    bootstrap_params: dict[str, Callable],
-) -> None:
-    if bootstrap_method == "numerical_delta" and "eps_fun" not in bootstrap_params:
-        msg = (
-            "Numerical delta bootstrap method requires the user to supply an epsilon "
-            "function via bootstrap_params under key 'eps_fun'."
-        )
-        raise ValueError(msg)
-
-    if bootstrap_method == "analytical_delta" and "kappa_fun" not in bootstrap_params:
-        msg = (
-            "Analytical delta bootstrap method requires the user to supply a kappa "
-            "function via bootstrap_params under key 'kappa_fun'."
-        )
-        raise ValueError(msg)
diff --git a/src/thesis/simple_model/task_combine_sim_results.py b/src/thesis/simple_model/task_combine_sim_results.py
deleted file mode 100644
index 2e06c07e..00000000
--- a/src/thesis/simple_model/task_combine_sim_results.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""Task for combining simulation results into dataset."""
-
-from pathlib import Path
-from typing import Annotated
-
-import pandas as pd  # type: ignore[import-untyped]
-from pytask import Product
-
-from thesis.config import BLD
-from thesis.simple_model.task_simple_model_sims import (
-    EPS_FUNS_NUMERICAL_DELTA,
-    ID_TO_KWARGS,
-    KAPPA_FUNS_ANALYTICAL_DELTA,
-    _Arguments,
-)
-from thesis.utilities import get_func_as_string
-
-EPS_FUNS_STRINGS = [get_func_as_string(eps_fun) for eps_fun in EPS_FUNS_NUMERICAL_DELTA]
-KAPPA_FUNS_STRINGS = [
-    get_func_as_string(kappa_fun) for kappa_fun in KAPPA_FUNS_ANALYTICAL_DELTA
-]
-
-
-def task_combine_sim_results(
-    id_to_kwargs: dict[str, _Arguments] = ID_TO_KWARGS,
-    path_to_data: Annotated[Path, Product] = Path(
-        BLD / "simple_model" / "sim_results_combined.pkl",
-    ),
-) -> None:
-    """Combine all simulation results into a single dataframe."""
-    coverage = []
-
-    for kwargs in id_to_kwargs.values():
-        res = pd.read_pickle(kwargs.path_to_data)  # noqa: S301
-        res["ci_covers_true_param"] = (res["lo"] <= res["true"]) & (
-            res["hi"] >= res["true"]
-        )
-        res["ci_length"] = res["hi"] - res["lo"]
-        coverage.append(
-            (
-                kwargs.u_hi,
-                kwargs.n_obs,
-                kwargs.local_ates.complier,
-                kwargs.bootstrap_method,
-                get_func_as_string(kwargs.bootstrap_params["eps_fun"]),
-                get_func_as_string(kwargs.bootstrap_params["kappa_fun"]),
-                kwargs.constraint_mtr,
-                res["true"].mean(),
-                res["ci_covers_true_param"].mean(),
-                res["ci_length"].mean(),
-                res["lo"].mean(),
-                res["hi"].mean(),
-            ),
-        )
-
-    cols = [
-        "u_hi",
-        "n_obs",
-        "late_complier",
-        "bootstrap_method",
-        "eps_fun",
-        "kappa_fun",
-        "constraint_mtr",
-        "true",
-        "coverage",
-        "length",
-        "ci_lo",
-        "ci_hi",
-    ]
-
-    data = pd.DataFrame(coverage, columns=cols)
-
-    data.to_pickle(path_to_data)
diff --git a/src/thesis/simple_model/task_plot_simple_model_sims.py b/src/thesis/simple_model/task_plot_simple_model_sims.py
deleted file mode 100644
index 82cdc394..00000000
--- a/src/thesis/simple_model/task_plot_simple_model_sims.py
+++ /dev/null
@@ -1,248 +0,0 @@
-"""Task for plotting bootstrap simulation results."""
simulations.""" - -from pathlib import Path -from typing import Annotated - -import pandas as pd # type: ignore[import-untyped] -import plotly.graph_objects as go # type: ignore[import-untyped] -from pytask import Product - -from thesis.config import BLD -from thesis.simple_model.task_simple_model_sims import ( - EPS_FUNS_NUMERICAL_DELTA, -) -from thesis.utilities import get_func_as_string - -EPS_FUNS_STRINGS = [get_func_as_string(eps_fun) for eps_fun in EPS_FUNS_NUMERICAL_DELTA] -KAPPA_FUNS_STRINGS = [ - get_func_as_string(kappa_fun) for kappa_fun in EPS_FUNS_NUMERICAL_DELTA -] - - -# TODO(@buddejul): Put graphs below in a loop, currently there is a lot of copy/paste. -# TODO(@buddejul): Include true parameters in plots. -# TODO(@buddejul): Split tasks, function is too complex, see noqa below. -def task_plot_simple_model_sims( # noqa: C901, PLR0912 - path_to_data: Path = BLD / "simple_model" / "sim_results_combined.pkl", - path_to_plot_coverage: Annotated[Path, Product] = Path( - BLD / "simple_model" / "figures" / "coverage.png", - ), - path_to_plot_length: Annotated[Path, Product] = Path( - BLD / "simple_model" / "figures" / "length.png", - ), - path_to_plot_means_hi: Annotated[Path, Product] = Path( - BLD / "simple_model" / "figures" / "means_hi.png", - ), - path_to_plot_means_lo: Annotated[Path, Product] = Path( - BLD / "simple_model" / "figures" / "means_lo.png", - ), - path_to_plot_coverage_by_eps_fun: Annotated[Path, Product] = Path( - BLD / "simple_model" / "figures" / "coverage_eps_fun.png", - ), - path_to_plot_coverage_by_kappa_fun: Annotated[Path, Product] = Path( - BLD / "simple_model" / "figures" / "coverage_kappa_fun.png", - ), -) -> None: - """Plot the coverage probability of the confidence interval.""" - data = pd.read_pickle(path_to_data) # noqa: S301 - - # TODO(@buddejul): Do this properly and loop over mtr constraints. 
-    data = data[data.constraint_mtr == "increasing"]
-
-    data = data.sort_values("late_complier")
-
-    color_by_bootstrap_method = {
-        "standard": "blue",
-        "numerical_delta": "red",
-        "analytical_delta": "green",
-    }
-
-    color_by_eps_fun = {
-        "npow(-1div1)": "black",
-        "npow(-1div2)": "blue",
-        "npow(-1div3)": "red",
-        "npow(-1div4)": "green",
-        "npow(-1div5)": "orange",
-        "npow(-1div6)": "purple",
-    }
-
-    color_by_kappa_fun = {
-        "npow(1div2)": "blue",
-        "npow(1div3)": "red",
-        "npow(1div6)": "purple",
-        "np.log(n)pow(1div2)": "green",
-        "(2xnp.log(np.log(n)))pow(1div2)": "orange",
-    }
-
-    line_type_by_n_obs = {
-        250: "solid",
-        1_000: "dash",
-        10_000: "dot",
-    }
-
-    for col_to_plot in ["coverage", "length"]:
-        fig = go.Figure()
-        for n_obs in data.n_obs.unique():
-            for bootstrap_method in ["standard", "numerical_delta", "analytical_delta"]:
-                data_sub = data[
-                    (data.n_obs == n_obs) & (data.bootstrap_method == bootstrap_method)
-                ]
-                if bootstrap_method == "numerical_delta":
-                    data_sub = data_sub[data_sub["eps_fun"] == "npow(-1div2)"]
-                if bootstrap_method == "analytical_delta":
-                    data_sub = data_sub[data_sub["kappa_fun"] == "np.log(n)pow(1div2)"]
-
-                fig.add_trace(
-                    go.Scatter(
-                        x=data_sub.late_complier,
-                        y=data_sub[col_to_plot],
-                        name=f"n_obs={n_obs}",
-                        legendgroup=f"{bootstrap_method}",
-                        legendgrouptitle_text=(
-                            f"{bootstrap_method.replace('_', ' ').capitalize()} "
-                            "Bootstrap"
-                        ),
-                        line={
-                            "color": color_by_bootstrap_method[bootstrap_method],
-                            "dash": line_type_by_n_obs[int(n_obs)],
-                            "width": 2,
-                        },
-                    ),
-                )
-
-        fig.update_layout(
-            title=f"{col_to_plot.capitalize()} of CI for true parameter",
-            xaxis_title="LATE Complier",
-            yaxis_title=f"{col_to_plot.capitalize()}",
-        )
-
-        if col_to_plot == "coverage":
-            path_to_plot = path_to_plot_coverage
-        elif col_to_plot == "length":
-            path_to_plot = path_to_plot_length
-
-        fig.write_image(path_to_plot)
-
-    # ==================================================================================
-    # Plot Means
-    # ==================================================================================
-    params_to_stat = {
-        "ci_lo": {"title": "Means of CI Lower Bounds", "path": path_to_plot_means_lo},
-        "ci_hi": {"title": "Means of CI Upper Bounds", "path": path_to_plot_means_hi},
-    }
-
-    for stat in ["ci_hi", "ci_lo"]:
-        fig = go.Figure()
-        for n_obs in data.n_obs.unique():
-            for bootstrap_method in ["standard", "numerical_delta", "analytical_delta"]:
-                data_sub = data[
-                    (data.n_obs == n_obs) & (data.bootstrap_method == bootstrap_method)
-                ]
-                if bootstrap_method == "numerical_delta":
-                    data_sub = data_sub[data_sub["eps_fun"] == "npow(-1div2)"]
-                if bootstrap_method == "analytical_delta":
-                    data_sub = data_sub[data_sub["kappa_fun"] == "np.log(n)pow(1div2)"]
-                fig.add_trace(
-                    go.Scatter(
-                        x=data_sub.late_complier,
-                        y=data_sub[stat],
-                        name=f"n_obs={n_obs}",
-                        legendgroup=f"{bootstrap_method}",
-                        legendgrouptitle_text=(
-                            f"{bootstrap_method.replace('_', ' ').capitalize()} "
-                            "Bootstrap"
-                        ),
-                        line={
-                            "color": color_by_bootstrap_method[bootstrap_method],
-                            "dash": line_type_by_n_obs[int(n_obs)],
-                        },
-                    ),
-                )
-
-        fig.update_layout(
-            title=params_to_stat[stat]["title"],
-            xaxis_title="late_complier",
-            yaxis_title="Mean CI Bounds",
-        )
-
-        fig.write_image(params_to_stat[stat]["path"])
-
-    # ==================================================================================
-    # Plot coverage by eps_fun
-    # ==================================================================================
-    fig = go.Figure()
-
-    for eps_fun in data.eps_fun.unique():
-        eps_fun_to_print = (
-            eps_fun.replace("npow", "n^")
-            .replace("div", "/")
-            .replace("(", "")
-            .replace(")", "")
-        )
-
-        for n_obs in data.n_obs.unique():
-            data_sub = data[data.eps_fun == eps_fun]
-            data_sub = data_sub[data_sub.n_obs == n_obs]
-            data_sub = data_sub[data_sub["bootstrap_method"] == "numerical_delta"]
-            fig.add_trace(
-                go.Scatter(
-                    x=data_sub.late_complier,
-                    y=data_sub.coverage,
-                    name=f"n_obs={n_obs}",
-                    legendgroup=f"{eps_fun}",
-                    legendgrouptitle_text=(f"eps(n) = {eps_fun_to_print}"),
-                    line={
-                        "dash": line_type_by_n_obs[int(n_obs)],
-                        "width": 2,
-                        "color": color_by_eps_fun[eps_fun],
-                    },
-                ),
-            )
-
-    fig.update_layout(
-        title="Coverage by eps_fun",
-        xaxis_title="late_complier",
-        yaxis_title="Coverage",
-    )
-
-    fig.write_image(path_to_plot_coverage_by_eps_fun)
-
-    # ==================================================================================
-    # Plot coverage by kappa_fun
-    # ==================================================================================
-    fig = go.Figure()
-
-    for kappa_fun in data.kappa_fun.unique():
-        kappa_fun_to_print = (
-            kappa_fun.replace("npow", "n^")
-            .replace("div", "/")
-            .replace("(", "")
-            .replace(")", "")
-        )
-
-        for n_obs in data.n_obs.unique():
-            data_sub = data[data.kappa_fun == kappa_fun]
-            data_sub = data_sub[data_sub.n_obs == n_obs]
-            data_sub = data_sub[data_sub["bootstrap_method"] == "analytical_delta"]
-            fig.add_trace(
-                go.Scatter(
-                    x=data_sub.late_complier,
-                    y=data_sub.coverage,
-                    name=f"n_obs={n_obs}",
-                    legendgroup=f"{kappa_fun}",
-                    legendgrouptitle_text=(f"kappa(n) = {kappa_fun_to_print}"),
-                    line={
-                        "dash": line_type_by_n_obs[int(n_obs)],
-                        "width": 2,
-                        "color": color_by_kappa_fun[kappa_fun],
-                    },
-                ),
-            )
-
-    fig.update_layout(
-        title="Coverage by kappa_fun",
-        xaxis_title="late_complier",
-        yaxis_title="Coverage",
-    )
-
-    fig.write_image(path_to_plot_coverage_by_kappa_fun)
diff --git a/src/thesis/simple_model/task_plot_solution.py b/src/thesis/simple_model/task_plot_solution.py
deleted file mode 100644
index 458bedf7..00000000
--- a/src/thesis/simple_model/task_plot_solution.py
+++ /dev/null
@@ -1,104 +0,0 @@
-"""Plot the true solution for the simple model setting."""
-
-from pathlib import Path
-from typing import Annotated, NamedTuple
-
-import numpy as np
-import pandas as pd  # type: ignore[import-untyped]
-import plotly.graph_objects as go  # type: ignore[import-untyped]
-from pytask import Product, task
-
-from thesis.config import BLD
-from thesis.simple_model.funcs import _idset
-
-
-class _Arguments(NamedTuple):
-    constraint_mtr: str
-    path_to_plot: Annotated[Path, Product]
-    u_hi: float = 0.2
-    grid_points: int = 250
-    endpoints: tuple[float, float] = (-0.1, 0.1)
-    pscores: tuple[float, float] = (0.4, 0.6)
-
-
-constr_mtr_to_plot = ["none", "increasing"]
-
-ID_TO_KWARGS = {
-    f"plot_constraint_mtr_{constraint_mtr}": _Arguments(
-        constraint_mtr=constraint_mtr,
-        path_to_plot=Path(
-            BLD
-            / "simple_model"
-            / "solutions"
-            / f"solution_constraint_mtr_{constraint_mtr}.png",
-        ),
-    )
-    for constraint_mtr in constr_mtr_to_plot
-}
-
-for id_, kwargs in ID_TO_KWARGS.items():
-
-    @task(id=id_, kwargs=kwargs)  # type: ignore[arg-type]
-    def task_plot_solution(
-        u_hi: float,
-        grid_points: int,
-        endpoints: tuple[float, float],
-        pscores: tuple[float, float],
-        constraint_mtr: str,
-        path_to_plot: Annotated[Path, Product],
-    ) -> None:
-        """Plot the solution for the simple model."""
-        beta_grid = np.linspace(endpoints[0], endpoints[1], grid_points)
-
-        solutions = np.zeros((grid_points, 2))
-
-        for i, beta in enumerate(beta_grid):
-            solutions[i, :] = _idset(
-                b_late=beta,
-                u_hi=u_hi,
-                pscores_hat=pscores,
-                constraint_mtr=constraint_mtr,
-            )
-
-        data = pd.DataFrame(solutions, columns=["lower_bound", "upper_bound"])
-
-        fig = go.Figure()
-
-        fig.add_trace(
-            go.Scatter(
-                x=beta_grid,
-                y=data["lower_bound"],
-                mode="lines",
-                name="Lower Bound",
-            ),
-        )
-
-        fig.add_trace(
-            go.Scatter(
-                x=beta_grid,
-                y=data["upper_bound"],
-                mode="lines",
-                name="Upper Bound",
-            ),
-        )
-
-        fig.update_layout(
-            title=(
-                "Solution for the Simple Model: "
-                f"MTR Constraint = {constraint_mtr.capitalize()}"
-            ),
-            xaxis_title="Beta",
-            yaxis_title="Value",
-        )
-
-        # Add a note with all parameters.
-        fig.add_annotation(
-            text=f"u_hi = {u_hi}, pscores = {pscores}",
-            xref="paper",
-            yref="paper",
-            x=0.5,
-            y=-0.1,
-            showarrow=False,
-        )
-
-        fig.write_image(path_to_plot)
diff --git a/src/thesis/simple_model/task_simple_model_sims.py b/src/thesis/simple_model/task_simple_model_sims.py
deleted file mode 100644
index 8e8ce273..00000000
--- a/src/thesis/simple_model/task_simple_model_sims.py
+++ /dev/null
@@ -1,142 +0,0 @@
-"""Task for running bootstrap simulations."""
-
-from collections.abc import Callable
-from pathlib import Path
-from typing import Annotated, NamedTuple
-
-import numpy as np
-from pytask import Product, task
-
-from thesis.classes import Instrument, LocalATEs
-from thesis.config import BLD, RNG
-from thesis.simple_model.funcs import simulation_bootstrap
-
-
-class _Arguments(NamedTuple):
-    u_hi: float
-    pscore_low: float
-    n_obs: int
-    local_ates: LocalATEs
-    constraint_mtr: str
-    bootstrap_method: str
-    bootstrap_params: dict[str, Callable]
-    path_to_data: Path | None = None
-    pscore_hi: float = 0.6
-    alpha: float = 0.05
-    n_boot: int = 250
-    n_sims: int = 1_000
-    rng: np.random.Generator = RNG
-
-
-U_HI = [0.2]
-N_OBS = [1_000, 10_000]
-PSCORES_LOW = [0.4]
-CONSTRAINTS_MTR = ["increasing"]
-BOOTSTRAP_METHODS = ["standard", "numerical_delta", "analytical_delta"]
-LATES_COMPLIER = np.concatenate((np.linspace(-0.3, 0.3, num=14), np.zeros(1)))
-EPS_FUNS_NUMERICAL_DELTA = [
-    lambda n: n ** (-1 / 2),
-]
-KAPPA_FUNS_ANALYTICAL_DELTA = [
-    lambda n: np.log(n) ** (1 / 2),
-]
-
-
-# TODO(@buddejul): An alternative way to do this would be to simply give this a
-# non-meaningful name (e.g. simple_models_sims_`i`) and store all params in the dataset
-# or a separate dict (probably more efficient). We can then retrieve the params from the
-# dataset or dict after saving. We can then query the settings dict before merging.
-KWARGS = [
-    _Arguments(
-        u_hi=u_hi,
-        n_obs=n_obs,
-        pscore_low=pscore_low,
-        local_ates=LocalATEs(
-            always_taker=0,
-            complier=late_complier,
-            # The following ensures that the model is always correctly specified under
-            # increasing MTR functions: Whenever late_complier < 0, the largest possible
-            # never-taker ATE is 1 + late_complier < 1.
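-            # (When late_complier >= 0 the cap is simply 1, hence the min below.)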
- never_taker=np.min((1, 1 + late_complier)), - ), - constraint_mtr=constraint_mtr, - bootstrap_method=bootstrap_method, - bootstrap_params={"eps_fun": eps_fun, "kappa_fun": kappa_fun}, - ) - for u_hi in U_HI - for n_obs in N_OBS - for pscore_low in PSCORES_LOW - for late_complier in LATES_COMPLIER - for constraint_mtr in CONSTRAINTS_MTR - for bootstrap_method in BOOTSTRAP_METHODS - for eps_fun in EPS_FUNS_NUMERICAL_DELTA - for kappa_fun in KAPPA_FUNS_ANALYTICAL_DELTA - # For standard bootstrap, we only need to run the simulation once - if not ( - bootstrap_method == "standard" - and ( - eps_fun is not EPS_FUNS_NUMERICAL_DELTA[0] - or kappa_fun is not KAPPA_FUNS_ANALYTICAL_DELTA[0] - ) - ) - # For the analytical delta method, we only need to run the simulation once for each - # kappa_fun, but not for different eps_fun - if not ( - bootstrap_method == "analytical_delta" - and eps_fun is not EPS_FUNS_NUMERICAL_DELTA[0] - ) - # For the numerical delta method, we only need to run the simulation once for each - # eps_fun, but not for different kappa_fun - if not ( - bootstrap_method == "numerical_delta" - and kappa_fun is not KAPPA_FUNS_ANALYTICAL_DELTA[0] - ) -] - -ID_TO_KWARGS = {str(id_): kwargs for id_, kwargs in enumerate(KWARGS)} - -for id_, kwargs in ID_TO_KWARGS.items(): - ID_TO_KWARGS[id_] = kwargs._replace( - path_to_data=BLD / "simple_model" / kwargs.bootstrap_method / f"{id_}.pkl", - ) - -for id_, kwargs in ID_TO_KWARGS.items(): - - @task(id=id_, kwargs=kwargs) # type: ignore[arg-type] - def task_bootstrap_sims( - n_sims: int, - n_obs: int, - n_boot: int, - u_hi: float, - alpha: float, - pscore_low: float, - pscore_hi: float, - local_ates: LocalATEs, - constraint_mtr: str, - bootstrap_method: str, - bootstrap_params: dict[str, Callable], - rng: np.random.Generator, - path_to_data: Annotated[Path, Product], - ) -> None: - """Task for running bootstrap simulations.""" - instrument = Instrument( - support=np.array([0, 1]), - pmf=np.array([0.5, 0.5]), - pscores=np.array([pscore_low, pscore_hi]), - ) - - res = simulation_bootstrap( - n_sims=n_sims, - n_obs=n_obs, - n_boot=n_boot, - u_hi=u_hi, - local_ates=local_ates, - alpha=alpha, - instrument=instrument, - constraint_mtr=constraint_mtr, - rng=rng, - bootstrap_method=bootstrap_method, - bootstrap_params=bootstrap_params, - ) - - res.to_pickle(path_to_data) diff --git a/src/thesis/utilities.py b/src/thesis/utilities.py index c2b83a55..9fd8aa06 100644 --- a/src/thesis/utilities.py +++ b/src/thesis/utilities.py @@ -1,14 +1,9 @@ """Utilities used in various parts of the project.""" -import inspect -from collections.abc import Callable from pathlib import Path -import numpy as np import yaml # type: ignore[import-untyped] -from thesis.classes import Instrument, LocalATEs - def read_yaml(path): """Read a YAML file. 
@@ -30,45 +25,3 @@ def read_yaml(path):
         )
         raise ValueError(info) from error
     return out
-
-
-def get_func_as_string(func: Callable) -> str:
-    """Inspects a function and converts the first body line into a string."""
-    func_str = str(inspect.getsourcelines(func)[0])
-    func_str = func_str.split(":")[1].strip()
-    func_str = func_str.removesuffix(")\\n']")
-    func_str = func_str.removesuffix(",\\n']")
-    func_str = func_str.replace(" ", "")
-    func_str = func_str.replace("**", "pow")
-    func_str = func_str.replace("*", "x")
-    return func_str.replace("/", "div")
-
-
-def draw_data(
-    n_obs: int,
-    local_ates: LocalATEs,
-    instrument: Instrument,
-    rng: np.random.Generator,
-) -> np.ndarray:
-    """Draw data for given local ates and instrument."""
-    z = rng.choice(instrument.support, size=n_obs, p=instrument.pmf)
-
-    u = rng.uniform(low=0, high=1, size=n_obs)
-
-    d = np.where(z == 1, u <= instrument.pscores[1], u <= instrument.pscores[0])
-
-    # Treatment choice is D = 1{p(z) >= u}: low-u units take treatment under both
-    # instrument values (always-takers), high-u units under neither (never-takers).
-    _always_takers = u <= instrument.pscores[0]
-    _compliers = (u > instrument.pscores[0]) & (u <= instrument.pscores[1])
-    _never_takers = u > instrument.pscores[1]
-
-    y0 = np.zeros(n_obs)
-
-    y1 = (
-        _never_takers * local_ates.never_taker
-        + _compliers * local_ates.complier
-        + _always_takers * local_ates.always_taker
-    )
-
-    y = d * y1 + (1 - d) * y0 + rng.normal(scale=0.1, size=n_obs)
-
-    return np.column_stack((y, d, z, u))
diff --git a/tests/analysis/__init__.py b/tests/analysis/__init__.py
new file mode 100644
index 00000000..5f03a7a4
--- /dev/null
+++ b/tests/analysis/__init__.py
@@ -0,0 +1 @@
+"""Tests for the analysis module."""
diff --git a/tests/bootstrap/__init__.py b/tests/bootstrap/__init__.py
deleted file mode 100644
index 0ec0c327..00000000
--- a/tests/bootstrap/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Tests for bootstrap procedures."""
diff --git a/tests/bootstrap/test_bootstrap_off_kink.py b/tests/bootstrap/test_bootstrap_off_kink.py
deleted file mode 100644
index 8d37a54a..00000000
--- a/tests/bootstrap/test_bootstrap_off_kink.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""Test bootstrap confidence intervals have right coverage far off the kink."""
-
-import numpy as np
-import pytest
-from thesis.classes import Instrument, LocalATEs
-from thesis.config import RNG
-from thesis.simple_model.funcs import simulation_bootstrap
-
-
-@pytest.fixture()
-def setup():
-    complier_late = -0.5
-
-    local_ates = LocalATEs(
-        always_taker=0,
-        complier=complier_late,
-        never_taker=np.min((1, 1 + complier_late)),
-    )
-
-    instrument = Instrument(
-        support=np.array([0, 1]),
-        pmf=np.array([0.5, 0.5]),
-        pscores=np.array([0.4, 0.6]),
-    )
-
-    return local_ates, instrument, complier_late
-
-
-def _bic(n):
-    """BIC-type rate: sqrt(log(n))."""
-    return np.sqrt(np.log(n))
-
-
-@pytest.mark.parametrize("method", ["numerical_delta"])
-def test_bootstrap_coverage(setup, method):
-    local_ates, instrument, complier_late = setup
-
-    expected = 0.95
-
-    n_obs = 10_000
-    n_boot = 500
-    n_sims = 100
-
-    bootstrap_params = {
-        "eps_fun": np.sqrt,
-        "kappa_fun": _bic,
-    }
-
-    res = simulation_bootstrap(
-        n_sims=n_sims,
-        n_obs=n_obs,
-        n_boot=n_boot,
-        u_hi=0.2,
-        local_ates=local_ates,
-        instrument=instrument,
-        alpha=0.05,
-        constraint_mtr="increasing",
-        bootstrap_method=method,
-        rng=RNG,
-        bootstrap_params=bootstrap_params,
-    )
-
-    res["covers"] = (res["lo"] <= res["true"]) & (res["hi"] >= res["true"])
-    res["covers_hi"] = res["hi"] >= res["true"]
-    res["covers_lo"] = res["lo"] <= res["true"]
-
-    # Calculate critical values of the CI.
-    res["c_hi"] = res["hi"] - res["beta_hi"]
-    res["c_lo"] = res["lo"] - res["beta_lo"]
res["beta_lo"] - - # Assert they are always >= 0 - assert np.all(res["c_hi"] >= 0) - assert np.all(res["c_lo"] <= 0) - - # Coverage should be determined by upper bound - assert res["covers_lo"].mean() == 1 - assert np.all(res["covers_hi"] == res["covers"]) - - actual = res.mean()["covers"] - - assert actual == pytest.approx(expected, abs=0.051) diff --git a/tests/bootstrap_fail/__init__.py b/tests/bootstrap_fail/__init__.py new file mode 100644 index 00000000..e3fe255c --- /dev/null +++ b/tests/bootstrap_fail/__init__.py @@ -0,0 +1 @@ +"""Tests for the bootstrap_fail functions.""" diff --git a/tests/bootstrap_fail/test_funcs.py b/tests/bootstrap_fail/test_funcs.py new file mode 100644 index 00000000..decff75f --- /dev/null +++ b/tests/bootstrap_fail/test_funcs.py @@ -0,0 +1,56 @@ +"""Test functions for bootstrap_fail.""" + +import numpy as np +import pandas as pd # type: ignore[import-untyped] +import pytest +from thesis.bootstrap_fail.funcs import _draw_data, _estimate_pscores +from thesis.classes import Instrument +from thesis.config import RNG + + +@pytest.fixture() +def instrument() -> Instrument: + return Instrument( + support=np.array([0, 1]), + pmf=np.array([0.5, 0.5]), + pscores=np.array([0.4, 0.6]), + ) + + +def test_data_moments_boundary(instrument) -> None: + n_obs = 100_000 + + expected = pd.DataFrame( + { + "y_given_z": [0.0, 0.0], + "d_given_z": instrument.pscores, + }, + index=[0, 1], + ) + + data = pd.DataFrame( + _draw_data(n_obs, RNG, instrument=instrument, param_pos="boundary"), + columns=["y", "d", "z", "u"], + ) + + actual = pd.DataFrame( + { + "y_given_z": data.groupby("z")["y"].mean(), + "d_given_z": data.groupby("z")["d"].mean(), + }, + index=[0, 1], + ) + + pd.testing.assert_frame_equal(actual, expected, atol=0.01) + + +def test_compute_pscores(instrument) -> None: + n_obs = 1_000_000 + + data = _draw_data(n_obs, RNG, instrument=instrument, param_pos="boundary") + + expected = (0.4, 0.6) + + actual = _estimate_pscores(data) + + assert expected == pytest.approx(actual, abs=3 / np.sqrt(n_obs)) diff --git a/tests/imbens_manski/test_im_funcs.py b/tests/imbens_manski/test_funcs.py similarity index 100% rename from tests/imbens_manski/test_im_funcs.py rename to tests/imbens_manski/test_funcs.py diff --git a/tests/simple_model/__init__.py b/tests/simple_model/__init__.py deleted file mode 100644 index 461f7e2a..00000000 --- a/tests/simple_model/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the simple_model functions.""" diff --git a/tests/simple_model/test_debug_sm_cis.py b/tests/simple_model/test_debug_sm_cis.py deleted file mode 100644 index abfd78d2..00000000 --- a/tests/simple_model/test_debug_sm_cis.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Tests for debugging simple model CIs.""" - -import numpy as np -from thesis.classes import Instrument, LocalATEs -from thesis.config import RNG -from thesis.simple_model.funcs import simulation_bootstrap - - -def test_debug_sm(): - n_sims = 2 - n_obs = 10_000 - n_boot = 2_000 - u_hi = 0.2 - alpha = 0.05 - constraint_mtr = "none" - bootstrap_method = "standard" - - instrument = Instrument( - support=np.array([0, 1]), - pmf=np.array([0.5, 0.5]), - pscores=np.array([0.4, 0.6]), - ) - - local_ates = LocalATEs( - always_taker=0, - complier=-0.1, - never_taker=1, - ) - - simulation_bootstrap( - n_sims=n_sims, - n_obs=n_obs, - n_boot=n_boot, - u_hi=u_hi, - alpha=alpha, - rng=RNG, - constraint_mtr=constraint_mtr, - bootstrap_method=bootstrap_method, - instrument=instrument, - local_ates=local_ates, - ) diff --git 
a/tests/simple_model/test_funcs.py b/tests/simple_model/test_funcs.py deleted file mode 100644 index a2ded3f9..00000000 --- a/tests/simple_model/test_funcs.py +++ /dev/null @@ -1,284 +0,0 @@ -"""Test functions for simple_model.""" - -from functools import partial - -import numpy as np -import pytest -from thesis.classes import Instrument, LocalATEs -from thesis.config import RNG -from thesis.simple_model.funcs import ( - _estimate_pscores, - _idset, - _late, - _late_2sls, - _true_late, - simulation_bootstrap, -) -from thesis.utilities import draw_data - - -@pytest.fixture() -def instrument() -> Instrument: - return Instrument( - support=np.array([0, 1]), - pmf=np.array([0.5, 0.5]), - pscores=np.array([0.4, 0.6]), - ) - - -@pytest.fixture() -def local_ates_zero() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=0, - never_taker=0, - ) - - -@pytest.fixture() -def local_ates_nonzero() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=0.5, - never_taker=1, - ) - - -# Setting where true parameter is at upper boundary of ID set and at kink of solution -@pytest.fixture() -def local_ates_boundary_hi() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=0, - never_taker=1, - ) - - -# Setting where true parameter is at lower boundary of ID set and at kink of solution -@pytest.fixture() -def local_ates_boundary_lo() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=0, - never_taker=-1, - ) - - -@pytest.fixture() -def local_ates_complier_negative() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=-0.5, - never_taker=1, - ) - - -@pytest.fixture() -def sim_boot(): - return partial( - simulation_bootstrap, - n_sims=2, - n_obs=10_000, - n_boot=2_000, - u_hi=0.1, - alpha=0.05, - rng=RNG, - ) - - -def test_compute_pscores(instrument, local_ates_nonzero) -> None: - n_obs = 1_000_000 - - data = draw_data( - n_obs, - rng=RNG, - instrument=instrument, - local_ates=local_ates_nonzero, - ) - - expected = (0.4, 0.6) - - actual = _estimate_pscores(data) - - assert expected == pytest.approx(actual, abs=3 / np.sqrt(n_obs)) - - -def test_simulation_runs(local_ates_boundary_hi, instrument, sim_boot) -> None: - for boot_met in ["analytical_delta", "standard", "numerical_delta"]: - for const_mtr in ["increasing", "none"]: - if boot_met == "numerical_delta": - bootstrap_params = {"eps_fun": lambda n: n ** (-1 / 3)} - elif boot_met == "analytical_delta": - bootstrap_params = {"kappa_fun": lambda n: n ** (1 / 3)} - else: - bootstrap_params = {} - - sim_boot( - local_ates=local_ates_boundary_hi, - instrument=instrument, - constraint_mtr=const_mtr, - bootstrap_method=boot_met, - bootstrap_params=bootstrap_params, - ) - - -def test_check_invalid_constraint_mtr(instrument, local_ates_nonzero, sim_boot) -> None: - with pytest.raises(ValueError, match="Constraint 'invalid' not supported."): - sim_boot( - local_ates=local_ates_nonzero, - instrument=instrument, - constraint_mtr="invalid", - bootstrap_method="standard", - ) - - -def test_check_invalid_bootstrap_method( - instrument, - local_ates_nonzero, - sim_boot, -) -> None: - with pytest.raises(ValueError, match="Bootstrap method 'invalid' not supported."): - sim_boot( - local_ates=local_ates_nonzero, - instrument=instrument, - constraint_mtr="increasing", - bootstrap_method="invalid", - ) - - -def test_true_late(): - instrument = Instrument( - support=np.array([0, 1]), - pmf=np.array([0.5, 0.5]), - pscores=np.array([0.4, 0.6]), - ) - - u_hi = 0.2 - - complier_lates = np.linspace(-0.1, 0.1, num=10) - - # With u_hi 
= 0.2, the weights are 0.5 (both relevant populations have mass 0.2).
-    expected = 0.5 * complier_lates + 0.5 * 1
-
-    local_ates = [
-        LocalATEs(
-            always_taker=0,
-            complier=cp_late,
-            never_taker=1,
-        )
-        for cp_late in complier_lates
-    ]
-
-    actual = [
-        _true_late(instrument=instrument, local_ates=la, u_hi=u_hi) for la in local_ates
-    ]
-
-    np.testing.assert_allclose(actual, expected, atol=1e-10)
-
-
-def test_id_set_consistent() -> None:
-    instrument = Instrument(
-        support=np.array([0, 1]),
-        pmf=np.array([0.5, 0.5]),
-        pscores=np.array([0.4, 0.6]),
-    )
-
-    u_hi = 0.2
-
-    local_ates = LocalATEs(
-        always_taker=0,
-        complier=-0.1,
-        never_taker=1,
-    )
-
-    constraint_mtr = "none"
-
-    n_obs = 10_000
-    n_sims = 1_000
-
-    res = np.zeros((n_sims, 2))
-
-    for i in range(n_sims):
-        data = draw_data(
-            n_obs=n_obs,
-            local_ates=local_ates,
-            instrument=instrument,
-            rng=RNG,
-        )
-
-        res[i] = _idset(
-            b_late=_late(data),
-            u_hi=u_hi,
-            pscores_hat=_estimate_pscores(data),
-            constraint_mtr=constraint_mtr,
-        )
-
-    # Take means over columns; with w = 0.5 and late = -0.1 the upper bound is
-    # 0.5 * (-0.1) + (1 - 0.5) = 0.45.
-    actual = res.mean(axis=0)
-
-    mean_hi = actual[1]
-
-    expected_hi = 0.45
-    assert mean_hi == pytest.approx(expected_hi, abs=3 / np.sqrt(n_obs))
-
-
-def test_inconsistent_complier_and_never_taker_ate(
-    sim_boot,
-    local_ates_complier_negative,
-    instrument,
-) -> None:
-    # In settings with no constraint on the MTR functions it is possible to have a
-    # negative complier ATE and a never-taker ATE of 1.
-    sim_boot(
-        local_ates=local_ates_complier_negative,
-        instrument=instrument,
-        constraint_mtr="none",
-        bootstrap_method="standard",
-    )
-
-    # With increasing MTR functions this is not possible, as the largest possible value
-    # given a negative complier ATE is 1 + complier ATE < 1.
-
-    with pytest.raises(ValueError, match="largest possible never-taker ATE"):
-        sim_boot(
-            local_ates=local_ates_complier_negative,
-            instrument=instrument,
-            constraint_mtr="increasing",
-            bootstrap_method="standard",
-        )
-
-
-def test_bootstrap_params_supplied(sim_boot, local_ates_nonzero, instrument) -> None:
-    with pytest.raises(ValueError, match="Numerical delta bootstrap method requires"):
-        sim_boot(
-            local_ates=local_ates_nonzero,
-            instrument=instrument,
-            constraint_mtr="increasing",
-            bootstrap_method="numerical_delta",
-            bootstrap_params={},
-        )
-    with pytest.raises(ValueError, match="Analytical delta bootstrap method requires"):
-        sim_boot(
-            local_ates=local_ates_nonzero,
-            instrument=instrument,
-            constraint_mtr="increasing",
-            bootstrap_method="analytical_delta",
-            bootstrap_params={},
-        )
-
-
-def test_late_and_late_2sls_equivalent(local_ates_nonzero, instrument) -> None:
-    data = draw_data(
-        n_obs=10_000,
-        local_ates=local_ates_nonzero,
-        instrument=instrument,
-        rng=RNG,
-    )
-
-    late = _late(data)
-
-    late_2sls, _ = _late_2sls(data)
-
-    assert late == pytest.approx(late_2sls)
diff --git a/tests/utilities/__init__.py b/tests/utilities/__init__.py
deleted file mode 100644
index 874535a0..00000000
--- a/tests/utilities/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Tests for utilities."""
diff --git a/tests/utilities/test_utilities.py b/tests/utilities/test_utilities.py
deleted file mode 100644
index 845dbe28..00000000
--- a/tests/utilities/test_utilities.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""Tests for the utilities module."""
-
-import numpy as np
-import pandas as pd  # type: ignore[import-untyped]
-import pytest
-from thesis.classes import Instrument, LocalATEs
-from thesis.config import RNG
-from thesis.simple_model.funcs import _late,
_true_late -from thesis.utilities import draw_data - - -@pytest.fixture() -def instrument() -> Instrument: - return Instrument( - support=np.array([0, 1]), - pmf=np.array([0.5, 0.5]), - pscores=np.array([0.4, 0.6]), - ) - - -@pytest.fixture() -def local_ates_zero() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=0, - never_taker=0, - ) - - -@pytest.fixture() -def local_ates_nonzero() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=0.5, - never_taker=1, - ) - - -# Setting where true parameter is at upper boundary of ID set and at kink of solution -@pytest.fixture() -def local_ates_boundary_hi() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=0, - never_taker=1, - ) - - -# Setting where true parameter is at lower boundary of ID set and at kink of solution -@pytest.fixture() -def local_ates_boundary_lo() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=0, - never_taker=-1, - ) - - -@pytest.fixture() -def local_ates_complier_negative() -> LocalATEs: - return LocalATEs( - always_taker=0, - complier=-0.5, - never_taker=1, - ) - - -def test_data_moments_boundary(instrument, local_ates_zero) -> None: - n_obs = 100_000 - - expected = pd.DataFrame( - { - "y_given_z": [0.0, 0.0], - "d_given_z": instrument.pscores, - }, - index=[0, 1], - ) - - data = pd.DataFrame( - draw_data(n_obs, rng=RNG, instrument=instrument, local_ates=local_ates_zero), - columns=["y", "d", "z", "u"], - ) - - actual = pd.DataFrame( - { - "y_given_z": data.groupby("z")["y"].mean(), - "d_given_z": data.groupby("z")["d"].mean(), - }, - index=[0, 1], - ) - - pd.testing.assert_frame_equal(actual, expected, atol=0.01) - - -def test_generate_late(instrument): - n_obs = 1_000_000 - - complier_lates = [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1] - - actual = np.zeros(len(complier_lates)) - expected = np.zeros(len(complier_lates)) - - for i, complier_late in enumerate(complier_lates): - local_ates = LocalATEs( - always_taker=0, - complier=complier_late, - never_taker=np.min((1, 1 + complier_late)), - ) - data = draw_data( - n_obs=n_obs, - local_ates=local_ates, - instrument=instrument, - rng=RNG, - ) - - actual[i] = _late(data) - expected[i] = _true_late(u_hi=0, instrument=instrument, local_ates=local_ates) - - assert actual == pytest.approx(expected, abs=20 / np.sqrt(n_obs))
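Aside on the methods exercised by the deleted simulations above: the standard bootstrap is inconsistent when phi is only directionally differentiable (Fang and Santos 2019), while the numerical delta method of Hong and Li (2018) remains valid. Below is a minimal, self-contained sketch of both constructions, using the kinked map phi(theta) = max(theta, 0) as a stand-in for the kinked identified-set bounds in _idset. All names, sample sizes, and tuning choices here are illustrative assumptions, not part of the deleted code; only the CI mechanics mirror _ci_standard_bootstrap and _ci_numerical_delta_bootstrap.

import numpy as np

rng = np.random.default_rng(seed=0)


def phi(theta):
    """Kinked map: only directionally differentiable at zero."""
    return np.maximum(theta, 0.0)


n_obs, n_boot, n_sims, alpha = 500, 300, 200, 0.05
theta_true = 0.0  # Place the parameter at the kink, the problematic case.
rn = np.sqrt(n_obs)
eps = n_obs ** (-1 / 6)  # eps -> 0 while rn * eps -> infty, as in the deleted default.

covers_standard = np.zeros(n_sims, dtype=bool)
covers_delta = np.zeros(n_sims, dtype=bool)

for s in range(n_sims):
    x = rng.normal(loc=theta_true, scale=1.0, size=n_obs)
    theta_hat = x.mean()

    boot_means = np.array(
        [rng.choice(x, size=n_obs, replace=True).mean() for _ in range(n_boot)]
    )

    # Standard bootstrap: percentile CI based on phi of the bootstrap estimates.
    lo_s, hi_s = np.quantile(phi(boot_means), [alpha / 2, 1 - alpha / 2])
    covers_standard[s] = lo_s <= phi(theta_true) <= hi_s

    # Numerical delta method: bootstrap the directional derivative via
    # (phi(theta_hat + eps * z_s) - phi(theta_hat)) / eps, z_s = rn * (boot - hat).
    z_s = rn * (boot_means - theta_hat)
    d_hat = (phi(theta_hat + eps * z_s) - phi(theta_hat)) / eps
    lo_d = phi(theta_hat) - np.quantile(d_hat, 1 - alpha / 2) / rn
    hi_d = phi(theta_hat) - np.quantile(d_hat, alpha / 2) / rn
    covers_delta[s] = lo_d <= phi(theta_true) <= hi_d

# At the kink the standard bootstrap does not consistently estimate the law of
# rn * (phi(theta_hat) - phi(theta)); compare both coverages to the nominal level.
print("standard:", covers_standard.mean(), "numerical delta:", covers_delta.mean())

The same two-step structure (draw z_s, difference the plug-in map at an eps-perturbed point) is what the deleted _ci_numerical_delta_bootstrap applies to both endpoints of the estimated identified set.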