Skip to content

Commit

Permalink
Bootstrap weights (#485)
Browse files Browse the repository at this point in the history
  • Loading branch information
alanlujan91 authored Nov 20, 2024
1 parent 84be118 commit 15ac4c2
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 9 deletions.
5 changes: 4 additions & 1 deletion src/estimagic/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def bootstrap(
existing_result=None,
outcome_kwargs=None,
n_draws=1_000,
weight_by=None,
cluster_by=None,
seed=None,
n_cores=1,
Expand All @@ -41,6 +42,7 @@ def bootstrap(
n_draws (int): Number of bootstrap samples to draw.
If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
is used.
weight_by (str): Column name of variable with weights or None.
cluster_by (str): Column name of variable to cluster by or None.
seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
numpy.random.default_rng is used seeded with seed. If seed is already a
Expand All @@ -59,7 +61,7 @@ def bootstrap(
"""
if callable(outcome):
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)

if outcome_kwargs is not None:
outcome = functools.partial(outcome, **outcome_kwargs)
Expand All @@ -82,6 +84,7 @@ def bootstrap(
new_outcomes = get_bootstrap_outcomes(
data=data,
outcome=outcome,
weight_by=weight_by,
cluster_by=cluster_by,
rng=rng,
n_draws=n_draws - n_existing,
Expand Down
12 changes: 11 additions & 1 deletion src/estimagic/bootstrap_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@


def check_inputs(
data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False
data=None,
weight_by=None,
cluster_by=None,
ci_method="percentile",
ci_level=0.95,
skipdata=False,
):
"""Check validity of inputs.
Args:
data (pd.DataFrame): Dataset.
weight_by (str): Column name of variable with weights.
cluster_by (str): Column name of variable to cluster by.
ci_method (str): Method of choice for computing confidence intervals.
The default is "percentile".
Expand All @@ -21,6 +27,10 @@ def check_inputs(
if not skipdata:
if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series):
raise TypeError("Data must be a pandas.DataFrame or pandas.Series.")
elif (weight_by is not None) and (weight_by not in data.columns.tolist()):
raise ValueError(
"Input 'weight_by' must be None or a column name of 'data'."
)
elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()):
raise ValueError(
"Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
5 changes: 4 additions & 1 deletion src/estimagic/bootstrap_outcomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
def get_bootstrap_outcomes(
data,
outcome,
weight_by=None,
cluster_by=None,
rng=None,
n_draws=1000,
Expand All @@ -19,6 +20,7 @@ def get_bootstrap_outcomes(
data (pandas.DataFrame): original dataset.
outcome (callable): function of the dataset calculating statistic of interest.
Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
weight_by (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
rng (numpy.random.Generator): A random number generator.
n_draws (int): number of bootstrap draws.
Expand All @@ -34,12 +36,13 @@ def get_bootstrap_outcomes(
estimates (list): List of pytrees of estimated bootstrap outcomes.
"""
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)
batch_evaluator = process_batch_evaluator(batch_evaluator)

indices = get_bootstrap_indices(
data=data,
rng=rng,
weight_by=weight_by,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
54 changes: 50 additions & 4 deletions src/estimagic/bootstrap_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
import pandas as pd


def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_indices(
data,
rng,
weight_by=None,
cluster_by=None,
n_draws=1000,
):
"""Draw positional indices for the construction of bootstrap samples.
Storing the positional indices instead of the full bootstrap samples saves a lot
Expand All @@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weight_by (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.
Expand All @@ -19,12 +26,16 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
"""
n_obs = len(data)
probs = _calculate_bootstrap_indices_weights(data, weight_by, cluster_by)

if cluster_by is None:
bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
bootstrap_indices = list(
rng.choice(n_obs, size=(n_draws, n_obs), replace=True, p=probs)
)
else:
clusters = data[cluster_by].unique()
drawn_clusters = rng.choice(
clusters, size=(n_draws, len(clusters)), replace=True
clusters, size=(n_draws, len(clusters)), replace=True, p=probs
)

bootstrap_indices = _convert_cluster_ids_to_indices(
Expand All @@ -34,6 +45,33 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
return bootstrap_indices


def _calculate_bootstrap_indices_weights(data, weight_by, cluster_by):
"""Calculate weights for drawing bootstrap indices.
If weights_by is not None and cluster_by is None, the weights are normalized to sum
to one. If weights_by and cluster_by are both not None, the weights are normalized
to sum to one within each cluster.
Args:
data (pandas.DataFrame): original dataset.
weight_by (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
Returns:
list: None or pd.Series of weights.
"""
if weight_by is None:
probs = None
else:
if cluster_by is None:
probs = data[weight_by] / data[weight_by].sum()
else:
cluster_weights = data.groupby(cluster_by, sort=False)[weight_by].sum()
probs = cluster_weights / cluster_weights.sum()
return probs


def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
"""Convert the drawn clusters to positional indices of individual observations.
Expand All @@ -48,7 +86,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
return bootstrap_indices


def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_samples(
data,
rng,
weight_by=None,
cluster_by=None,
n_draws=1000,
):
"""Draw bootstrap samples.
If you have memory issues you should use get_bootstrap_indices instead and construct
Expand All @@ -57,6 +101,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weight_by (str): weights for the observations.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.
Expand All @@ -67,6 +112,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
indices = get_bootstrap_indices(
data=data,
rng=rng,
weight_by=weight_by,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
5 changes: 3 additions & 2 deletions src/estimagic/msm_weighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def get_moments_cov(
moment_kwargs (dict): Additional keyword arguments for calculate_moments.
bootstrap_kwargs (dict): Additional keyword arguments that govern the
bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores",
"batch_evaluator", "cluster_by" and "error_handling". For details see the
bootstrap function.
"batch_evaluator", "weight_by", "cluster_by" and "error_handling".
For details see the bootstrap function.
Returns:
pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment
Expand All @@ -39,6 +39,7 @@ def get_moments_cov(
"n_draws",
"seed",
"batch_evaluator",
"weight_by",
"cluster_by",
"error_handling",
"existing_result",
Expand Down
25 changes: 25 additions & 0 deletions tests/estimagic/test_bootstrap_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
from pybaum import tree_just_flatten

from estimagic.bootstrap_ci import calculate_ci, check_inputs
from estimagic.bootstrap_samples import get_bootstrap_indices
from optimagic.parameters.tree_registry import get_registry
from optimagic.utilities import get_rng


def aaae(obj1, obj2, decimal=6):
Expand Down Expand Up @@ -88,6 +90,29 @@ def test_check_inputs_data():
assert str(error.value) == expected_msg


def test_check_inputs_weight_by(setup):
expected_error_msg = "Input 'weight_by' must be None or a column name of 'data'."
with pytest.raises(ValueError, match=expected_error_msg):
check_inputs(data=setup["df"], weight_by="this is not a column name of df")


def test_get_bootstrap_indices_heterogeneous_weights():
data = pd.DataFrame(
{"id": [0, 1], "w_homogenous": [0.5, 0.5], "w_heterogenous": [0.1, 0.9]}
)

res_homogenous = get_bootstrap_indices(
data, weight_by="w_homogenous", n_draws=1_000, rng=get_rng(seed=0)
)
res_heterogenous = get_bootstrap_indices(
data, weight_by="w_heterogenous", n_draws=1_000, rng=get_rng(seed=0)
)

# Given the weights, the first sample mean should be close to 0.5,
# while the second one should be close to 0.9
assert np.mean(res_homogenous) < 0.75 < np.mean(res_heterogenous)


def test_check_inputs_cluster_by(setup):
cluster_by = "this is not a column name of df"
expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
84 changes: 84 additions & 0 deletions tests/estimagic/test_bootstrap_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import pytest
from numpy.testing import assert_array_equal as aae
from pandas.testing import assert_frame_equal as afe
from pandas.testing import assert_series_equal as ase

from estimagic.bootstrap_samples import (
_calculate_bootstrap_indices_weights,
_convert_cluster_ids_to_indices,
_get_bootstrap_samples_from_indices,
get_bootstrap_indices,
Expand All @@ -18,6 +20,7 @@ def data():
df = pd.DataFrame()
df["id"] = np.arange(900)
df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100
df["weights"] = np.ones(900)
return df


Expand All @@ -33,6 +36,37 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data):
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(data, weight_by="weights", n_draws=2, rng=rng)
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(
data, weight_by="weights", cluster_by="hh", n_draws=2, rng=rng
)
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_and_without_weights(data):
rng1 = get_rng(seed=12345)
rng2 = get_rng(seed=12345)
res1 = get_bootstrap_indices(data, n_draws=1, rng=rng1)
res2 = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng2)
assert not np.array_equal(res1, res2)


def test_get_boostrap_indices_randomization_works_with_extreme_case(data):
rng = get_rng(seed=12345)
weights = np.zeros(900)
weights[0] = 1.0
data["weights"] = weights
res = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng)
assert len(np.unique(res)) == 1


def test_clustering_leaves_households_intact(data):
rng = get_rng(seed=12345)
indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0]
Expand Down Expand Up @@ -63,3 +97,53 @@ def test_get_bootstrap_samples_from_indices():
def test_get_bootstrap_samples_runs(data):
rng = get_rng(seed=12345)
get_bootstrap_samples(data, n_draws=2, rng=rng)


@pytest.fixture
def sample_data():
return pd.DataFrame({"weight": [1, 2, 3, 4], "cluster": ["A", "A", "B", "B"]})


def test_no_weights_no_clusters(sample_data):
result = _calculate_bootstrap_indices_weights(sample_data, None, None)
assert result is None


def test_weights_no_clusters(sample_data):
result = _calculate_bootstrap_indices_weights(sample_data, "weight", None)
expected = pd.Series([0.1, 0.2, 0.3, 0.4], index=sample_data.index, name="weight")
pd.testing.assert_series_equal(result, expected)


def test_weights_and_clusters(sample_data):
result = _calculate_bootstrap_indices_weights(sample_data, "weight", "cluster")
expected = pd.Series(
[0.3, 0.7], index=pd.Index(["A", "B"], name="cluster"), name="weight"
)
ase(result, expected)


def test_invalid_weight_column():
data = pd.DataFrame({"x": [1, 2, 3]})
with pytest.raises(KeyError):
_calculate_bootstrap_indices_weights(data, "weight", None)


def test_invalid_cluster_column(sample_data):
with pytest.raises(KeyError):
_calculate_bootstrap_indices_weights(sample_data, "weight", "invalid_cluster")


def test_empty_dataframe():
empty_df = pd.DataFrame()
result = _calculate_bootstrap_indices_weights(empty_df, None, None)
assert result is None


def test_some_zero_weights_with_clusters():
data = pd.DataFrame({"weight": [0, 1, 0, 2], "cluster": ["A", "A", "B", "B"]})
result = _calculate_bootstrap_indices_weights(data, "weight", "cluster")
expected = pd.Series(
[1 / 3, 2 / 3], index=pd.Index(["A", "B"], name="cluster"), name="weight"
)
ase(result, expected)

0 comments on commit 15ac4c2

Please sign in to comment.