diff --git a/src/estimagic/bootstrap.py b/src/estimagic/bootstrap.py index 179b4e5cc..76d75c4fb 100644 --- a/src/estimagic/bootstrap.py +++ b/src/estimagic/bootstrap.py @@ -24,6 +24,7 @@ def bootstrap( existing_result=None, outcome_kwargs=None, n_draws=1_000, + weight_by=None, cluster_by=None, seed=None, n_cores=1, @@ -41,6 +42,7 @@ def bootstrap( n_draws (int): Number of bootstrap samples to draw. If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes is used. + weight_by (str): Column name of variable with weights or None. cluster_by (str): Column name of variable to cluster by or None. seed (Union[None, int, numpy.random.Generator]): If seed is None or int the numpy.random.default_rng is used seeded with seed. If seed is already a @@ -59,7 +61,7 @@ def bootstrap( """ if callable(outcome): - check_inputs(data=data, cluster_by=cluster_by) + check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by) if outcome_kwargs is not None: outcome = functools.partial(outcome, **outcome_kwargs) @@ -82,6 +84,7 @@ def bootstrap( new_outcomes = get_bootstrap_outcomes( data=data, outcome=outcome, + weight_by=weight_by, cluster_by=cluster_by, rng=rng, n_draws=n_draws - n_existing, diff --git a/src/estimagic/bootstrap_helpers.py b/src/estimagic/bootstrap_helpers.py index 4c619a2dc..7e72bac82 100644 --- a/src/estimagic/bootstrap_helpers.py +++ b/src/estimagic/bootstrap_helpers.py @@ -2,12 +2,18 @@ def check_inputs( - data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False + data=None, + weight_by=None, + cluster_by=None, + ci_method="percentile", + ci_level=0.95, + skipdata=False, ): """Check validity of inputs. Args: data (pd.DataFrame): Dataset. + weight_by (str): Column name of variable with weights. cluster_by (str): Column name of variable to cluster by. ci_method (str): Method of choice for computing confidence intervals. The default is "percentile". @@ -21,6 +27,10 @@ def check_inputs( if not skipdata: if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series): raise TypeError("Data must be a pandas.DataFrame or pandas.Series.") + elif (weight_by is not None) and (weight_by not in data.columns.tolist()): + raise ValueError( + "Input 'weight_by' must be None or a column name of 'data'." + ) elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()): raise ValueError( "Input 'cluster_by' must be None or a column name of 'data'." diff --git a/src/estimagic/bootstrap_outcomes.py b/src/estimagic/bootstrap_outcomes.py index 5ba5e6bea..0ddeb6644 100644 --- a/src/estimagic/bootstrap_outcomes.py +++ b/src/estimagic/bootstrap_outcomes.py @@ -6,6 +6,7 @@ def get_bootstrap_outcomes( data, outcome, + weight_by=None, cluster_by=None, rng=None, n_draws=1000, @@ -19,6 +20,7 @@ def get_bootstrap_outcomes( data (pandas.DataFrame): original dataset. outcome (callable): function of the dataset calculating statistic of interest. Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.). + weight_by (str): column name of the variable with weights. cluster_by (str): column name of the variable to cluster by. rng (numpy.random.Generator): A random number generator. n_draws (int): number of bootstrap draws. @@ -34,12 +36,13 @@ def get_bootstrap_outcomes( estimates (list): List of pytrees of estimated bootstrap outcomes. """ - check_inputs(data=data, cluster_by=cluster_by) + check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by) batch_evaluator = process_batch_evaluator(batch_evaluator) indices = get_bootstrap_indices( data=data, rng=rng, + weight_by=weight_by, cluster_by=cluster_by, n_draws=n_draws, ) diff --git a/src/estimagic/bootstrap_samples.py b/src/estimagic/bootstrap_samples.py index 4163eef56..7b5d66e29 100644 --- a/src/estimagic/bootstrap_samples.py +++ b/src/estimagic/bootstrap_samples.py @@ -2,7 +2,13 @@ import pandas as pd -def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000): +def get_bootstrap_indices( + data, + rng, + weight_by=None, + cluster_by=None, + n_draws=1000, +): """Draw positional indices for the construction of bootstrap samples. Storing the positional indices instead of the full bootstrap samples saves a lot @@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000): Args: data (pandas.DataFrame): original dataset. rng (numpy.random.Generator): A random number generator. + weight_by (str): column name of the variable with weights. cluster_by (str): column name of the variable to cluster by. n_draws (int): number of draws, only relevant if seeds is None. @@ -19,12 +26,16 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000): """ n_obs = len(data) + probs = _calculate_bootstrap_indices_weights(data, weight_by, cluster_by) + if cluster_by is None: - bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs))) + bootstrap_indices = list( + rng.choice(n_obs, size=(n_draws, n_obs), replace=True, p=probs) + ) else: clusters = data[cluster_by].unique() drawn_clusters = rng.choice( - clusters, size=(n_draws, len(clusters)), replace=True + clusters, size=(n_draws, len(clusters)), replace=True, p=probs ) bootstrap_indices = _convert_cluster_ids_to_indices( @@ -34,6 +45,33 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000): return bootstrap_indices +def _calculate_bootstrap_indices_weights(data, weight_by, cluster_by): + """Calculate weights for drawing bootstrap indices. + + If weights_by is not None and cluster_by is None, the weights are normalized to sum + to one. If weights_by and cluster_by are both not None, the weights are normalized + to sum to one within each cluster. + + Args: + data (pandas.DataFrame): original dataset. + weight_by (str): column name of the variable with weights. + cluster_by (str): column name of the variable to cluster by. + + Returns: + list: None or pd.Series of weights. + + """ + if weight_by is None: + probs = None + else: + if cluster_by is None: + probs = data[weight_by] / data[weight_by].sum() + else: + cluster_weights = data.groupby(cluster_by, sort=False)[weight_by].sum() + probs = cluster_weights / cluster_weights.sum() + return probs + + def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters): """Convert the drawn clusters to positional indices of individual observations. @@ -48,7 +86,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters): return bootstrap_indices -def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000): +def get_bootstrap_samples( + data, + rng, + weight_by=None, + cluster_by=None, + n_draws=1000, +): """Draw bootstrap samples. If you have memory issues you should use get_bootstrap_indices instead and construct @@ -57,6 +101,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000): Args: data (pandas.DataFrame): original dataset. rng (numpy.random.Generator): A random number generator. + weight_by (str): weights for the observations. cluster_by (str): column name of the variable to cluster by. n_draws (int): number of draws, only relevant if seeds is None. @@ -67,6 +112,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000): indices = get_bootstrap_indices( data=data, rng=rng, + weight_by=weight_by, cluster_by=cluster_by, n_draws=n_draws, ) diff --git a/src/estimagic/msm_weighting.py b/src/estimagic/msm_weighting.py index 4b26a7e42..991e9b54f 100644 --- a/src/estimagic/msm_weighting.py +++ b/src/estimagic/msm_weighting.py @@ -24,8 +24,8 @@ def get_moments_cov( moment_kwargs (dict): Additional keyword arguments for calculate_moments. bootstrap_kwargs (dict): Additional keyword arguments that govern the bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores", - "batch_evaluator", "cluster_by" and "error_handling". For details see the - bootstrap function. + "batch_evaluator", "weight_by", "cluster_by" and "error_handling". + For details see the bootstrap function. Returns: pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment @@ -39,6 +39,7 @@ def get_moments_cov( "n_draws", "seed", "batch_evaluator", + "weight_by", "cluster_by", "error_handling", "existing_result", diff --git a/tests/estimagic/test_bootstrap_ci.py b/tests/estimagic/test_bootstrap_ci.py index a6684b3a2..64562438d 100644 --- a/tests/estimagic/test_bootstrap_ci.py +++ b/tests/estimagic/test_bootstrap_ci.py @@ -6,7 +6,9 @@ from pybaum import tree_just_flatten from estimagic.bootstrap_ci import calculate_ci, check_inputs +from estimagic.bootstrap_samples import get_bootstrap_indices from optimagic.parameters.tree_registry import get_registry +from optimagic.utilities import get_rng def aaae(obj1, obj2, decimal=6): @@ -88,6 +90,29 @@ def test_check_inputs_data(): assert str(error.value) == expected_msg +def test_check_inputs_weight_by(setup): + expected_error_msg = "Input 'weight_by' must be None or a column name of 'data'." + with pytest.raises(ValueError, match=expected_error_msg): + check_inputs(data=setup["df"], weight_by="this is not a column name of df") + + +def test_get_bootstrap_indices_heterogeneous_weights(): + data = pd.DataFrame( + {"id": [0, 1], "w_homogenous": [0.5, 0.5], "w_heterogenous": [0.1, 0.9]} + ) + + res_homogenous = get_bootstrap_indices( + data, weight_by="w_homogenous", n_draws=1_000, rng=get_rng(seed=0) + ) + res_heterogenous = get_bootstrap_indices( + data, weight_by="w_heterogenous", n_draws=1_000, rng=get_rng(seed=0) + ) + + # Given the weights, the first sample mean should be close to 0.5, + # while the second one should be close to 0.9 + assert np.mean(res_homogenous) < 0.75 < np.mean(res_heterogenous) + + def test_check_inputs_cluster_by(setup): cluster_by = "this is not a column name of df" expected_msg = "Input 'cluster_by' must be None or a column name of 'data'." diff --git a/tests/estimagic/test_bootstrap_samples.py b/tests/estimagic/test_bootstrap_samples.py index 3abf5920e..4af80d502 100644 --- a/tests/estimagic/test_bootstrap_samples.py +++ b/tests/estimagic/test_bootstrap_samples.py @@ -3,8 +3,10 @@ import pytest from numpy.testing import assert_array_equal as aae from pandas.testing import assert_frame_equal as afe +from pandas.testing import assert_series_equal as ase from estimagic.bootstrap_samples import ( + _calculate_bootstrap_indices_weights, _convert_cluster_ids_to_indices, _get_bootstrap_samples_from_indices, get_bootstrap_indices, @@ -18,6 +20,7 @@ def data(): df = pd.DataFrame() df["id"] = np.arange(900) df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100 + df["weights"] = np.ones(900) return df @@ -33,6 +36,37 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data): assert set(res[0]) != set(res[1]) +def test_get_bootstrap_indices_randomization_works_with_weights(data): + rng = get_rng(seed=12345) + res = get_bootstrap_indices(data, weight_by="weights", n_draws=2, rng=rng) + assert set(res[0]) != set(res[1]) + + +def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data): + rng = get_rng(seed=12345) + res = get_bootstrap_indices( + data, weight_by="weights", cluster_by="hh", n_draws=2, rng=rng + ) + assert set(res[0]) != set(res[1]) + + +def test_get_bootstrap_indices_randomization_works_with_and_without_weights(data): + rng1 = get_rng(seed=12345) + rng2 = get_rng(seed=12345) + res1 = get_bootstrap_indices(data, n_draws=1, rng=rng1) + res2 = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng2) + assert not np.array_equal(res1, res2) + + +def test_get_boostrap_indices_randomization_works_with_extreme_case(data): + rng = get_rng(seed=12345) + weights = np.zeros(900) + weights[0] = 1.0 + data["weights"] = weights + res = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng) + assert len(np.unique(res)) == 1 + + def test_clustering_leaves_households_intact(data): rng = get_rng(seed=12345) indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0] @@ -63,3 +97,53 @@ def test_get_bootstrap_samples_from_indices(): def test_get_bootstrap_samples_runs(data): rng = get_rng(seed=12345) get_bootstrap_samples(data, n_draws=2, rng=rng) + + +@pytest.fixture +def sample_data(): + return pd.DataFrame({"weight": [1, 2, 3, 4], "cluster": ["A", "A", "B", "B"]}) + + +def test_no_weights_no_clusters(sample_data): + result = _calculate_bootstrap_indices_weights(sample_data, None, None) + assert result is None + + +def test_weights_no_clusters(sample_data): + result = _calculate_bootstrap_indices_weights(sample_data, "weight", None) + expected = pd.Series([0.1, 0.2, 0.3, 0.4], index=sample_data.index, name="weight") + pd.testing.assert_series_equal(result, expected) + + +def test_weights_and_clusters(sample_data): + result = _calculate_bootstrap_indices_weights(sample_data, "weight", "cluster") + expected = pd.Series( + [0.3, 0.7], index=pd.Index(["A", "B"], name="cluster"), name="weight" + ) + ase(result, expected) + + +def test_invalid_weight_column(): + data = pd.DataFrame({"x": [1, 2, 3]}) + with pytest.raises(KeyError): + _calculate_bootstrap_indices_weights(data, "weight", None) + + +def test_invalid_cluster_column(sample_data): + with pytest.raises(KeyError): + _calculate_bootstrap_indices_weights(sample_data, "weight", "invalid_cluster") + + +def test_empty_dataframe(): + empty_df = pd.DataFrame() + result = _calculate_bootstrap_indices_weights(empty_df, None, None) + assert result is None + + +def test_some_zero_weights_with_clusters(): + data = pd.DataFrame({"weight": [0, 1, 0, 2], "cluster": ["A", "A", "B", "B"]}) + result = _calculate_bootstrap_indices_weights(data, "weight", "cluster") + expected = pd.Series( + [1 / 3, 2 / 3], index=pd.Index(["A", "B"], name="cluster"), name="weight" + ) + ase(result, expected)