change reference and analysis to reference_df and analysis_df
santiviquez committed Nov 13, 2023
1 parent 193e242 commit b0e506a
Showing 39 changed files with 280 additions and 301 deletions.
24 changes: 12 additions & 12 deletions README.md
@@ -159,9 +159,9 @@ import pandas as pd
from IPython.display import display

# Load real-world data:
-df_reference, df_analysis, _ = nml.load_us_census_ma_employment_data()
-display(df_reference.head())
-display(df_analysis.head())
+reference_df, analysis_df, _ = nml.load_us_census_ma_employment_data()
+display(reference_df.head())
+display(analysis_df.head())

# Choose a chunker or set a chunk size:
chunk_size = 5000
@@ -175,8 +175,8 @@ estimator = nml.CBPE(
metrics=['roc_auc'],
chunk_size=chunk_size,
)
-estimator = estimator.fit(df_reference)
-estimated_performance = estimator.estimate(df_analysis)
+estimator = estimator.fit(reference_df)
+estimated_performance = estimator.estimate(analysis_df)

# Show results:
figure = estimated_performance.plot()
@@ -192,8 +192,8 @@ univariate_calculator = nml.UnivariateDriftCalculator(
chunk_size=chunk_size
)

-univariate_calculator.fit(df_reference)
-univariate_drift = univariate_calculator.calculate(df_analysis)
+univariate_calculator.fit(reference_df)
+univariate_drift = univariate_calculator.calculate(analysis_df)

# Get features that drift the most with count-based ranker:
alert_count_ranker = nml.AlertCountRanker()
@@ -214,10 +214,10 @@ figure = univariate_drift.filter(period='analysis', column_names=['RELP','AGEP',
figure.show()

# Get target data, calculate, plot and compare realized performance with estimated performance:
-_, _, analysis_targets = nml.load_us_census_ma_employment_data()
+_, _, analysis_targets_df = nml.load_us_census_ma_employment_data()

-df_analysis_with_targets = pd.concat([df_analysis, analysis_targets], axis=1)
-display(df_analysis_with_targets.head())
+analysis_with_targets_df = pd.concat([analysis_df, analysis_targets_df], axis=1)
+display(analysis_with_targets_df.head())

performance_calculator = nml.PerformanceCalculator(
problem_type='classification_binary',
@@ -227,8 +227,8 @@ performance_calculator = nml.PerformanceCalculator(
metrics=['roc_auc'],
chunk_size=chunk_size)

-performance_calculator.fit(df_reference)
-calculated_performance = performance_calculator.calculate(df_analysis_with_targets)
+performance_calculator.fit(reference_df)
+calculated_performance = performance_calculator.calculate(analysis_with_targets_df)

figure = estimated_performance.filter(period='analysis').compare(calculated_performance).plot()
figure.show()
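
For reference, the README hunks above assemble into one end-to-end quickstart with the new names. This is an illustrative sketch, not the README verbatim: the constructor arguments hidden inside the collapsed hunks (the `y_pred`, `y_pred_proba`, and `y_true` column mappings) are assumptions marked as such below, and `AlertCountRanker.rank()` follows NannyML's documented ranking API.

```python
import nannyml as nml
import pandas as pd

# Load real-world data:
reference_df, analysis_df, _ = nml.load_us_census_ma_employment_data()

# Choose a chunker or set a chunk size:
chunk_size = 5000

# Estimate performance without targets (CBPE):
estimator = nml.CBPE(
    problem_type='classification_binary',
    y_pred_proba='predicted_probability',  # ASSUMED column names; the real
    y_pred='prediction',                   # mappings sit in a collapsed hunk
    y_true='employed',                     # ASSUMED target column name
    metrics=['roc_auc'],
    chunk_size=chunk_size,
)
estimator = estimator.fit(reference_df)
estimated_performance = estimator.estimate(analysis_df)

# Detect univariate drift on the census features (list taken from the
# Quickstart notebook hunk further down this commit):
feature_column_names = ['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG',
                        'MIL', 'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P']
univariate_calculator = nml.UnivariateDriftCalculator(
    column_names=feature_column_names,
    chunk_size=chunk_size,
)
univariate_calculator.fit(reference_df)
univariate_drift = univariate_calculator.calculate(analysis_df)

# Get features that drift the most with the count-based ranker:
alert_count_ranker = nml.AlertCountRanker()
alert_count_ranked_features = alert_count_ranker.rank(univariate_drift)

# Once targets arrive, compare realized performance with the estimate:
_, _, analysis_targets_df = nml.load_us_census_ma_employment_data()
analysis_with_targets_df = pd.concat([analysis_df, analysis_targets_df], axis=1)

performance_calculator = nml.PerformanceCalculator(
    problem_type='classification_binary',
    y_pred_proba='predicted_probability',  # ASSUMED, as above
    y_pred='prediction',
    y_true='employed',
    metrics=['roc_auc'],
    chunk_size=chunk_size)
performance_calculator.fit(reference_df)
calculated_performance = performance_calculator.calculate(analysis_with_targets_df)

figure = estimated_performance.filter(period='analysis').compare(calculated_performance).plot()
figure.show()
```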
8 changes: 4 additions & 4 deletions docs/datasets/binary_car_loan.rst
@@ -23,8 +23,8 @@ A sample of the dataset can be seen below.
.. code-block:: python
>>> import nannyml as nml
->>> reference, analysis, analysis_targets = nml.load_synthetic_car_loan_dataset()
->>> display(reference.head(3))
+>>> reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_car_loan_dataset()
+>>> display(reference_df.head(3))
+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------------+----------+----------+-------------------------+
@@ -79,9 +79,9 @@ same. You can access this dataset with:
.. code-block:: python
>>> import nannyml as nml
->>> reference, analysis, analysis_targets = nml.load_synthetic_car_loan_data_quality_dataset()
+>>> reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_car_loan_data_quality_dataset()
>>> # let's show an instance where new and missing values are present.
->>> display(analysis.iloc[41515:41520])
+>>> display(analysis_df.iloc[41515:41520])
+-------+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+-------------------------+----------------+----------+----------+
| | car_value | salary_range | debt_to_income_ratio | loan_length | repaid_loan_on_prev_car | size_of_downpayment | driver_tenure | timestamp | y_pred_proba | period | y_pred |
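
The data-quality variant shown above pairs naturally with NannyML's data-quality calculators, which flag rows like these automatically. A minimal sketch, assuming the `MissingValuesCalculator` API from NannyML's data-quality docs; the column selection is illustrative (the names appear in the table above):

```python
import nannyml as nml

reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_car_loan_data_quality_dataset()

# ASSUMED API from NannyML's data-quality docs; any columns of interest work.
calc = nml.MissingValuesCalculator(
    column_names=['car_value', 'salary_range', 'driver_tenure'],
)
calc.fit(reference_df)
results = calc.calculate(analysis_df)
results.plot().show()
```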
16 changes: 8 additions & 8 deletions docs/datasets/california.rst
@@ -108,17 +108,17 @@ The data are now being split to satisfy NannyML format requirements.
>>> df_for_nanny = df[df['partition']!='train'].reset_index(drop=True)
>>> df_for_nanny['partition'] = df_for_nanny['partition'].map({'test':'reference', 'production':'analysis'})
->>> reference = df_for_nanny[df_for_nanny['partition']=='reference'].copy()
->>> analysis = df_for_nanny[df_for_nanny['partition']=='analysis'].copy()
->>> analysis_target = analysis[['clf_target']].copy()
->>> analysis = analysis.drop('clf_target', axis=1)
+>>> reference_df = df_for_nanny[df_for_nanny['partition']=='reference'].copy()
+>>> analysis_df = df_for_nanny[df_for_nanny['partition']=='analysis'].copy()
+>>> analysis_targets_df = analysis_df[['clf_target']].copy()
+>>> analysis_df = analysis_df.drop('clf_target', axis=1)
>>> # dropping partition column that is now removed from requirements.
->>> reference.drop('partition', axis=1, inplace=True)
->>> analysis.drop('partition', axis=1, inplace=True)
+>>> reference_df.drop('partition', axis=1, inplace=True)
+>>> analysis_df.drop('partition', axis=1, inplace=True)
-The ``reference`` dataframe represents the reference :term:`Data Period` and the ``analysis``
-dataframe represents the analysis period. The ``analysis_target`` dataframe contains the targets
+The ``reference_df`` dataframe represents the reference :term:`Data Period` and the ``analysis_df``
+dataframe represents the analysis period. The ``analysis_targets_df`` dataframe contains the targets
for the analysis period, which is provided separately.
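
The same split-and-drop pattern applies to any scored dataset. Below is a self-contained toy version (synthetic values, hypothetical column names) for readers who want to reproduce the layout outside this example:

```python
import pandas as pd

# Toy stand-in for a scored dataset; column names are hypothetical.
df_for_nanny = pd.DataFrame({
    'feature_1':  [0.2, 0.5, 0.7, 0.9],
    'clf_target': [0, 1, 0, 1],
    'partition':  ['reference', 'reference', 'analysis', 'analysis'],
})

reference_df = df_for_nanny[df_for_nanny['partition'] == 'reference'].copy()
analysis_df = df_for_nanny[df_for_nanny['partition'] == 'analysis'].copy()

# Targets for the analysis period are kept aside, mirroring how NannyML
# expects them to be provided separately.
analysis_targets_df = analysis_df[['clf_target']].copy()
analysis_df = analysis_df.drop('clf_target', axis=1)

# The partition column is bookkeeping only and is dropped from both frames.
reference_df.drop('partition', axis=1, inplace=True)
analysis_df.drop('partition', axis=1, inplace=True)
```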


4 changes: 2 additions & 2 deletions docs/datasets/regression.rst
@@ -23,8 +23,8 @@ A sample of the dataset can be seen below.
.. code-block:: python
>>> import nannyml as nml
->>> reference, analysis, analysis_targets = nml.datasets.load_synthetic_car_price_dataset()
->>> display(reference.head())
+>>> reference_df, analysis_df, analysis_targets_df = nml.datasets.load_synthetic_car_price_dataset()
+>>> display(reference_df.head())
+----+-----------+-------------+-------------+------------------+--------------+----------+----------------+----------+----------+-------------------------+
| | car_age | km_driven | price_new | accident_count | door_count | fuel | transmission | y_true | y_pred | timestamp |
4 changes: 2 additions & 2 deletions docs/datasets/titanic.rst
@@ -27,8 +27,8 @@ A sample of the dataset can be seen below.
.. code-block:: python
>>> import nannyml as nml
->>> reference, analysis, analysis_targets = nml.load_titanic_dataset()
->>> reference.head()
+>>> reference_df, analysis_df, analysis_targets_df = nml.load_titanic_dataset()
+>>> reference_df.head()
+----+---------------+----------+-----------------------------------------------------+--------+-------+---------+---------+------------------+---------+---------+------------+--------+--------+---------------------------------------------------+------------+
| | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | boat | body | home.dest | Survived |
14 changes: 3 additions & 11 deletions docs/example_notebooks/Datasets - Multiclass.ipynb
@@ -165,8 +165,8 @@
],
"source": [
"import nannyml as nml\n",
"reference, analysis, analysis_targets = nml.load_synthetic_multiclass_classification_dataset()\n",
"display(reference.head())"
"reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_multiclass_classification_dataset()\n",
"display(reference_df.head())"
]
},
{
@@ -199,16 +199,8 @@
],
"source": [
"from docs.utils import print_multi_index_markdown\n",
"print_multi_index_markdown(reference.head())"
"print_multi_index_markdown(reference_df.head())"
]
},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "375d3449-b097-4163-9aed-2cb40870e759",
-"metadata": {},
-"outputs": [],
-"source": []
-}
],
"metadata": {
32 changes: 16 additions & 16 deletions docs/example_notebooks/Examples California Housing.ipynb

Large diffs are not rendered by default.

30 changes: 15 additions & 15 deletions docs/example_notebooks/Examples Green Taxi.ipynb
@@ -430,14 +430,14 @@
},
"outputs": [],
"source": [
"reference = X_test.copy() # using the test set as a reference\n",
"reference['y_pred'] = y_pred_test # reference predictions\n",
"reference['tip_amount'] = y_test # ground truth (currect targets)\n",
"reference = reference.join(data['lpep_pickup_datetime']) # date\n",
"reference_df = X_test.copy() # using the test set as a reference\n",
"reference_df['y_pred'] = y_pred_test # reference predictions\n",
"reference_df['tip_amount'] = y_test # ground truth (currect targets)\n",
"reference_df = reference_df.join(data['lpep_pickup_datetime']) # date\n",
"\n",
"analysis = X_prod.copy() # features\n",
"analysis['y_pred'] = y_pred_prod # prod predictions\n",
"analysis = analysis.join(data['lpep_pickup_datetime']) # date"
"analysis_df = X_prod.copy() # features\n",
"analysis_df['y_pred'] = y_pred_prod # prod predictions\n",
"analysis_df = analysis_df.join(data['lpep_pickup_datetime']) # date"
]
},
{
@@ -472,8 +472,8 @@
" chunk_period='d' # perform an estimation daily\n",
")\n",
"\n",
"dle.fit(reference) # fit on the reference (test) data\n",
"estimated_performance = dle.estimate(analysis) # estimate on the prod data"
"dle.fit(reference_df) # fit on the reference (test) data\n",
"estimated_performance = dle.estimate(analysis_df) # estimate on the prod data"
]
},
{
@@ -510,8 +510,8 @@
" chunk_period='d',\n",
")\n",
"\n",
"drdc.fit(reference)\n",
"multivariate_data_drift = drdc.calculate(analysis)"
"drdc.fit(reference_df)\n",
"multivariate_data_drift = drdc.calculate(analysis_df)"
]
},
{
@@ -547,8 +547,8 @@
" chunk_period='d',\n",
")\n",
"\n",
"udc.fit(reference)\n",
"univariate_data_drift = udc.calculate(analysis)"
"udc.fit(reference_df)\n",
"univariate_data_drift = udc.calculate(analysis_df)"
]
},
{
@@ -630,8 +630,8 @@
" chunk_period='d'\n",
")\n",
"\n",
"perfc.fit(reference)\n",
"realized_performance = perfc.calculate(analysis.assign(tip_amount = y_prod))\n",
"perfc.fit(reference_df)\n",
"realized_performance = perfc.calculate(analysis_df.assign(tip_amount = y_prod))\n",
"\n",
"figure = estimated_performance.filter(period='analysis').compare(realized_performance).plot()\n",
"figure.write_image(f'../_static/example_green_taxi_dle_vs_realized.svg')"
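
Continuing from the `reference_df`/`analysis_df` frames built in the Green Taxi cells above, the DLE workflow condenses into the sketch below. Only `chunk_period='d'` is visible in the hunks; the remaining constructor arguments are assumptions based on NannyML's documented DLE API, with the feature list derived from the frame itself.

```python
import nannyml as nml

# Columns created in the cells above: 'y_pred', 'tip_amount', 'lpep_pickup_datetime'.
feature_column_names = [
    col for col in reference_df.columns
    if col not in ('y_pred', 'tip_amount', 'lpep_pickup_datetime')
]

# ASSUMED arguments, except chunk_period, which the diff shows:
dle = nml.DLE(
    feature_column_names=feature_column_names,
    y_pred='y_pred',
    y_true='tip_amount',
    timestamp_column_name='lpep_pickup_datetime',
    metrics=['mae'],        # assumed metric choice for the tip-amount regression
    chunk_period='d',       # perform an estimation daily (shown in the diff)
)

dle.fit(reference_df)                              # fit on the reference (test) data
estimated_performance = dle.estimate(analysis_df)  # estimate on the prod data
estimated_performance.plot().show()
```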

Large diffs are not rendered by default.

16 changes: 4 additions & 12 deletions docs/example_notebooks/How it Works - Chunking Data.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/example_notebooks/How it Works - Ranking.ipynb

Large diffs are not rendered by default.

44 changes: 18 additions & 26 deletions docs/example_notebooks/Quickstart.ipynb
@@ -388,9 +388,9 @@
}
],
"source": [
"df_reference, df_analysis, _ = nml.load_us_census_ma_employment_data()\n",
"display(df_reference.head())\n",
"display(df_analysis.head())"
"reference_df, analysis_df, _ = nml.load_us_census_ma_employment_data()\n",
"display(reference_df.head())\n",
"display(analysis_df.head())"
]
},
{
@@ -420,7 +420,7 @@
}
],
"source": [
"print_some_of_the_columns_only_markdown(df_reference, 2, 5)"
"print_some_of_the_columns_only_markdown(reference_df, 2, 5)"
]
},
{
@@ -450,7 +450,7 @@
}
],
"source": [
"print_some_of_the_columns_only_markdown(df_analysis, 2, 5)"
"print_some_of_the_columns_only_markdown(analysis_df, 2, 5)"
]
},
{
@@ -487,8 +487,8 @@
"metadata": {},
"outputs": [],
"source": [
"estimator = estimator.fit(df_reference)\n",
"estimated_performance = estimator.estimate(df_analysis)"
"estimator = estimator.fit(reference_df)\n",
"estimated_performance = estimator.estimate(analysis_df)"
]
},
{
@@ -2917,16 +2917,16 @@
"metadata": {},
"outputs": [],
"source": [
"features = ['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC',\n",
" 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P']\n",
"feature_column_names = ['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG',\n",
" 'MIL', 'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P']\n",
"\n",
"univariate_calculator = nml.UnivariateDriftCalculator(\n",
" column_names=features,\n",
" column_names=feature_column_names,\n",
" chunk_size=chunk_size\n",
")\n",
"\n",
"univariate_calculator.fit(df_reference)\n",
"univariate_drift = univariate_calculator.calculate(df_analysis)"
"univariate_calculator.fit(reference_df)\n",
"univariate_drift = univariate_calculator.calculate(analysis_df)"
]
},
{
@@ -397854,7 +397854,7 @@
"metadata": {},
"outputs": [],
"source": [
"_, _, analysis_targets = nml.load_us_census_ma_employment_data()"
"_, _, analysis_targets_df = nml.load_us_census_ma_employment_data()"
]
},
{
@@ -398047,8 +398047,8 @@
}
],
"source": [
"df_analysis_with_targets = pd.concat([df_analysis, analysis_targets], axis=1)\n",
"display(df_analysis_with_targets.head())"
"analysis_with_targets_df = pd.concat([analysis_df, analysis_targets_df], axis=1)\n",
"display(analysis_with_targets_df.head())"
]
},
{
@@ -398078,7 +398078,7 @@
}
],
"source": [
"print_some_of_the_columns_only_markdown(df_analysis_with_targets.head(), 2, 5)"
"print_some_of_the_columns_only_markdown(analysis_with_targets_df.head(), 2, 5)"
]
},
{
@@ -400350,8 +400350,8 @@
" metrics=['roc_auc'],\n",
" chunk_size=chunk_size)\n",
"\n",
"performance_calculator.fit(df_reference)\n",
"calculated_performance = performance_calculator.calculate(df_analysis_with_targets)\n",
"performance_calculator.fit(reference_df)\n",
"calculated_performance = performance_calculator.calculate(analysis_with_targets_df)\n",
"\n",
"figure = estimated_performance.filter(period='analysis').compare(calculated_performance).plot()\n",
"figure.show()"
@@ -400366,14 +400366,6 @@
"source": [
"figure.write_image(f'../_static/quickstart/quick-start-estimated-and-realized.svg', width=1000)"
]
},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "220b48c7",
-"metadata": {},
-"outputs": [],
-"source": []
-}
],
"metadata": {