change reference and analysis to reference_df and analysis_df
santiviquez committed Nov 13, 2023
1 parent 193e242 commit b0e506a
Showing 39 changed files with 280 additions and 301 deletions.
24 changes: 12 additions & 12 deletions README.md
@@ -159,9 +159,9 @@ import pandas as pd
from IPython.display import display

# Load real-world data:
-df_reference, df_analysis, _ = nml.load_us_census_ma_employment_data()
-display(df_reference.head())
-display(df_analysis.head())
+reference_df, analysis_df, _ = nml.load_us_census_ma_employment_data()
+display(reference_df.head())
+display(analysis_df.head())

# Choose a chunker or set a chunk size:
chunk_size = 5000
@@ -175,8 +175,8 @@ estimator = nml.CBPE(
metrics=['roc_auc'],
chunk_size=chunk_size,
)
-estimator = estimator.fit(df_reference)
-estimated_performance = estimator.estimate(df_analysis)
+estimator = estimator.fit(reference_df)
+estimated_performance = estimator.estimate(analysis_df)

# Show results:
figure = estimated_performance.plot()
@@ -192,8 +192,8 @@ univariate_calculator = nml.UnivariateDriftCalculator(
chunk_size=chunk_size
)

-univariate_calculator.fit(df_reference)
-univariate_drift = univariate_calculator.calculate(df_analysis)
+univariate_calculator.fit(reference_df)
+univariate_drift = univariate_calculator.calculate(analysis_df)

# Get features that drift the most with count-based ranker:
alert_count_ranker = nml.AlertCountRanker()
@@ -214,10 +214,10 @@ figure = univariate_drift.filter(period='analysis', column_names=['RELP','AGEP',
figure.show()

# Get target data, calculate, plot and compare realized performance with estimated performance:
-_, _, analysis_targets = nml.load_us_census_ma_employment_data()
+_, _, analysis_targets_df = nml.load_us_census_ma_employment_data()

-df_analysis_with_targets = pd.concat([df_analysis, analysis_targets], axis=1)
-display(df_analysis_with_targets.head())
+analysis_with_targets_df = pd.concat([analysis_df, analysis_targets_df], axis=1)
+display(analysis_with_targets_df.head())

performance_calculator = nml.PerformanceCalculator(
problem_type='classification_binary',
@@ -227,8 +227,8 @@ performance_calculator = nml.PerformanceCalculator(
metrics=['roc_auc'],
chunk_size=chunk_size)

-performance_calculator.fit(df_reference)
-calculated_performance = performance_calculator.calculate(df_analysis_with_targets)
+performance_calculator.fit(reference_df)
+calculated_performance = performance_calculator.calculate(analysis_with_targets_df)

figure = estimated_performance.filter(period='analysis').compare(calculated_performance).plot()
figure.show()
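
For reference, the README hunks above assemble into one end-to-end quickstart with the new names. This is an illustrative sketch, not the README verbatim: the constructor arguments hidden inside the collapsed hunks (the `y_pred`, `y_pred_proba`, and `y_true` column mappings) are assumptions marked as such below, and `AlertCountRanker.rank()` follows NannyML's documented ranking API.

```python
import nannyml as nml
import pandas as pd

# Load real-world data:
reference_df, analysis_df, _ = nml.load_us_census_ma_employment_data()

# Choose a chunker or set a chunk size:
chunk_size = 5000

# Estimate performance without targets (CBPE):
estimator = nml.CBPE(
    problem_type='classification_binary',
    y_pred_proba='predicted_probability',  # ASSUMED column names; the real
    y_pred='prediction',                   # mappings sit in a collapsed hunk
    y_true='employed',                     # ASSUMED target column name
    metrics=['roc_auc'],
    chunk_size=chunk_size,
)
estimator = estimator.fit(reference_df)
estimated_performance = estimator.estimate(analysis_df)

# Detect univariate drift on the census features (list taken from the
# Quickstart notebook hunk further down this commit):
feature_column_names = ['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG',
                        'MIL', 'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P']
univariate_calculator = nml.UnivariateDriftCalculator(
    column_names=feature_column_names,
    chunk_size=chunk_size,
)
univariate_calculator.fit(reference_df)
univariate_drift = univariate_calculator.calculate(analysis_df)

# Get features that drift the most with the count-based ranker:
alert_count_ranker = nml.AlertCountRanker()
alert_count_ranked_features = alert_count_ranker.rank(univariate_drift)

# Once targets arrive, compare realized performance with the estimate:
_, _, analysis_targets_df = nml.load_us_census_ma_employment_data()
analysis_with_targets_df = pd.concat([analysis_df, analysis_targets_df], axis=1)

performance_calculator = nml.PerformanceCalculator(
    problem_type='classification_binary',
    y_pred_proba='predicted_probability',  # ASSUMED, as above
    y_pred='prediction',
    y_true='employed',
    metrics=['roc_auc'],
    chunk_size=chunk_size)
performance_calculator.fit(reference_df)
calculated_performance = performance_calculator.calculate(analysis_with_targets_df)

figure = estimated_performance.filter(period='analysis').compare(calculated_performance).plot()
figure.show()
```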
8 changes: 4 additions & 4 deletions docs/datasets/binary_car_loan.rst
@@ -23,8 +23,8 @@ A sample of the dataset can be seen below.
.. code-block:: python
>>> import nannyml as nml
->>> reference, analysis, analysis_targets = nml.load_synthetic_car_loan_dataset()
->>> display(reference.head(3))
+>>> reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_car_loan_dataset()
+>>> display(reference_df.head(3))
+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------------+----------+----------+-------------------------+
@@ -79,9 +79,9 @@ same. You can access this dataset with:
.. code-block:: python
>>> import nannyml as nml
->>> reference, analysis, analysis_targets = nml.load_synthetic_car_loan_data_quality_dataset()
+>>> reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_car_loan_data_quality_dataset()
>>> # let's show an instance where new and missing values are present.
->>> display(analysis.iloc[41515:41520])
+>>> display(analysis_df.iloc[41515:41520])
+-------+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+-------------------------+----------------+----------+----------+
| | car_value | salary_range | debt_to_income_ratio | loan_length | repaid_loan_on_prev_car | size_of_downpayment | driver_tenure | timestamp | y_pred_proba | period | y_pred |
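
The data-quality variant shown above pairs naturally with NannyML's data-quality calculators, which flag rows like these automatically. A minimal sketch, assuming the `MissingValuesCalculator` API from NannyML's data-quality docs; the column selection is illustrative (the names appear in the table above):

```python
import nannyml as nml

reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_car_loan_data_quality_dataset()

# ASSUMED API from NannyML's data-quality docs; any columns of interest work.
calc = nml.MissingValuesCalculator(
    column_names=['car_value', 'salary_range', 'driver_tenure'],
)
calc.fit(reference_df)
results = calc.calculate(analysis_df)
results.plot().show()
```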
16 changes: 8 additions & 8 deletions docs/datasets/california.rst
@@ -108,17 +108,17 @@ The data are now being split to satisfy NannyML format requirements.
>>> df_for_nanny = df[df['partition']!='train'].reset_index(drop=True)
>>> df_for_nanny['partition'] = df_for_nanny['partition'].map({'test':'reference', 'production':'analysis'})
->>> reference = df_for_nanny[df_for_nanny['partition']=='reference'].copy()
->>> analysis = df_for_nanny[df_for_nanny['partition']=='analysis'].copy()
->>> analysis_target = analysis[['clf_target']].copy()
->>> analysis = analysis.drop('clf_target', axis=1)
+>>> reference_df = df_for_nanny[df_for_nanny['partition']=='reference'].copy()
+>>> analysis_df = df_for_nanny[df_for_nanny['partition']=='analysis'].copy()
+>>> analysis_targets_df = analysis_df[['clf_target']].copy()
+>>> analysis_df = analysis_df.drop('clf_target', axis=1)
>>> # dropping partition column that is now removed from requirements.
->>> reference.drop('partition', axis=1, inplace=True)
->>> analysis.drop('partition', axis=1, inplace=True)
+>>> reference_df.drop('partition', axis=1, inplace=True)
+>>> analysis_df.drop('partition', axis=1, inplace=True)
-The ``reference`` dataframe represents the reference :term:`Data Period` and the ``analysis``
-dataframe represents the analysis period. The ``analysis_target`` dataframe contains the targets
+The ``reference_df`` dataframe represents the reference :term:`Data Period` and the ``analysis_df``
+dataframe represents the analysis period. The ``analysis_targets_df`` dataframe contains the targets
for the analysis period, which is provided separately.
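
The same split-and-drop pattern applies to any scored dataset. Below is a self-contained toy version (synthetic values, hypothetical column names) for readers who want to reproduce the layout outside this example:

```python
import pandas as pd

# Toy stand-in for a scored dataset; column names are hypothetical.
df_for_nanny = pd.DataFrame({
    'feature_1':  [0.2, 0.5, 0.7, 0.9],
    'clf_target': [0, 1, 0, 1],
    'partition':  ['reference', 'reference', 'analysis', 'analysis'],
})

reference_df = df_for_nanny[df_for_nanny['partition'] == 'reference'].copy()
analysis_df = df_for_nanny[df_for_nanny['partition'] == 'analysis'].copy()

# Targets for the analysis period are kept aside, mirroring how NannyML
# expects them to be provided separately.
analysis_targets_df = analysis_df[['clf_target']].copy()
analysis_df = analysis_df.drop('clf_target', axis=1)

# The partition column is bookkeeping only and is dropped from both frames.
reference_df.drop('partition', axis=1, inplace=True)
analysis_df.drop('partition', axis=1, inplace=True)
```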


4 changes: 2 additions & 2 deletions docs/datasets/regression.rst
@@ -23,8 +23,8 @@ A sample of the dataset can be seen below.
.. code-block:: python
>>> import nannyml as nml
->>> reference, analysis, analysis_targets = nml.datasets.load_synthetic_car_price_dataset()
->>> display(reference.head())
+>>> reference_df, analysis_df, analysis_targets_df = nml.datasets.load_synthetic_car_price_dataset()
+>>> display(reference_df.head())
+----+-----------+-------------+-------------+------------------+--------------+----------+----------------+----------+----------+-------------------------+
| | car_age | km_driven | price_new | accident_count | door_count | fuel | transmission | y_true | y_pred | timestamp |
4 changes: 2 additions & 2 deletions docs/datasets/titanic.rst
@@ -27,8 +27,8 @@ A sample of the dataset can be seen below.
.. code-block:: python
>>> import nannyml as nml
->>> reference, analysis, analysis_targets = nml.load_titanic_dataset()
->>> reference.head()
+>>> reference_df, analysis_df, analysis_targets_df = nml.load_titanic_dataset()
+>>> reference_df.head()
+----+---------------+----------+-----------------------------------------------------+--------+-------+---------+---------+------------------+---------+---------+------------+--------+--------+---------------------------------------------------+------------+
| | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | boat | body | home.dest | Survived |
14 changes: 3 additions & 11 deletions docs/example_notebooks/Datasets - Multiclass.ipynb
@@ -165,8 +165,8 @@
],
"source": [
"import nannyml as nml\n",
"reference, analysis, analysis_targets = nml.load_synthetic_multiclass_classification_dataset()\n",
"display(reference.head())"
"reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_multiclass_classification_dataset()\n",
"display(reference_df.head())"
]
},
{
@@ -199,16 +199,8 @@
],
"source": [
"from docs.utils import print_multi_index_markdown\n",
"print_multi_index_markdown(reference.head())"
"print_multi_index_markdown(reference_df.head())"
]
},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "375d3449-b097-4163-9aed-2cb40870e759",
-"metadata": {},
-"outputs": [],
-"source": []
-}
],
"metadata": {
32 changes: 16 additions & 16 deletions docs/example_notebooks/Examples California Housing.ipynb

Large diffs are not rendered by default.

30 changes: 15 additions & 15 deletions docs/example_notebooks/Examples Green Taxi.ipynb
@@ -430,14 +430,14 @@
},
"outputs": [],
"source": [
"reference = X_test.copy() # using the test set as a reference\n",
"reference['y_pred'] = y_pred_test # reference predictions\n",
"reference['tip_amount'] = y_test # ground truth (currect targets)\n",
"reference = reference.join(data['lpep_pickup_datetime']) # date\n",
"reference_df = X_test.copy() # using the test set as a reference\n",
"reference_df['y_pred'] = y_pred_test # reference predictions\n",
"reference_df['tip_amount'] = y_test # ground truth (currect targets)\n",
"reference_df = reference_df.join(data['lpep_pickup_datetime']) # date\n",
"\n",
"analysis = X_prod.copy() # features\n",
"analysis['y_pred'] = y_pred_prod # prod predictions\n",
"analysis = analysis.join(data['lpep_pickup_datetime']) # date"
"analysis_df = X_prod.copy() # features\n",
"analysis_df['y_pred'] = y_pred_prod # prod predictions\n",
"analysis_df = analysis_df.join(data['lpep_pickup_datetime']) # date"
]
},
{
@@ -472,8 +472,8 @@
" chunk_period='d' # perform an estimation daily\n",
")\n",
"\n",
"dle.fit(reference) # fit on the reference (test) data\n",
"estimated_performance = dle.estimate(analysis) # estimate on the prod data"
"dle.fit(reference_df) # fit on the reference (test) data\n",
"estimated_performance = dle.estimate(analysis_df) # estimate on the prod data"
]
},
{
@@ -510,8 +510,8 @@
" chunk_period='d',\n",
")\n",
"\n",
"drdc.fit(reference)\n",
"multivariate_data_drift = drdc.calculate(analysis)"
"drdc.fit(reference_df)\n",
"multivariate_data_drift = drdc.calculate(analysis_df)"
]
},
{
@@ -547,8 +547,8 @@
" chunk_period='d',\n",
")\n",
"\n",
"udc.fit(reference)\n",
"univariate_data_drift = udc.calculate(analysis)"
"udc.fit(reference_df)\n",
"univariate_data_drift = udc.calculate(analysis_df)"
]
},
{
@@ -630,8 +630,8 @@
" chunk_period='d'\n",
")\n",
"\n",
"perfc.fit(reference)\n",
"realized_performance = perfc.calculate(analysis.assign(tip_amount = y_prod))\n",
"perfc.fit(reference_df)\n",
"realized_performance = perfc.calculate(analysis_df.assign(tip_amount = y_prod))\n",
"\n",
"figure = estimated_performance.filter(period='analysis').compare(realized_performance).plot()\n",
"figure.write_image(f'../_static/example_green_taxi_dle_vs_realized.svg')"
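
Continuing from the `reference_df`/`analysis_df` frames built in the Green Taxi cells above, the DLE workflow condenses into the sketch below. Only `chunk_period='d'` is visible in the hunks; the remaining constructor arguments are assumptions based on NannyML's documented DLE API, with the feature list derived from the frame itself.

```python
import nannyml as nml

# Columns created in the cells above: 'y_pred', 'tip_amount', 'lpep_pickup_datetime'.
feature_column_names = [
    col for col in reference_df.columns
    if col not in ('y_pred', 'tip_amount', 'lpep_pickup_datetime')
]

# ASSUMED arguments, except chunk_period, which the diff shows:
dle = nml.DLE(
    feature_column_names=feature_column_names,
    y_pred='y_pred',
    y_true='tip_amount',
    timestamp_column_name='lpep_pickup_datetime',
    metrics=['mae'],        # assumed metric choice for the tip-amount regression
    chunk_period='d',       # perform an estimation daily (shown in the diff)
)

dle.fit(reference_df)                              # fit on the reference (test) data
estimated_performance = dle.estimate(analysis_df)  # estimate on the prod data
estimated_performance.plot().show()
```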

Large diffs are not rendered by default.

16 changes: 4 additions & 12 deletions docs/example_notebooks/How it Works - Chunking Data.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions docs/example_notebooks/How it Works - Ranking.ipynb

Large diffs are not rendered by default.

44 changes: 18 additions & 26 deletions docs/example_notebooks/Quickstart.ipynb
@@ -388,9 +388,9 @@
}
],
"source": [
"df_reference, df_analysis, _ = nml.load_us_census_ma_employment_data()\n",
"display(df_reference.head())\n",
"display(df_analysis.head())"
"reference_df, analysis_df, _ = nml.load_us_census_ma_employment_data()\n",
"display(reference_df.head())\n",
"display(analysis_df.head())"
]
},
{
@@ -420,7 +420,7 @@
}
],
"source": [
"print_some_of_the_columns_only_markdown(df_reference, 2, 5)"
"print_some_of_the_columns_only_markdown(reference_df, 2, 5)"
]
},
{
@@ -450,7 +450,7 @@
}
],
"source": [
"print_some_of_the_columns_only_markdown(df_analysis, 2, 5)"
"print_some_of_the_columns_only_markdown(analysis_df, 2, 5)"
]
},
{
@@ -487,8 +487,8 @@
"metadata": {},
"outputs": [],
"source": [
"estimator = estimator.fit(df_reference)\n",
"estimated_performance = estimator.estimate(df_analysis)"
"estimator = estimator.fit(reference_df)\n",
"estimated_performance = estimator.estimate(analysis_df)"
]
},
{
@@ -2917,16 +2917,16 @@
"metadata": {},
"outputs": [],
"source": [
"features = ['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC',\n",
" 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P']\n",
"feature_column_names = ['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG',\n",
" 'MIL', 'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P']\n",
"\n",
"univariate_calculator = nml.UnivariateDriftCalculator(\n",
" column_names=features,\n",
" column_names=feature_column_names,\n",
" chunk_size=chunk_size\n",
")\n",
"\n",
"univariate_calculator.fit(df_reference)\n",
"univariate_drift = univariate_calculator.calculate(df_analysis)"
"univariate_calculator.fit(reference_df)\n",
"univariate_drift = univariate_calculator.calculate(analysis_df)"
]
},
{
@@ -397854,7 +397854,7 @@
"metadata": {},
"outputs": [],
"source": [
"_, _, analysis_targets = nml.load_us_census_ma_employment_data()"
"_, _, analysis_targets_df = nml.load_us_census_ma_employment_data()"
]
},
{
@@ -398047,8 +398047,8 @@
}
],
"source": [
"df_analysis_with_targets = pd.concat([df_analysis, analysis_targets], axis=1)\n",
"display(df_analysis_with_targets.head())"
"analysis_with_targets_df = pd.concat([analysis_df, analysis_targets_df], axis=1)\n",
"display(analysis_with_targets_df.head())"
]
},
{
@@ -398078,7 +398078,7 @@
}
],
"source": [
"print_some_of_the_columns_only_markdown(df_analysis_with_targets.head(), 2, 5)"
"print_some_of_the_columns_only_markdown(analysis_with_targets_df.head(), 2, 5)"
]
},
{
@@ -400350,8 +400350,8 @@
" metrics=['roc_auc'],\n",
" chunk_size=chunk_size)\n",
"\n",
"performance_calculator.fit(df_reference)\n",
"calculated_performance = performance_calculator.calculate(df_analysis_with_targets)\n",
"performance_calculator.fit(reference_df)\n",
"calculated_performance = performance_calculator.calculate(analysis_with_targets_df)\n",
"\n",
"figure = estimated_performance.filter(period='analysis').compare(calculated_performance).plot()\n",
"figure.show()"
@@ -400366,14 +400366,6 @@
"source": [
"figure.write_image(f'../_static/quickstart/quick-start-estimated-and-realized.svg', width=1000)"
]
},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "220b48c7",
-"metadata": {},
-"outputs": [],
-"source": []
-}
],
"metadata": {