From c98653401ff1c6735aa7e127abedab4489e9e2d9 Mon Sep 17 00:00:00 2001 From: Nikolaos Perrakis <89025229+nikml@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:01:28 +0300 Subject: [PATCH] add NannyML profiling notebooks (#396) --- development/Profiling BC.ipynb | 1416 +++++++++++++++++++++++++++++++ development/Profiling MC.ipynb | 919 ++++++++++++++++++++ development/Profiling REG.ipynb | 765 +++++++++++++++++ 3 files changed, 3100 insertions(+) create mode 100644 development/Profiling BC.ipynb create mode 100644 development/Profiling MC.ipynb create mode 100644 development/Profiling REG.ipynb diff --git a/development/Profiling BC.ipynb b/development/Profiling BC.ipynb new file mode 100644 index 00000000..44278078 --- /dev/null +++ b/development/Profiling BC.ipynb @@ -0,0 +1,1416 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling NannyML\n", + "\n", + "Sample Notebook to Profile NannyML library.\n", + "\n", + "To run this notebook jupyterlab, nannyml and pyinstrument need to be installed in your python environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717686705107 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import nannyml as nml\n", + "\n", + "from sklearn.datasets import make_classification\n", + "from lightgbm import LGBMClassifier\n", + "from pyinstrument import Profiler\n", + "from math import floor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717686689927 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "OUTPUT_PREFIX = \"Profiling_BC_\"\n", + "\n", + "# Change Values below to make the dataset bigger/smaller\n", + "CHUNK_SIZE = 1000\n", + "N_FEATURES = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717500675067 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "# Create Sample Dataset to benchmark NannyML\n", + "print(\"creating data\")\n", + "RANDOM_STATE = 13\n", + "\n", + "N_SAMPLES = CHUNK_SIZE * 25\n", + "N_INFORMATIVE = floor(N_FEATURES*0.95)\n", + "N_REDUNDANT = floor(N_FEATURES*0.03)\n", + "N_REPEATED = floor(N_FEATURES*0.01)\n", + "N_CLASSES = 2\n", + "N_CLUSTERS_PER_CLASS = 4\n", + "\n", + "x, y = make_classification(\n", + " n_samples=N_SAMPLES,\n", + " n_features=N_FEATURES,\n", + " n_informative=N_INFORMATIVE,\n", + " n_redundant=N_REDUNDANT,\n", + " n_repeated=N_REPEATED,\n", + " n_classes=N_CLASSES,\n", + " random_state=RANDOM_STATE,\n", + " n_clusters_per_class=N_CLUSTERS_PER_CLASS,\n", + " shuffle=True,\n", + " # scale=1.5,\n", + " flip_y=0.05,\n", + " class_sep=2\n", + ")\n", + "\n", + "features_selected = ['f'+str(el+1) for el in range(0, x.shape[1])]\n", + "data = pd.DataFrame(x, columns=features_selected)\n", + "data['y_true'] = y\n", + "del x,y\n", + "print(\"creating model\")\n", + "cat_n = len(features_selected)//7\n", + "for el in features_selected[-cat_n:]:\n", + " data[el] = pd.cut(data[el], bins=5, labels=['a', 'b', 'c', 'd', 'e'])\n", + "\n", + "model = LGBMClassifier(random_state=14)\n", + "model.fit(\n", + " data.loc[:5*CHUNK_SIZE, features_selected],\n", + " data.loc[:5*CHUNK_SIZE, 'y_true']\n", + ")\n", + "data['y_pred'] = model.predict(data.loc[:, features_selected])\n", + "data['y_pred_proba'] = model.predict_proba(data.loc[:, features_selected])[:,1]\n", + "# data.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717687229262 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "reference = data.loc[5*CHUNK_SIZE+1:15*CHUNK_SIZE].reset_index(drop=True)\n", + "analysis = data.loc[15*CHUNK_SIZE:].reset_index(drop=True)\n", + "\n", + "del data" + ] + }, + { + "cell_type": "raw", + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717512258966 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# A small test of the dataset\n", + "calc = nml.SummaryStatsAvgCalculator(\n", + " column_names=['y_true'],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "results.to_df()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709163 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.UnivariateDriftCalculator(\n", + " column_names=features_selected,\n", + " treat_as_categorical=['y_pred'],\n", + " # timestamp_column_name='timestamp',\n", + " continuous_methods=['jensen_shannon'],\n", + " categorical_methods=['jensen_shannon'],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_UNIV1_JS.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709229 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.UnivariateDriftCalculator(\n", + " column_names=features_selected,\n", + " treat_as_categorical=['y_pred'],\n", + " continuous_methods=['kolmogorov_smirnov'],\n", + " categorical_methods=['chi2'],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_UNIV2_KS_CHI2.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709298 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.UnivariateDriftCalculator(\n", + " column_names=features_selected,\n", + " treat_as_categorical=['y_pred'],\n", + " continuous_methods=['hellinger'],\n", + " categorical_methods=['hellinger'],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_UNIV3_HL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709373 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.UnivariateDriftCalculator(\n", + " column_names=features_selected,\n", + " treat_as_categorical=['y_pred'],\n", + " continuous_methods=['wasserstein'],\n", + " categorical_methods=['l_infinity'],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_UNIV4_WS_L8.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709453 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.DataReconstructionDriftCalculator(\n", + " column_names=features_selected,\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_MULTIV_DRE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709538 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.DomainClassifierCalculator(\n", + " feature_column_names=features_selected,\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_MULTIV_DC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709628 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.SummaryStatsSumCalculator(\n", + " column_names=features_selected[:-cat_n],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_STATS_SUM.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709706 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.SummaryStatsAvgCalculator(\n", + " column_names=features_selected[:-cat_n],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_STATS_AVG.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709777 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.SummaryStatsStdCalculator(\n", + " column_names=features_selected[:-cat_n],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_STATS_STD.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709851 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.SummaryStatsMedianCalculator(\n", + " column_names=features_selected[:-cat_n],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_STATS_MED.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709924 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['accuracy',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_ACC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151709993 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['f1',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_F1.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151710070 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['roc_auc',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_AUROC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151710144 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['average_precision',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_AP.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151710223 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['precision',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_PREC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151710292 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['recall',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_RECL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151710368 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['specificity',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_SPEC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151710445 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['confusion_matrix',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_CM.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151710513 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['business_value',],\n", + " business_value_matrix=[[5, -10], [-50, 50]],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_BV.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151710594 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['roc_auc', 'f1', 'accuracy', 'average_precision', 'precision', 'recall', 'specificity', 'confusion_matrix', 'business_value',],\n", + " business_value_matrix=[[5, -10], [-50, 50]],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_binary',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_ALL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151711416 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['accuracy',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_ACC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151711487 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['specificity',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_SPEC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151711566 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['recall',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_PECL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151711644 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['average_precision',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_AP.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151711722 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['precision',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_PREC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151711802 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['f1',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_F1.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151711884 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['roc_auc',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_AUROC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151711961 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['confusion_matrix',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_CM.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151712038 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['business_value',],\n", + " business_value_matrix=[[5, -10], [-50, 50]],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_BV.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717151712124 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['roc_auc', 'f1', 'accuracy', 'average_precision', 'precision', 'recall', 'specificity', 'confusion_matrix', 'business_value',],\n", + " business_value_matrix=[[5, -10], [-50, 50]],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_ALL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1717613761674 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.MissingValuesCalculator(\n", + " column_names=features_selected,\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DQ_MV.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.UnseenValuesCalculator(\n", + " column_names=features_selected[-cat_n:],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DQ_UNS.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/development/Profiling MC.ipynb b/development/Profiling MC.ipynb new file mode 100644 index 00000000..c3a11016 --- /dev/null +++ b/development/Profiling MC.ipynb @@ -0,0 +1,919 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling NannyML\n", + "\n", + "Sample Notebook to Profile NannyML library.\n", + "\n", + "To run this notebook jupyterlab, nannyml and pyinstrument need to be installed in your python environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import nannyml as nml\n", + "\n", + "from sklearn.datasets import make_classification\n", + "from lightgbm import LGBMClassifier\n", + "from pyinstrument import Profiler\n", + "from math import floor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716031571668 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "OUTPUT_PREFIX = \"Profiling_MC_\"\n", + "\n", + "# Change Values below to make the dataset bigger/smaller\n", + "CHUNK_SIZE = 1000\n", + "N_FEATURES = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716035957830 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "print(\"creating data\")\n", + "RANDOM_STATE = 13\n", + "\n", + "N_SAMPLES = CHUNK_SIZE * 25\n", + "N_INFORMATIVE = floor(N_FEATURES*0.95)\n", + "N_REDUNDANT = floor(N_FEATURES*0.03)\n", + "N_REPEATED = floor(N_FEATURES*0.01)\n", + "N_CLASSES = 3\n", + "N_CLUSTERS_PER_CLASS = 4\n", + "\n", + "x, y = make_classification(\n", + " n_samples=N_SAMPLES,\n", + " n_features=N_FEATURES,\n", + " n_informative=N_INFORMATIVE,\n", + " n_redundant=N_REDUNDANT,\n", + " n_repeated=N_REPEATED,\n", + " n_classes=N_CLASSES,\n", + " random_state=RANDOM_STATE,\n", + " n_clusters_per_class=N_CLUSTERS_PER_CLASS,\n", + " shuffle=True,\n", + " # scale=1.5,\n", + " flip_y=0.05,\n", + " class_sep=2\n", + ")\n", + "\n", + "features_selected = ['f'+str(el+1) for el in range(0, x.shape[1])]\n", + "data = pd.DataFrame(x, columns=features_selected)\n", + "data['y_true'] = y\n", + "del x,y\n", + "print(\"creating model\")\n", + "cat_n = len(features_selected)//7\n", + "for el in features_selected[-cat_n:]:\n", + " data[el] = pd.cut(data[el], bins=5, labels=['a', 'b', 'c', 'd', 'e'])\n", + "\n", + "model = LGBMClassifier(random_state=14)\n", + "model.fit(\n", + " data.loc[:5*CHUNK_SIZE, features_selected],\n", + " data.loc[:5*CHUNK_SIZE, 'y_true']\n", + ")\n", + "data['y_pred'] = model.predict(data.loc[:, features_selected])\n", + "preds = model.predict_proba(data.loc[:, features_selected])\n", + "data['y_pred_proba_0'] = preds[:,0]\n", + "data['y_pred_proba_1'] = preds[:,1]\n", + "data['y_pred_proba_2'] = preds[:,2]\n", + "# data.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716036064362 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "reference = data.loc[5*CHUNK_SIZE+1:15*CHUNK_SIZE].reset_index(drop=True)\n", + "analysis = data.loc[15*CHUNK_SIZE:].reset_index(drop=True)\n", + "del data" + ] + }, + { + "cell_type": "raw", + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716036130626 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# A small test of the dataset\n", + "calc = nml.SummaryStatsAvgCalculator(\n", + " column_names=['y_true'],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "results.to_df()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716003879932 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.SummaryStatsAvgCalculator(\n", + " column_names=features_selected[:-cat_n],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_STATS_AVG.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716004223620 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['accuracy',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_multiclass',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_ACC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716004570981 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['f1',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_multiclass',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_F1.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716004929101 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['roc_auc',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_multiclass',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_AUROC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716017719109 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['precision',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_multiclass',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_PREC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716017977451 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['recall',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_multiclass',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_RECL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018229824 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['specificity',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_multiclass',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_SPEC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018456746 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['confusion_matrix',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_multiclass',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_CM.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716036870205 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.CBPE(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['roc_auc', 'f1', 'accuracy', 'precision', 'recall', 'specificity', 'confusion_matrix',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='classification_multiclass',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_CBPE_ALL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018459008 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_multiclass',\n", + " metrics=['accuracy',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_ACC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018459070 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_multiclass',\n", + " metrics=['specificity',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_SPEC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018459118 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_multiclass',\n", + " metrics=['recall',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_PECL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018459215 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_multiclass',\n", + " metrics=['precision',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_PREC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018459261 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_multiclass',\n", + " metrics=['f1',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_F1.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018459307 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_multiclass',\n", + " metrics=['roc_auc',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_AUROC.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018459353 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba={\n", + " 0: 'y_pred_proba_0',\n", + " 1: 'y_pred_proba_1',\n", + " 2: 'y_pred_proba_2'},\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_multiclass',\n", + " metrics=['confusion_matrix',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_CM.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716018459398 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred_proba='y_pred_proba',\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " problem_type='classification_binary',\n", + " metrics=['roc_auc', 'f1', 'accuracy', 'precision', 'recall', 'specificity', 'confusion_matrix',],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_ALL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/development/Profiling REG.ipynb b/development/Profiling REG.ipynb new file mode 100644 index 00000000..d43b35bf --- /dev/null +++ b/development/Profiling REG.ipynb @@ -0,0 +1,765 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling NannyML\n", + "\n", + "Sample Notebook to Profile NannyML library.\n", + "\n", + "To run this notebook jupyterlab, nannyml and pyinstrument need to be installed in your python environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import nannyml as nml\n", + "\n", + "from sklearn.datasets import make_regression\n", + "from lightgbm import LGBMRegressor\n", + "from pyinstrument import Profiler\n", + "from math import floor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "OUTPUT_PREFIX = \"Profiling_REG_\"\n", + "\n", + "# Change Values below to make the dataset bigger/smaller\n", + "CHUNK_SIZE = 1000\n", + "N_FEATURES = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716044745298 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "print(\"creating data\")\n", + "RANDOM_STATE = 13\n", + "\n", + "N_SAMPLES = CHUNK_SIZE * 25\n", + "N_INFORMATIVE = floor(N_FEATURES*0.95)\n", + "N_REDUNDANT = floor(N_FEATURES*0.03)\n", + "N_REPEATED = floor(N_FEATURES*0.01)\n", + "\n", + "x, y = make_regression(\n", + " n_samples=N_SAMPLES,\n", + " n_features=N_FEATURES,\n", + " n_informative=N_INFORMATIVE,\n", + " random_state=RANDOM_STATE,\n", + " shuffle=True,\n", + " bias=10_000\n", + ")\n", + "\n", + "features_selected = ['f'+str(el+1) for el in range(0, x.shape[1])]\n", + "data = pd.DataFrame(x, columns=features_selected)\n", + "data['y_true'] = y\n", + "del x,y\n", + "print(\"creating model\")\n", + "cat_n = len(features_selected)//7\n", + "for el in features_selected[-cat_n:]:\n", + " data[el] = pd.cut(data[el], bins=5, labels=['a', 'b', 'c', 'd', 'e'])\n", + "\n", + "model = LGBMRegressor(random_state=14)\n", + "model.fit(\n", + " data.loc[:5*CHUNK_SIZE, features_selected],\n", + " data.loc[:5*CHUNK_SIZE, 'y_true']\n", + ")\n", + "data['y_pred'] = model.predict(data.loc[:, features_selected])\n", + "# data.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716044871927 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "reference = data.loc[5*CHUNK_SIZE+1:15*CHUNK_SIZE].reset_index(drop=True)\n", + "analysis = data.loc[15*CHUNK_SIZE:].reset_index(drop=True)\n", + "del data" + ] + }, + { + "cell_type": "raw", + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716044939472 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# A small test of the dataset\n", + "calc = nml.SummaryStatsAvgCalculator(\n", + " column_names=['y_true'],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "results.to_df()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040351673 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.SummaryStatsAvgCalculator(\n", + " column_names=features_selected[:-cat_n],\n", + " chunk_size=CHUNK_SIZE\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_STATS_AVG.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040361145 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "estimator = nml.DLE(\n", + " feature_column_names=features_selected,\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['mae',],\n", + " chunk_size=CHUNK_SIZE,\n", + " tune_hyperparameters=False\n", + ")\n", + "estimator.fit(reference)\n", + "results = estimator.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_MAE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040369886 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "estimator = nml.DLE(\n", + " feature_column_names=features_selected,\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['mape',],\n", + " chunk_size=CHUNK_SIZE,\n", + " tune_hyperparameters=False\n", + ")\n", + "estimator.fit(reference)\n", + "results = estimator.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_MAPE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040379499 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "estimator = nml.DLE(\n", + " feature_column_names=features_selected,\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['mse',],\n", + " chunk_size=CHUNK_SIZE,\n", + " tune_hyperparameters=False\n", + ")\n", + "estimator.fit(reference)\n", + "results = estimator.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_MSE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040389443 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "estimator = nml.DLE(\n", + " feature_column_names=features_selected,\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['msle',],\n", + " chunk_size=CHUNK_SIZE,\n", + " tune_hyperparameters=False\n", + ")\n", + "estimator.fit(reference)\n", + "results = estimator.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_MSLE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040398616 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "estimator = nml.DLE(\n", + " feature_column_names=features_selected,\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['rmse'],\n", + " chunk_size=CHUNK_SIZE,\n", + " tune_hyperparameters=False\n", + ")\n", + "estimator.fit(reference)\n", + "results = estimator.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_RMSE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040407901 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "estimator = nml.DLE(\n", + " feature_column_names=features_selected,\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['rmsle'],\n", + " chunk_size=CHUNK_SIZE,\n", + " tune_hyperparameters=False\n", + ")\n", + "estimator.fit(reference)\n", + "results = estimator.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_RMLSE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040455072 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "estimator = nml.DLE(\n", + " feature_column_names=features_selected,\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['mae', 'mape', 'mse', 'msle', 'rmse', 'rmsle'],\n", + " chunk_size=CHUNK_SIZE,\n", + " tune_hyperparameters=False\n", + ")\n", + "estimator.fit(reference)\n", + "results = estimator.estimate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_DLE_ALL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040457886 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['rmsle'],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='regression',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_RMSLE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040460160 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['rmse',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='regression',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_RMSE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040462955 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['msle',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='regression',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_MSLE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040465224 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['mse',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='regression',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_MSE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040467555 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['mape',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='regression',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_MAPE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040469946 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['mae',],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='regression',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_MAE.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "gather": { + "logged": 1716040478351 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "profiler = Profiler()\n", + "profiler.start()\n", + "\n", + "calc = nml.PerformanceCalculator(\n", + " y_pred='y_pred',\n", + " y_true='y_true',\n", + " metrics=['mae', 'mape', 'mse', 'msle', 'rmse', 'rmsle'],\n", + " chunk_size=CHUNK_SIZE,\n", + " problem_type='regression',\n", + ")\n", + "calc.fit(reference)\n", + "results = calc.calculate(analysis)\n", + "\n", + "profiler.stop()\n", + "profiler.write_html(f'{OUTPUT_PREFIX}_{CHUNK_SIZE//1000}K_PERF_ALL.html')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernel_info": { + "name": "python310-sdkv2" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}