From af9110091d6889418ba7d53aad242bb8f00cd168 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20de=20Ryckel?=
Date: Sat, 20 Apr 2024 17:24:34 +0700
Subject: [PATCH] ML part 2

---
 docs/blog.html                                |   4 +-
 docs/machine-learning-part2.html              |   4 +-
 docs/search.json                              | 121 +++++++++++++++++-
 docs/sitemap.xml                              |   6 +-
 .../machine-learning-part2/xgboost/index.qmd  |  69 ++++++----
 5 files changed, 173 insertions(+), 31 deletions(-)

diff --git a/docs/blog.html b/docs/blog.html
index a1f6d43..2fe8af6 100644
--- a/docs/blog.html
+++ b/docs/blog.html
@@ -260,7 +260,7 @@

-
+
-4 min
+5 min
diff --git a/docs/machine-learning-part2.html b/docs/machine-learning-part2.html
index 3a84b84..32d7b77 100644
--- a/docs/machine-learning-part2.html
+++ b/docs/machine-learning-part2.html
@@ -168,7 +168,7 @@

Series: Machine Learning - Part 2

-
+
-4 min
+5 min
diff --git a/docs/search.json b/docs/search.json index c77b281..55b4052 100644 --- a/docs/search.json +++ b/docs/search.json @@ -690,7 +690,7 @@ "href": "machine-learning-part1.html", "title": "Series: Machine Learning - Part 1", "section": "", - "text": "KNN\n\n\n\n\n\nUsing KNN in both python and R\n\n\n\n\n\n\nNov 14, 2023\n\n\n12 min\n\n\n\n\n\n\n\n\nNaive-Bayes - Part 1\n\n\n\n\n\nMaking Naive-Bayes work in R\n\n\n\n\n\n\nMay 16, 2023\n\n\n1 min\n\n\n\n\n\n\n\n\nLinear Regression\n\n\n\n\n\nA dive into the math behind the linear regression algorithm.\n\n\n\n\n\n\nApr 14, 2023\n\n\n5 min\n\n\n\n\n\n\n\n\nIntro to Kmeans\n\n\n\n\n\n\n\n\n\n\n\n\nOct 31, 2022\n\n\n8 min\n\n\n\n\n\n\nNo matching items" + "text": "Defining Success\n\n\n\n\n\n\n\n\n\n\n\n\nApr 16, 2024\n\n\n1 min\n\n\n\n\n\n\n\n\nKNN\n\n\n\n\n\nUsing KNN in both python and R\n\n\n\n\n\n\nNov 14, 2023\n\n\n12 min\n\n\n\n\n\n\n\n\nNaive-Bayes - Part 1\n\n\n\n\n\nMaking Naive-Bayes work in R\n\n\n\n\n\n\nMay 16, 2023\n\n\n1 min\n\n\n\n\n\n\n\n\nLinear Regression\n\n\n\n\n\nA dive into the math behind the linear regression algorithm.\n\n\n\n\n\n\nApr 14, 2023\n\n\n5 min\n\n\n\n\n\n\n\n\nIntro to Kmeans\n\n\n\n\n\n\n\n\n\n\n\n\nOct 31, 2022\n\n\n8 min\n\n\n\n\n\n\n\n\nKmeans with regime changes\n\n\n\n\n\n\n\n\n\n\n\n\nOct 12, 2022\n\n\n6 min\n\n\n\n\n\n\nNo matching items" }, { "objectID": "quant-part2.html", @@ -1083,5 +1083,124 @@ "title": "Series", "section": "", "text": "A series of posts on machine learning algorithms that focuses on trees, bagging and boosting." + }, + { + "objectID": "posts/machine-learning-part2/xgboost/index.html", + "href": "posts/machine-learning-part2/xgboost/index.html", + "title": "Xgboost", + "section": "", + "text": "Using Xgboost from a quant perspective. We do a whole cycle of model building on a financial time-series. We’ll again show how to do it with both framework Sklearn for Python and tidymodel for R.\nWe have taken a stock, but this can be applied on an index, or commodity futures, etc.\n\nSetting up the data frame\nWe are just loading the data set and doing the initial cleaning so the features engineering can be achieved smoothly.\n\nPythonR\n\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt \nimport numpy as np\n\ndf = pd.read_csv('../../../raw_data/AA.csv')\ndf['date'] = pd.to_datetime(df['date'])\ndf = df.sort_values(by = 'date', inplace = False)\ndf.set_index('date', inplace=True)\n\ndf = df[['open', 'high', 'low', 'close', 'volume', 'adjClose']]\n\ndf.head()\n\n open high low close volume adjClose\ndate \n2001-01-02 80.50 80.95 76.60 77.50 1592010 57.23\n2001-01-03 77.50 80.50 75.24 78.55 2011985 58.01\n2001-01-04 78.55 81.25 77.65 81.10 1992468 59.89\n2001-01-05 81.10 81.70 78.85 79.60 1623845 58.78\n2001-01-08 79.60 85.91 79.00 80.80 3073616 59.67\n\ndf.describe()\n\n open high ... volume adjClose\ncount 5821.000000 5821.000000 ... 5.821000e+03 5821.000000\nmean 46.372343 47.097133 ... 6.519558e+06 40.404558\nstd 24.755757 25.075361 ... 5.452542e+06 18.945874\nmin 5.500000 5.950000 ... 4.254680e+05 5.360000\n25% 24.990000 25.420000 ... 2.656970e+06 23.620000\n50% 38.260000 38.780000 ... 5.129900e+06 36.270000\n75% 69.210000 69.980000 ... 8.773242e+06 56.910000\nmax 115.010000 117.190000 ... 
1.007518e+08 96.360000\n\n[8 rows x 6 columns]\n\ndf.isnull().sum()\n\nopen 0\nhigh 0\nlow 0\nclose 0\nvolume 0\nadjClose 0\ndtype: int64\n\n\n\n\n\nlibrary(readr)\nlibrary(dplyr)\nlibrary(skimr)\n\ndfr = read_csv('../../../raw_data/AA.csv') |> \n select(date, open, high, low, close, volume, adjClose)\n\nskim(dfr)\n\n\nData summary\n\n\nName\ndfr\n\n\nNumber of rows\n5821\n\n\nNumber of columns\n7\n\n\n_______________________\n\n\n\nColumn type frequency:\n\n\n\nDate\n1\n\n\nnumeric\n6\n\n\n________________________\n\n\n\nGroup variables\nNone\n\n\n\nVariable type: Date\n\n\n\n\n\n\n\n\n\n\n\n\nskim_variable\nn_missing\ncomplete_rate\nmin\nmax\nmedian\nn_unique\n\n\n\n\ndate\n0\n1\n2001-01-02\n2024-02-22\n2012-07-27\n5821\n\n\n\nVariable type: numeric\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nskim_variable\nn_missing\ncomplete_rate\nmean\nsd\np0\np25\np50\np75\np100\nhist\n\n\n\n\nopen\n0\n1\n46.37\n24.76\n5.50\n24.99\n38.26\n69.21\n115.01\n▇▇▃▅▁\n\n\nhigh\n0\n1\n47.10\n25.08\n5.95\n25.42\n38.78\n69.98\n117.19\n▇▇▃▅▁\n\n\nlow\n0\n1\n45.57\n24.38\n5.16\n24.53\n37.72\n68.03\n111.88\n▇▇▃▅▁\n\n\nclose\n0\n1\n46.33\n24.74\n5.48\n25.01\n38.26\n69.04\n113.78\n▇▇▃▅▁\n\n\nvolume\n0\n1\n6519558.18\n5452542.48\n425468.00\n2656970.00\n5129900.00\n8773242.00\n100751769.00\n▇▁▁▁▁\n\n\nadjClose\n0\n1\n40.40\n18.95\n5.36\n23.62\n36.27\n56.91\n96.36\n▆▇▅▅▁\n\n\n\n\n\n\n\n\n\n\nFeature engineering\n\nPythonR\n\n\n\ndf['returns'] = np.log(df['adjClose'] / df['adjClose'].shift(1))\ndf['ret_1m'] = df['returns'].rolling(20).sum()\n\nfeature_list = []\n\nfor r in range(11, 81, 5): \n df['ret_' + str(r)] = df['returns'].rolling(r).sum()\n df['std_' + str(r)] = df['returns'].rolling(r).std()\n feature_list.append('ret_' + str(r))\n feature_list.append('std_' + str(r))\n\ndf1a = df\n\ndf1a['o_c'] = (df1a['open'] - df1a['close']) / df1a['close']\ndf1a['h_l'] = (df1a['high'] - df1a['low']) / df1a['close']\ndf1a['ret_21d'] = np.log(df1a['close'] / df1a['close'].shift(21))\ndf1a['roll_sd_ret21d_1Y'] = df1a['ret_21d'].rolling(window = 251).std()\ndf1a['volum_sma200'] = df1a['volume'].rolling(window = 200).mean()\ndf1a['perc_above_volu_sma200'] = np.log(df1a['volume'] / df1a['volum_sma200'])\ndf1a['roll_sd_volum_1Y'] = df1a['volume'].rolling(window = 251).std()\ndf1a['sma50'] = df1a['close'].rolling(window = 50).mean()\ndf1a['perc_above_sma50'] = np.log(df1a['close'] / df1a['sma50'])\ndf1a['sma200'] = df1a['close'].rolling(window = 200).mean()\ndf1a['perc_above_sma200'] = np.log(df1a['close'] / df1a['sma200'])\ndf1a['roll_corr_sma50_sma200'] = df1a['sma200'].rolling(window = 252).corr(df1a['sma50'])\n\n# setting up a target variable. \n# is the stock above 5% in 2 weeks time. 
\ndf1a['target'] = np.where(df1a['close'].shift(-41) > 1.01 * df1a['close'], 1, 0)\n\ndf1a = df1a.drop(['open', 'high', 'low', 'close', 'adjClose', 'volume', 'sma50', 'sma200', 'volum_sma200', 'returns'], axis = 1)\ndf1a = df1a.dropna()\n\ntarget = df1a['target']\ndf1a = df1a.drop(['target'], axis = 1)\n\n\ndf.dropna(inplace = True) \n\ndf1a.values\n\narray([[ 0.15564846, 0.21637398, 0.04071166, ..., 0.05192485,\n -0.27383066, 0.86749277],\n [ 0.1761608 , 0.22536981, 0.03979403, ..., 0.03793276,\n -0.28789777, 0.87185623],\n [ 0.123086 , 0.19948936, 0.0422375 , ..., 0.00811607,\n -0.31777658, 0.87606062],\n ...,\n [-0.03425118, -0.09976226, 0.04401878, ..., -0.09835741,\n -0.12808572, 0.89240416],\n [-0.05395427, 0.04088161, 0.03663959, ..., -0.05392038,\n -0.0805411 , 0.88992296],\n [-0.06992937, 0.00469569, 0.03579207, ..., -0.06330803,\n -0.08669331, 0.8874239 ]])\n\n\n\n\n\n\n\n\n\n\nBase Model\n\nPythonR\n\n\n\nfrom sklearn.model_selection import (train_test_split, RandomizedSearchCV, TimeSeriesSplit)\n\nx_train, x_test, y_train, y_test = train_test_split(df1a, target, test_size = 0.2, random_state = 42, shuffle = False)\n\nprint(f\"Train set size is {len(x_train)} and test set size is {len(x_test)}\")\n\nTrain set size is 4296 and test set size is 1075\n\n\nLet’s now fit a basic model without any tuning\n\nfrom xgboost import XGBClassifier\n\nmodel_xgb = XGBClassifier(verbosity = 1, random_state = 42)\nmodel_xgb.fit(x_train, y_train)\n\nXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=42, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. XGBClassifieriFittedXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=42, ...) 
\n\n\n# and now go onto prediction \ny_pred = model_xgb.predict(x_test)\n\n# or we can also use probability prediction\ny_pred_proba = model_xgb.predict_proba(x_test)\n\nAnd we can check our result on this basic xgboost model\n\nfrom sklearn.metrics import accuracy_score, roc_auc_score, roc_curve\nfrom sklearn.metrics import ConfusionMatrixDisplay, classification_report, RocCurveDisplay\n\nacc_train = accuracy_score(y_train, model_xgb.predict(x_train))\nacc_test = accuracy_score(y_test, model_xgb.predict(x_test))\n\n\ndisp = ConfusionMatrixDisplay.from_estimator(\n model_xgb,\n x_test,\n y_test,\n display_labels = model_xgb.classes_,\n cmap=plt.cm.Blues\n )\ndisp.ax_.set_title('Confusion matrix')\nplt.show()\n\n\n\n\n\nprint(classification_report(y_test, y_pred))\n\n precision recall f1-score support\n\n 0 0.49 0.85 0.62 537\n 1 0.46 0.13 0.21 538\n\n accuracy 0.49 1075\n macro avg 0.48 0.49 0.42 1075\nweighted avg 0.48 0.49 0.42 1075\n\n\nAnd the ROC curve\n\n#plt.clf()\ndisp_roc = RocCurveDisplay.from_estimator(\n model_xgb,\n x_test,\n y_test,\n name='XGBoost')\ndisp_roc.ax_.set_title('ROC Curve')\nplt.plot([0,1], [0,1], linestyle='--')\nplt.show()\n\n\n\n\n\n\n\n\n\n\n\n\nHyperparameters and fine tuning\n\nPythonR\n\n\n\nfrom sklearn.model_selection import TimeSeriesSplit\nfrom sklearn.model_selection import RandomizedSearchCV\n\ntscv = TimeSeriesSplit(n_splits = 5, gap = 23)\nmodel_xgb.get_params()\n\n{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': 1}\n\nparam_grid = {'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],\n 'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],\n 'min_child_weight': [1, 3, 5, 7],\n 'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4],\n 'colsample_bytree': [0.3, 0.4, 0.5 , 0.7]}\n \nxv_xgb = RandomizedSearchCV(model_xgb, param_grid, n_iter = 100, scoring = 'f1', cv = tscv, verbose = 1)\n\nxv_xgb.fit(x_train, y_train, verbose = 1)\n\nRandomizedSearchCV(cv=TimeSeriesSplit(gap=23, max_train_size=None, n_splits=5, test_size=None),\n estimator=XGBClassifier(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, device=None,\n early_stopping_rounds=None,\n enable_categorical=False,\n eval_metric=None, feature_types=None,\n gamma=None, grow_policy=...\n monotone_constraints=None,\n multi_strategy=None,\n n_estimators=None, n_jobs=None,\n num_parallel_tree=None,\n random_state=42, ...),\n n_iter=100,\n param_distributions={'colsample_bytree': [0.3, 0.4, 0.5,\n 0.7],\n 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],\n 'learning_rate': [0.05, 0.1, 0.15, 0.2,\n 0.25, 0.3],\n 'max_depth': [3, 4, 5, 6, 8, 10, 12,\n 15],\n 'min_child_weight': [1, 3, 5, 7]},\n scoring='f1', verbose=1)In a Jupyter environment, 
please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.  RandomizedSearchCV?Documentation for RandomizedSearchCViFittedRandomizedSearchCV(cv=TimeSeriesSplit(gap=23, max_train_size=None, n_splits=5, test_size=None),\n estimator=XGBClassifier(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, device=None,\n early_stopping_rounds=None,\n enable_categorical=False,\n eval_metric=None, feature_types=None,\n gamma=None, grow_policy=...\n monotone_constraints=None,\n multi_strategy=None,\n n_estimators=None, n_jobs=None,\n num_parallel_tree=None,\n random_state=42, ...),\n n_iter=100,\n param_distributions={'colsample_bytree': [0.3, 0.4, 0.5,\n 0.7],\n 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],\n 'learning_rate': [0.05, 0.1, 0.15, 0.2,\n 0.25, 0.3],\n 'max_depth': [3, 4, 5, 6, 8, 10, 12,\n 15],\n 'min_child_weight': [1, 3, 5, 7]},\n scoring='f1', verbose=1) estimator: XGBClassifierXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=42, ...) XGBClassifierXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=42, ...) 
\n\nxv_xgb.best_params_\n\n{'min_child_weight': 7, 'max_depth': 12, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.7}\n\nxv_xgb.best_score_\n\n0.5009456435677468\n\n\nNow we need to train the model based on the best paramaters fromt the cross-validation process.\n\nfrom sklearn.model_selection import cross_val_score\n\nmodel_xgb_tuned = XGBClassifier(**xv_xgb.best_params_)\n\nmodel_xgb_tuned.fit(x_train, y_train, \n eval_set = [(x_train, y_train), (x_test, y_test)], \n #eval_metric = 'precision', \n verbose = True)\n\nXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=0.7, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=0.1, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=0.1, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=12, max_leaves=None,\n min_child_weight=7, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. XGBClassifieriFittedXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=0.7, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=0.1, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=0.1, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=12, max_leaves=None,\n min_child_weight=7, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) \n\neval_results = model_xgb_tuned.evals_result()\n#eval_results\n\nscore = cross_val_score(model_xgb_tuned, x_train, y_train, cv = tscv)\nprint(f'Mean CV score for: {score.mean():0.4}')\n\nMean CV score for: 0.4961\n\n\n\n\n\n\n\n\n\n\nFeature importance\n\nPythonR" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "", + "text": "This post is about using xgboost on a time-series using both R with the tidymodel framework and python. It is part of a series of articles aiming at translating python timeseries blog articles into their tidymodels equivalent.\nThe raw data is quite simple as it is energy consumption based on an hourly consumption. Original article can be found here. Minimal changes were made to better fit current python practices.\nXgboost is part of the ensemble machine learning algorithms. It can be used for both regression and classification. There are few issues in using Xgboost with time-series. This article is taking a Xgboost post in python and also translating with the new R tidymodel framework." 
+ }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\n\n# setting up main R libraries to start \n\nlibrary(glue)\nlibrary(readr)\nlibrary(dplyr)\nlibrary(ggplot2)\ndf0 <- read_csv(\"../../../raw_data/AEP_hourly.csv\")\n# let's have a quick look at what we are dealing with\nglimpse(df0)\n\nRows: 121,273\nColumns: 2\n$ Datetime <dttm> 2004-12-31 01:00:00, 2004-12-31 02:00:00, 2004-12-31 03:00:0…\n$ AEP_MW <dbl> 13478, 12865, 12577, 12517, 12670, 13038, 13692, 14297, 14719…\n\n\nThere are only 2 variables. The Datetime being the only independ variable. And the energy consumption labelled as AEP_MW being our variable to predict.\n\n# and graphically - \n# just using a couple of years to get an idea \nggplot(df0 |> filter(Datetime > \"2014-01-01\" & Datetime < \"2016-01-01\"), aes(x =Datetime, y=AEP_MW )) + geom_line(color = \"light blue\")\n\n\n\n\nFigure 1: Graphical glimpse of our raw data\n\n\n\n\nAs Datetime is our only input variable, we’ll use the usual tricks of breaking it down into week number, months, etc. I am doing it slightly differently than in the python version here as I will first create the new time related variables then I will split it into training and testing.\n\nlibrary(lubridate)\ndf <- df0 |> \n mutate(hour = hour(Datetime), \n day_of_week = wday(Datetime), \n day_of_year = yday(Datetime), \n day_of_month = mday(Datetime), \n week_of_year = isoweek(Datetime), \n month = month(Datetime), \n quarter = quarter(Datetime), \n year = isoyear(Datetime)\n ) \n# another glimpse now. \nglimpse(df)\n\nRows: 121,273\nColumns: 10\n$ Datetime <dttm> 2004-12-31 01:00:00, 2004-12-31 02:00:00, 2004-12-31 03:…\n$ AEP_MW <dbl> 13478, 12865, 12577, 12517, 12670, 13038, 13692, 14297, 1…\n$ hour <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…\n$ day_of_week <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …\n$ day_of_year <dbl> 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 36…\n$ day_of_month <int> 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 3…\n$ week_of_year <dbl> 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 5…\n$ month <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1…\n$ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …\n$ year <dbl> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 200…\n\n\nAlthough, there are only 2 variables, there are over 120,000 rows of data. That’s non-negligible." 
+ }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using python", + "text": "Using python\nThis is the code from the original post.\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\npy_df = pd.read_csv(\"../../../raw_data/AEP_hourly.csv\", index_col = [0], parse_dates = [0])\npy_df.tail()\n\n AEP_MW\nDatetime \n2018-01-01 20:00:00 21089.0\n2018-01-01 21:00:00 20999.0\n2018-01-01 22:00:00 20820.0\n2018-01-01 23:00:00 20415.0\n2018-01-02 00:00:00 19993.0\n\n#plt.plot(df0)\nsplit_date = '01-jan-2016'\npy_df_train = py_df.loc[py_df.index <= split_date].copy()\npy_df_test = py_df.loc[py_df.index > split_date].copy()\n\nThe author of the python blog first created a train / test set then created a function to add the variables then applied that function to both sets. This is a very valid way of doing things when steps include normalizing and/or scaling data before applying our ML algorithms as we don’t want any leakage from our training set into our testing set.\n\n# Create features of df\ndef create_features(df, label = None): \n df['date'] = df.index \n df['hour'] = df['date'].dt.hour\n df['day_of_week'] = df['date'].dt.dayofweek\n df['day_of_year'] = df['date'].dt.dayofyear \n df['day_of_month'] = df['date'].dt.day \n df['week_of_year'] = df['date'].dt.isocalendar().week \n df['month'] = df['date'].dt.month \n df['quarter'] = df['date'].dt.quarter \n df['year'] = df['date'].dt.year\n \n X = df[['hour', 'day_of_week', 'day_of_year', 'day_of_month', 'week_of_year', 'month', 'quarter', 'year']]\n \n if label: \n y = df[label]\n return X, y\n \n return X\n\nCompare this way of constructing variables to the much easier and more elegant tidyverse’s way of cleaning and creating variables. The dplyr package really makes it painless to wrangle data." + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-1", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-1", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\nRsample is the tidymodel package that deals with creating training and testing sets. There are really many methods available to do this, but we stick to the same methods provided in the original blog post. There are out-of-the-box methods to deal with timeseries like in this case.\n\nlibrary(rsample)\nprop_split = 1 - (nrow(df |> filter(Datetime > \"2016-01-01\")) / nrow(df))\ndf_split <- initial_time_split(df |> arrange(Datetime), prop = prop_split)\ndf_train <- training(df_split)\ndf_test <- testing(df_split)" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-1", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-1", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using Python", + "text": "Using Python\n\npy_x_train, py_y_train = create_features(py_df_train, label = \"AEP_MW\")\npy_x_test, py_y_test = create_features(py_df_test, label = \"AEP_MW\")\n#When running xgboost, I got an issue with one of the type of the variable. \n# Let's fix this. 
\npy_x_train.info()\n\n<class 'pandas.core.frame.DataFrame'>\nDatetimeIndex: 98594 entries, 2004-12-31 01:00:00 to 2015-01-02 00:00:00\nData columns (total 8 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 hour 98594 non-null int32 \n 1 day_of_week 98594 non-null int32 \n 2 day_of_year 98594 non-null int32 \n 3 day_of_month 98594 non-null int32 \n 4 week_of_year 98594 non-null UInt32\n 5 month 98594 non-null int32 \n 6 quarter 98594 non-null int32 \n 7 year 98594 non-null int32 \ndtypes: UInt32(1), int32(7)\nmemory usage: 3.9 MB\n\npy_x_train = py_x_train.astype(np.int64)\npy_x_test = py_x_test.astype(np.int64)\npy_x_train.info()\n\n<class 'pandas.core.frame.DataFrame'>\nDatetimeIndex: 98594 entries, 2004-12-31 01:00:00 to 2015-01-02 00:00:00\nData columns (total 8 columns):\n # Column Non-Null Count Dtype\n--- ------ -------------- -----\n 0 hour 98594 non-null int64\n 1 day_of_week 98594 non-null int64\n 2 day_of_year 98594 non-null int64\n 3 day_of_month 98594 non-null int64\n 4 week_of_year 98594 non-null int64\n 5 month 98594 non-null int64\n 6 quarter 98594 non-null int64\n 7 year 98594 non-null int64\ndtypes: int64(8)\nmemory usage: 6.8 MB" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-2", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-2", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\nAgain this is a very straightforward xgboost application to a dataset. No fine tuning of models, recipe, etc.\n\nlibrary(parsnip)\nmodel_xgboost <- boost_tree(stop_iter = 50L, trees=1000L) |> \n set_engine(\"xgboost\") |>\n set_mode(\"regression\")\n \nfit_xgboost <- model_xgboost |> \n fit(AEP_MW ~., data = df_train %>% select(-Datetime))\nfit_xgboost\n\nparsnip model object\n\n##### xgb.Booster\nraw: 4.7 Mb \ncall:\n xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n subsample = 1), data = x$data, nrounds = 1000L, watchlist = x$watchlist, \n verbose = 0, early_stopping_rounds = 50L, nthread = 1, objective = \"reg:squarederror\")\nparams (as set within xgb.train):\n eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"reg:squarederror\", validate_parameters = \"TRUE\"\nxgb.attributes:\n best_iteration, best_msg, best_ntreelimit, best_score, niter\ncallbacks:\n cb.evaluation.log()\n cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, \n verbose = verbose)\n# of features: 8 \nniter: 1000\nbest_iteration : 1000 \nbest_ntreelimit : 1000 \nbest_score : 242.3155 \nbest_msg : [1000] training-rmse:242.315489 \nnfeatures : 8 \nevaluation_log:\n iter training_rmse\n <num> <num>\n 1 11175.8839\n 2 7906.5875\n--- \n 999 242.5272\n 1000 242.3155" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-2", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-2", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using Python", + "text": "Using Python\n\nfrom xgboost.sklearn import XGBRegressor\npy_xgboost_mod = XGBRegressor(n_estimator = 1000, early_stopping_rounds = 50)\npy_xgboost_mod.fit(py_x_train, py_y_train, \n eval_set = [(py_x_train, py_y_train), (py_x_test, py_y_test)], \n verbose = 
True)\n\nXGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=50,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimator=1000, n_estimators=None,\n n_jobs=None, num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. XGBRegressoriFittedXGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=50,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimator=1000, n_estimators=None,\n n_jobs=None, num_parallel_tree=None, ...)" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-3", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-3", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\n2 ways to do this … (actually more than 2 ways, but here are 2 main ways.). First one is a straight table using the xgboost library itself.\n\nlibrary(xgboost)\nxgb.importance(model = fit_xgboost$fit)\n\n Feature Gain Cover Frequency\n <char> <num> <num> <num>\n1: day_of_year 0.361826849 0.455387001 0.2800303942\n2: hour 0.336853508 0.125331328 0.2374139102\n3: year 0.120130183 0.129691117 0.2000679018\n4: day_of_week 0.105250961 0.073258066 0.1489636887\n5: week_of_year 0.047082964 0.097216236 0.0462379151\n6: day_of_month 0.027801172 0.116483820 0.0864293336\n7: month 0.001054364 0.002632432 0.0008568565\n\n#detach(xgboost)\n\nAnd also a graphic way.\n\nlibrary(vip)\nfit_xgboost %>%\n vip(geom = \"point\")" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-3", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-3", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using python", + "text": "Using python\n\nfrom xgboost import plot_importance, plot_tree\n_ = plot_importance(py_xgboost_mod, height=0.9)\n\n\n\n\nI am a bit confused here in the output of the python graph with F-score vs the output of the R graph with importance." + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-4", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-4", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\nGraphing predicted power output vs actual power output could be a first way to see how our model fares in its predictions. 
So let’s graph our datetime vs power ouput for both actual and predicted.\n\nlibrary(tibble) # for the add_column \nlibrary(parsnip)\ndf_test1 <- add_column(df_test, predict(fit_xgboost, new_data = df_test)) \nggplot(df_test1, aes(x= Datetime, y = AEP_MW)) + \n geom_line(color = \"blue\") + \n geom_line(aes(y = .pred), color = \"yellow\", alpha = 0.5) + \n labs(title = \"Energy Consumption in 2016-2018 (in MWh)\", y = \"Hourly consumption\")\n\n\n\n\nFigure 2: Actual Vs Predicted power consumption for 2016-2018\n\n\n\n\nWe can already see that we are not really modeling well the peaks and through.\nWe could get slightly more granular and try to see whats going on.\n\nggplot(df_test1 %>% filter(Datetime > \"2016-01-01\" & Datetime < \"2016-02-28\"), aes(x= Datetime, y = AEP_MW)) + \n geom_line(color = \"blue\") + \n geom_line(aes(y = .pred), color = \"yellow3\", alpha = 0.8)\n\n\n\n\nFigure 3: Actual Vs Predicted power consumption\n\n\n\n\nWe are clearly off there on the second half of February.\nNow, we can use the yardstick package to get numerical values to assess our model on the test set.\n\nlibrary(yardstick)\n# calculating the RMSE (root mean square error)\nrmse(df_test1, truth = AEP_MW, estimate = .pred, na_rm = TRUE)\n\n# A tibble: 1 × 3\n .metric .estimator .estimate\n <chr> <chr> <dbl>\n1 rmse standard 2067.\n\n# calculating the MAE (mean absolute error)\nmae(df_test1, truth = AEP_MW, estimate = .pred)\n\n# A tibble: 1 × 3\n .metric .estimator .estimate\n <chr> <chr> <dbl>\n1 mae standard 1495.\n\n# calculating the MAPE (mean absolute percent error)\nmape(df_test1, truth = AEP_MW, estimate = .pred)\n\n# A tibble: 1 × 3\n .metric .estimator .estimate\n <chr> <chr> <dbl>\n1 mape standard 10.0\n\n# actually much easier to use the metric_set() function !\nxgboost_mod_metrics <- metric_set(rmse, mae, mape)\nxgboost_mod_metrics(df_test1, truth = AEP_MW, estimate = .pred) \n\n# A tibble: 3 × 3\n .metric .estimator .estimate\n <chr> <chr> <dbl>\n1 rmse standard 2067. \n2 mae standard 1495. \n3 mape standard 10.0" + }, + { + "objectID": "machine-learning-part2.html", + "href": "machine-learning-part2.html", + "title": "Series: Machine Learning - Part 2", + "section": "", + "text": "Xgboost\n\n\n\n\n\n\n\n\n\n\n\n\nApr 15, 2024\n\n\n5 min\n\n\n\n\n\n\n\n\nTranslating Python Part 1 - Xgboost with Time-Series\n\n\n\n\n\n\n\n\n\n\n\n\nOct 1, 2022\n\n\n7 min\n\n\n\n\n\n\nNo matching items" + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html", + "href": "posts/machine-learning-part1/metrics/index.html", + "title": "Defining Success", + "section": "", + "text": "When evaluating models for a given ML algorithm, we need to define in advance what would be our metric to measure success.\nThere are so many ways out there to measure which hyper-parameters performed best for our mode. We’ll bring up some of the most used one." + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html#accuracy", + "href": "posts/machine-learning-part1/metrics/index.html#accuracy", + "title": "Defining Success", + "section": "Accuracy", + "text": "Accuracy\nShortcomings:\n\nfor imbalanced dataset, we can have good accuracy by just predicting most observation with the most frequent class. 
For instance in the case of a rare disease or big financial meltdown, we can just predict" + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html#precision", + "href": "posts/machine-learning-part1/metrics/index.html#precision", + "title": "Defining Success", + "section": "Precision", + "text": "Precision\nIf you call it true, is it indeed true? In other words, the proportion of predicted positive that are actually positive." + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html#recall", + "href": "posts/machine-learning-part1/metrics/index.html#recall", + "title": "Defining Success", + "section": "Recall", + "text": "Recall\nIf there is a positive, did the model predict a positive." + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html#f1-score", + "href": "posts/machine-learning-part1/metrics/index.html#f1-score", + "title": "Defining Success", + "section": "F1 score", + "text": "F1 score\nIt is the harmonic mean of both precision and recall. The harmonic mean penalizes model that have very low precision or recall. Which wouldn’t be the case with arithmetic mean.\n\\[\\frac{2 \\cdot Precision \\cdot Recall}{Precision + Recall}\\]" } ] \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml index bbf1a21..1db7fd2 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -2,7 +2,7 @@ https://fderyckel.github.io/blog.html/blog.html - 2024-04-16T16:51:38.167Z + 2024-04-17T08:39:44.316Z https://fderyckel.github.io/blog.html/quant-part1.html @@ -198,7 +198,7 @@ https://fderyckel.github.io/blog.html/posts/machine-learning-part2/xgboost/index.html - 2024-04-16T11:17:59.678Z + 2024-04-17T08:39:42.002Z https://fderyckel.github.io/blog.html/posts/machine-learning-part2/xgboost-time-series/index.html @@ -206,7 +206,7 @@ https://fderyckel.github.io/blog.html/machine-learning-part2.html - 2024-04-16T11:18:02.168Z + 2024-04-17T08:39:44.636Z https://fderyckel.github.io/blog.html/posts/machine-learning-part1/metrics/index.html diff --git a/posts/machine-learning-part2/xgboost/index.qmd b/posts/machine-learning-part2/xgboost/index.qmd index cc9d0c8..dedb856 100644 --- a/posts/machine-learning-part2/xgboost/index.qmd +++ b/posts/machine-learning-part2/xgboost/index.qmd @@ -39,6 +39,8 @@ df.describe() df.isnull().sum() ``` +No missing data, we can move forward and start the feature engineering process. 
+ ## R ```{r} @@ -140,15 +142,26 @@ Let's now fit a basic model without any tuning #| label: py_xgboost_base_model from xgboost import XGBClassifier +from catboost import CatBoostClassifier -model_xgb = XGBClassifier(verbosity = 1, random_state = 41) +# using xgboost +model_xgb = XGBClassifier(random_state = 17, verbosity = 0) model_xgb.fit(x_train, y_train) +# using catboost +model_cb = CatBoostClassifier(random_state = 17, verbose = False) +model_cb.fit(x_train, y_train) + # and now go onto prediction -y_pred = model_xgb.predict(x_test) +y_pred_xgb = model_xgb.predict(x_test) +y_pred_cb = model_cb.predict(x_test) # or we can also use probability prediction -y_pred_proba = model_xgb.predict_proba(x_test) +y_pred_proba_xgb = model_xgb.predict_proba(x_test) +y_pred_proba_cb = model_cb.predict_proba(x_test) + +# for comparison purposes +#yo = pd.DataFrame({'y_test': y_test, 'y_pred_xgb': y_pred_xgb, 'y_pred_cb': y_pred_cb}) ``` And we can check our result on this basic xgboost model @@ -160,11 +173,13 @@ from sklearn.metrics import accuracy_score, f1_score from sklearn.metrics import ConfusionMatrixDisplay, classification_report from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay -acc_train = accuracy_score(y_train, model_xgb.predict(x_train)) -acc_test = accuracy_score(y_test, model_xgb.predict(x_test)) +#acc_train = accuracy_score(y_train, model_xgb.predict(x_train)) +acc_test_xgb = accuracy_score(y_test, model_xgb.predict(x_test)) +acc_test_cb = accuracy_score(y_test, model_cb.predict(x_test)) #f1_train = f1_score(y_train, model_xgb.predict(x_train)) -#f1_test = f1_score(y_test, model_xgb.predict(x_test)) +f1_test_xgb = f1_score(y_test, model_xgb.predict(x_test)) +f1_test_cb = f1_score(y_test, model_cb.predict(x_test)) disp = ConfusionMatrixDisplay.from_estimator( @@ -179,7 +194,8 @@ plt.show() ``` ```{python} -print(classification_report(y_test, y_pred)) +print(classification_report(y_test, y_pred_xgb)) +print(classification_report(y_test, y_pred_cb)) ``` And the ROC curve @@ -212,23 +228,36 @@ plt.show() from sklearn.model_selection import TimeSeriesSplit from sklearn.model_selection import RandomizedSearchCV +# defining the samples for the cross-validation process tscv = TimeSeriesSplit(n_splits = 5, gap = 23) + model_xgb.get_params() +param_grid_xgb = {'learning_rate': [0.20, 0.25, 0.30], + 'max_depth': [10, 12, 14, 16], + 'min_child_weight': [3, 5, 7], + 'gamma': [0.2 , 0.3, 0.4], + 'colsample_bytree': [0.4, 0.5 , 0.7]} -param_grid = {'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30], - 'max_depth': [3, 4, 5, 6, 8, 10, 12, 15], - 'min_child_weight': [1, 3, 5, 7], - 'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4], - 'colsample_bytree': [0.3, 0.4, 0.5 , 0.7]} - -xv_xgb = RandomizedSearchCV(model_xgb, param_grid, n_iter = 100, +xv_xgb = RandomizedSearchCV(model_xgb, param_grid_xgb, n_iter = 10, scoring = 'f1', cv = tscv, verbose = 1) - xv_xgb.fit(x_train, y_train, verbose = 1) - xv_xgb.best_params_ xv_xgb.best_score_ +#model_cb.get_params() +param_grid_cb = {'learning_rate': [0.20, 0.25, 0.30], + 'depth': [8, 10, 12, 15], + 'l2_leaf_reg': [1, 3, 5, 7], + 'border_count': [254, 300, 400 , 500], + 'bagging_temperature': [0.3, 0.7, 1.3, 1.7]} + +xv_cb = RandomizedSearchCV(model_cb, param_grid_cb, n_iter = 50, + scoring = 'f1', cv = tscv, verbose = 1) +xv_cb.fit(x_train, y_train, verbose = 1) + +xv_cb.best_params_ +xv_cb.best_score_ + ``` Now we need to train the model based on the best parameters from the cross-validation process. 
@@ -298,8 +327,8 @@ plt.show() #plt.bar(x=feat_imp['Importance Score'], height = feat_imp['Features']) #ax.set_title('Features Importance'); -plt.clf() from xgboost import plot_importance +plt.clf() plot_importance(model_xgb_tuned) plt.show() @@ -314,12 +343,6 @@ plot_importance(model_xgb_tuned, importance_type='weight', show_values=False) plt.show() ``` -```{python} -from xgboost import to_graphviz - -``` - - ```{python}
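# A minimal sketch, not part of the original patch: refitting CatBoost with the best
# parameters found by RandomizedSearchCV, mirroring the XGBoost refit shown above
# (model_xgb_tuned = XGBClassifier(**xv_xgb.best_params_)). It assumes xv_cb, tscv and
# the x_train/y_train/x_test/y_test splits exist as defined in the earlier chunks;
# model_cb_tuned, y_pred_cb_tuned and score_cb are hypothetical names.
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score

# refit on the training set with the tuned hyperparameters
model_cb_tuned = CatBoostClassifier(**xv_cb.best_params_, random_state = 17, verbose = False)
model_cb_tuned.fit(x_train, y_train, eval_set = (x_test, y_test))

# evaluate on the held-out test set with the same metrics used for the xgboost model
y_pred_cb_tuned = model_cb_tuned.predict(x_test)
print(f"Tuned CatBoost accuracy: {accuracy_score(y_test, y_pred_cb_tuned):0.4f}")
print(f"Tuned CatBoost F1 score: {f1_score(y_test, y_pred_cb_tuned):0.4f}")

# cross-validated F1 on the same time-series splits used for tuning
score_cb = cross_val_score(model_cb_tuned, x_train, y_train, cv = tscv, scoring = 'f1')
print(f"Mean CV F1 for tuned CatBoost: {score_cb.mean():0.4f}")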