From af9110091d6889418ba7d53aad242bb8f00cd168 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20de=20Ryckel?=
Date: Sat, 20 Apr 2024 17:24:34 +0700
Subject: [PATCH] ML part 2

---
 docs/blog.html                                |   4 +-
 docs/machine-learning-part2.html              |   4 +-
 docs/search.json                              | 121 +++++++++++++++++-
 docs/sitemap.xml                              |   6 +-
 .../machine-learning-part2/xgboost/index.qmd  |  69 ++++++----
 5 files changed, 173 insertions(+), 31 deletions(-)

diff --git a/docs/blog.html b/docs/blog.html
index a1f6d43..2fe8af6 100644
--- a/docs/blog.html
+++ b/docs/blog.html
@@ -260,7 +260,7 @@

-
+
-4 min
+5 min
diff --git a/docs/machine-learning-part2.html b/docs/machine-learning-part2.html
index 3a84b84..32d7b77 100644
--- a/docs/machine-learning-part2.html
+++ b/docs/machine-learning-part2.html
@@ -168,7 +168,7 @@

Series: Machine Learning - Part 2

-
+
-4 min
+5 min
diff --git a/docs/search.json b/docs/search.json index c77b281..55b4052 100644 --- a/docs/search.json +++ b/docs/search.json @@ -690,7 +690,7 @@ "href": "machine-learning-part1.html", "title": "Series: Machine Learning - Part 1", "section": "", - "text": "KNN\n\n\n\n\n\nUsing KNN in both python and R\n\n\n\n\n\n\nNov 14, 2023\n\n\n12 min\n\n\n\n\n\n\n\n\nNaive-Bayes - Part 1\n\n\n\n\n\nMaking Naive-Bayes work in R\n\n\n\n\n\n\nMay 16, 2023\n\n\n1 min\n\n\n\n\n\n\n\n\nLinear Regression\n\n\n\n\n\nA dive into the math behind the linear regression algorithm.\n\n\n\n\n\n\nApr 14, 2023\n\n\n5 min\n\n\n\n\n\n\n\n\nIntro to Kmeans\n\n\n\n\n\n\n\n\n\n\n\n\nOct 31, 2022\n\n\n8 min\n\n\n\n\n\n\nNo matching items" + "text": "Defining Success\n\n\n\n\n\n\n\n\n\n\n\n\nApr 16, 2024\n\n\n1 min\n\n\n\n\n\n\n\n\nKNN\n\n\n\n\n\nUsing KNN in both python and R\n\n\n\n\n\n\nNov 14, 2023\n\n\n12 min\n\n\n\n\n\n\n\n\nNaive-Bayes - Part 1\n\n\n\n\n\nMaking Naive-Bayes work in R\n\n\n\n\n\n\nMay 16, 2023\n\n\n1 min\n\n\n\n\n\n\n\n\nLinear Regression\n\n\n\n\n\nA dive into the math behind the linear regression algorithm.\n\n\n\n\n\n\nApr 14, 2023\n\n\n5 min\n\n\n\n\n\n\n\n\nIntro to Kmeans\n\n\n\n\n\n\n\n\n\n\n\n\nOct 31, 2022\n\n\n8 min\n\n\n\n\n\n\n\n\nKmeans with regime changes\n\n\n\n\n\n\n\n\n\n\n\n\nOct 12, 2022\n\n\n6 min\n\n\n\n\n\n\nNo matching items" }, { "objectID": "quant-part2.html", @@ -1083,5 +1083,124 @@ "title": "Series", "section": "", "text": "A series of posts on machine learning algorithms that focuses on trees, bagging and boosting." + }, + { + "objectID": "posts/machine-learning-part2/xgboost/index.html", + "href": "posts/machine-learning-part2/xgboost/index.html", + "title": "Xgboost", + "section": "", + "text": "Using Xgboost from a quant perspective. We do a whole cycle of model building on a financial time-series. We’ll again show how to do it with both framework Sklearn for Python and tidymodel for R.\nWe have taken a stock, but this can be applied on an index, or commodity futures, etc.\n\nSetting up the data frame\nWe are just loading the data set and doing the initial cleaning so the features engineering can be achieved smoothly.\n\nPythonR\n\n\n\nimport pandas as pd\nimport matplotlib.pyplot as plt \nimport numpy as np\n\ndf = pd.read_csv('../../../raw_data/AA.csv')\ndf['date'] = pd.to_datetime(df['date'])\ndf = df.sort_values(by = 'date', inplace = False)\ndf.set_index('date', inplace=True)\n\ndf = df[['open', 'high', 'low', 'close', 'volume', 'adjClose']]\n\ndf.head()\n\n open high low close volume adjClose\ndate \n2001-01-02 80.50 80.95 76.60 77.50 1592010 57.23\n2001-01-03 77.50 80.50 75.24 78.55 2011985 58.01\n2001-01-04 78.55 81.25 77.65 81.10 1992468 59.89\n2001-01-05 81.10 81.70 78.85 79.60 1623845 58.78\n2001-01-08 79.60 85.91 79.00 80.80 3073616 59.67\n\ndf.describe()\n\n open high ... volume adjClose\ncount 5821.000000 5821.000000 ... 5.821000e+03 5821.000000\nmean 46.372343 47.097133 ... 6.519558e+06 40.404558\nstd 24.755757 25.075361 ... 5.452542e+06 18.945874\nmin 5.500000 5.950000 ... 4.254680e+05 5.360000\n25% 24.990000 25.420000 ... 2.656970e+06 23.620000\n50% 38.260000 38.780000 ... 5.129900e+06 36.270000\n75% 69.210000 69.980000 ... 8.773242e+06 56.910000\nmax 115.010000 117.190000 ... 
1.007518e+08 96.360000\n\n[8 rows x 6 columns]\n\ndf.isnull().sum()\n\nopen 0\nhigh 0\nlow 0\nclose 0\nvolume 0\nadjClose 0\ndtype: int64\n\n\n\n\n\nlibrary(readr)\nlibrary(dplyr)\nlibrary(skimr)\n\ndfr = read_csv('../../../raw_data/AA.csv') |> \n select(date, open, high, low, close, volume, adjClose)\n\nskim(dfr)\n\n\nData summary\n\n\nName\ndfr\n\n\nNumber of rows\n5821\n\n\nNumber of columns\n7\n\n\n_______________________\n\n\n\nColumn type frequency:\n\n\n\nDate\n1\n\n\nnumeric\n6\n\n\n________________________\n\n\n\nGroup variables\nNone\n\n\n\nVariable type: Date\n\n\n\n\n\n\n\n\n\n\n\n\nskim_variable\nn_missing\ncomplete_rate\nmin\nmax\nmedian\nn_unique\n\n\n\n\ndate\n0\n1\n2001-01-02\n2024-02-22\n2012-07-27\n5821\n\n\n\nVariable type: numeric\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nskim_variable\nn_missing\ncomplete_rate\nmean\nsd\np0\np25\np50\np75\np100\nhist\n\n\n\n\nopen\n0\n1\n46.37\n24.76\n5.50\n24.99\n38.26\n69.21\n115.01\n▇▇▃▅▁\n\n\nhigh\n0\n1\n47.10\n25.08\n5.95\n25.42\n38.78\n69.98\n117.19\n▇▇▃▅▁\n\n\nlow\n0\n1\n45.57\n24.38\n5.16\n24.53\n37.72\n68.03\n111.88\n▇▇▃▅▁\n\n\nclose\n0\n1\n46.33\n24.74\n5.48\n25.01\n38.26\n69.04\n113.78\n▇▇▃▅▁\n\n\nvolume\n0\n1\n6519558.18\n5452542.48\n425468.00\n2656970.00\n5129900.00\n8773242.00\n100751769.00\n▇▁▁▁▁\n\n\nadjClose\n0\n1\n40.40\n18.95\n5.36\n23.62\n36.27\n56.91\n96.36\n▆▇▅▅▁\n\n\n\n\n\n\n\n\n\n\nFeature engineering\n\nPythonR\n\n\n\ndf['returns'] = np.log(df['adjClose'] / df['adjClose'].shift(1))\ndf['ret_1m'] = df['returns'].rolling(20).sum()\n\nfeature_list = []\n\nfor r in range(11, 81, 5): \n df['ret_' + str(r)] = df['returns'].rolling(r).sum()\n df['std_' + str(r)] = df['returns'].rolling(r).std()\n feature_list.append('ret_' + str(r))\n feature_list.append('std_' + str(r))\n\ndf1a = df\n\ndf1a['o_c'] = (df1a['open'] - df1a['close']) / df1a['close']\ndf1a['h_l'] = (df1a['high'] - df1a['low']) / df1a['close']\ndf1a['ret_21d'] = np.log(df1a['close'] / df1a['close'].shift(21))\ndf1a['roll_sd_ret21d_1Y'] = df1a['ret_21d'].rolling(window = 251).std()\ndf1a['volum_sma200'] = df1a['volume'].rolling(window = 200).mean()\ndf1a['perc_above_volu_sma200'] = np.log(df1a['volume'] / df1a['volum_sma200'])\ndf1a['roll_sd_volum_1Y'] = df1a['volume'].rolling(window = 251).std()\ndf1a['sma50'] = df1a['close'].rolling(window = 50).mean()\ndf1a['perc_above_sma50'] = np.log(df1a['close'] / df1a['sma50'])\ndf1a['sma200'] = df1a['close'].rolling(window = 200).mean()\ndf1a['perc_above_sma200'] = np.log(df1a['close'] / df1a['sma200'])\ndf1a['roll_corr_sma50_sma200'] = df1a['sma200'].rolling(window = 252).corr(df1a['sma50'])\n\n# setting up a target variable. \n# is the stock above 5% in 2 weeks time. 
\ndf1a['target'] = np.where(df1a['close'].shift(-41) > 1.01 * df1a['close'], 1, 0)\n\ndf1a = df1a.drop(['open', 'high', 'low', 'close', 'adjClose', 'volume', 'sma50', 'sma200', 'volum_sma200', 'returns'], axis = 1)\ndf1a = df1a.dropna()\n\ntarget = df1a['target']\ndf1a = df1a.drop(['target'], axis = 1)\n\n\ndf.dropna(inplace = True) \n\ndf1a.values\n\narray([[ 0.15564846, 0.21637398, 0.04071166, ..., 0.05192485,\n -0.27383066, 0.86749277],\n [ 0.1761608 , 0.22536981, 0.03979403, ..., 0.03793276,\n -0.28789777, 0.87185623],\n [ 0.123086 , 0.19948936, 0.0422375 , ..., 0.00811607,\n -0.31777658, 0.87606062],\n ...,\n [-0.03425118, -0.09976226, 0.04401878, ..., -0.09835741,\n -0.12808572, 0.89240416],\n [-0.05395427, 0.04088161, 0.03663959, ..., -0.05392038,\n -0.0805411 , 0.88992296],\n [-0.06992937, 0.00469569, 0.03579207, ..., -0.06330803,\n -0.08669331, 0.8874239 ]])\n\n\n\n\n\n\n\n\n\n\nBase Model\n\nPythonR\n\n\n\nfrom sklearn.model_selection import (train_test_split, RandomizedSearchCV, TimeSeriesSplit)\n\nx_train, x_test, y_train, y_test = train_test_split(df1a, target, test_size = 0.2, random_state = 42, shuffle = False)\n\nprint(f\"Train set size is {len(x_train)} and test set size is {len(x_test)}\")\n\nTrain set size is 4296 and test set size is 1075\n\n\nLet’s now fit a basic model without any tuning\n\nfrom xgboost import XGBClassifier\n\nmodel_xgb = XGBClassifier(verbosity = 1, random_state = 42)\nmodel_xgb.fit(x_train, y_train)\n\nXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=42, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. XGBClassifieriFittedXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=42, ...) 
\n\n\n# and now go onto prediction \ny_pred = model_xgb.predict(x_test)\n\n# or we can also use probability prediction\ny_pred_proba = model_xgb.predict_proba(x_test)\n\nAnd we can check our result on this basic xgboost model\n\nfrom sklearn.metrics import accuracy_score, roc_auc_score, roc_curve\nfrom sklearn.metrics import ConfusionMatrixDisplay, classification_report, RocCurveDisplay\n\nacc_train = accuracy_score(y_train, model_xgb.predict(x_train))\nacc_test = accuracy_score(y_test, model_xgb.predict(x_test))\n\n\ndisp = ConfusionMatrixDisplay.from_estimator(\n model_xgb,\n x_test,\n y_test,\n display_labels = model_xgb.classes_,\n cmap=plt.cm.Blues\n )\ndisp.ax_.set_title('Confusion matrix')\nplt.show()\n\n\n\n\n\nprint(classification_report(y_test, y_pred))\n\n precision recall f1-score support\n\n 0 0.49 0.85 0.62 537\n 1 0.46 0.13 0.21 538\n\n accuracy 0.49 1075\n macro avg 0.48 0.49 0.42 1075\nweighted avg 0.48 0.49 0.42 1075\n\n\nAnd the ROC curve\n\n#plt.clf()\ndisp_roc = RocCurveDisplay.from_estimator(\n model_xgb,\n x_test,\n y_test,\n name='XGBoost')\ndisp_roc.ax_.set_title('ROC Curve')\nplt.plot([0,1], [0,1], linestyle='--')\nplt.show()\n\n\n\n\n\n\n\n\n\n\n\n\nHyperparameters and fine tuning\n\nPythonR\n\n\n\nfrom sklearn.model_selection import TimeSeriesSplit\nfrom sklearn.model_selection import RandomizedSearchCV\n\ntscv = TimeSeriesSplit(n_splits = 5, gap = 23)\nmodel_xgb.get_params()\n\n{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': 1}\n\nparam_grid = {'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],\n 'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],\n 'min_child_weight': [1, 3, 5, 7],\n 'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4],\n 'colsample_bytree': [0.3, 0.4, 0.5 , 0.7]}\n \nxv_xgb = RandomizedSearchCV(model_xgb, param_grid, n_iter = 100, scoring = 'f1', cv = tscv, verbose = 1)\n\nxv_xgb.fit(x_train, y_train, verbose = 1)\n\nRandomizedSearchCV(cv=TimeSeriesSplit(gap=23, max_train_size=None, n_splits=5, test_size=None),\n estimator=XGBClassifier(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, device=None,\n early_stopping_rounds=None,\n enable_categorical=False,\n eval_metric=None, feature_types=None,\n gamma=None, grow_policy=...\n monotone_constraints=None,\n multi_strategy=None,\n n_estimators=None, n_jobs=None,\n num_parallel_tree=None,\n random_state=42, ...),\n n_iter=100,\n param_distributions={'colsample_bytree': [0.3, 0.4, 0.5,\n 0.7],\n 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],\n 'learning_rate': [0.05, 0.1, 0.15, 0.2,\n 0.25, 0.3],\n 'max_depth': [3, 4, 5, 6, 8, 10, 12,\n 15],\n 'min_child_weight': [1, 3, 5, 7]},\n scoring='f1', verbose=1)In a Jupyter environment, 
please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.  RandomizedSearchCV?Documentation for RandomizedSearchCViFittedRandomizedSearchCV(cv=TimeSeriesSplit(gap=23, max_train_size=None, n_splits=5, test_size=None),\n estimator=XGBClassifier(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, device=None,\n early_stopping_rounds=None,\n enable_categorical=False,\n eval_metric=None, feature_types=None,\n gamma=None, grow_policy=...\n monotone_constraints=None,\n multi_strategy=None,\n n_estimators=None, n_jobs=None,\n num_parallel_tree=None,\n random_state=42, ...),\n n_iter=100,\n param_distributions={'colsample_bytree': [0.3, 0.4, 0.5,\n 0.7],\n 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],\n 'learning_rate': [0.05, 0.1, 0.15, 0.2,\n 0.25, 0.3],\n 'max_depth': [3, 4, 5, 6, 8, 10, 12,\n 15],\n 'min_child_weight': [1, 3, 5, 7]},\n scoring='f1', verbose=1) estimator: XGBClassifierXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=42, ...) XGBClassifierXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=42, ...) 
\n\nxv_xgb.best_params_\n\n{'min_child_weight': 7, 'max_depth': 12, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.7}\n\nxv_xgb.best_score_\n\n0.5009456435677468\n\n\nNow we need to train the model based on the best paramaters fromt the cross-validation process.\n\nfrom sklearn.model_selection import cross_val_score\n\nmodel_xgb_tuned = XGBClassifier(**xv_xgb.best_params_)\n\nmodel_xgb_tuned.fit(x_train, y_train, \n eval_set = [(x_train, y_train), (x_test, y_test)], \n #eval_metric = 'precision', \n verbose = True)\n\nXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=0.7, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=0.1, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=0.1, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=12, max_leaves=None,\n min_child_weight=7, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. XGBClassifieriFittedXGBClassifier(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=0.7, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=0.1, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=0.1, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=12, max_leaves=None,\n min_child_weight=7, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=None, n_jobs=None,\n num_parallel_tree=None, random_state=None, ...) \n\neval_results = model_xgb_tuned.evals_result()\n#eval_results\n\nscore = cross_val_score(model_xgb_tuned, x_train, y_train, cv = tscv)\nprint(f'Mean CV score for: {score.mean():0.4}')\n\nMean CV score for: 0.4961\n\n\n\n\n\n\n\n\n\n\nFeature importance\n\nPythonR" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "", + "text": "This post is about using xgboost on a time-series using both R with the tidymodel framework and python. It is part of a series of articles aiming at translating python timeseries blog articles into their tidymodels equivalent.\nThe raw data is quite simple as it is energy consumption based on an hourly consumption. Original article can be found here. Minimal changes were made to better fit current python practices.\nXgboost is part of the ensemble machine learning algorithms. It can be used for both regression and classification. There are few issues in using Xgboost with time-series. This article is taking a Xgboost post in python and also translating with the new R tidymodel framework." 
+ }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\n\n# setting up main R libraries to start \n\nlibrary(glue)\nlibrary(readr)\nlibrary(dplyr)\nlibrary(ggplot2)\ndf0 <- read_csv(\"../../../raw_data/AEP_hourly.csv\")\n# let's have a quick look at what we are dealing with\nglimpse(df0)\n\nRows: 121,273\nColumns: 2\n$ Datetime <dttm> 2004-12-31 01:00:00, 2004-12-31 02:00:00, 2004-12-31 03:00:0…\n$ AEP_MW <dbl> 13478, 12865, 12577, 12517, 12670, 13038, 13692, 14297, 14719…\n\n\nThere are only 2 variables. The Datetime being the only independ variable. And the energy consumption labelled as AEP_MW being our variable to predict.\n\n# and graphically - \n# just using a couple of years to get an idea \nggplot(df0 |> filter(Datetime > \"2014-01-01\" & Datetime < \"2016-01-01\"), aes(x =Datetime, y=AEP_MW )) + geom_line(color = \"light blue\")\n\n\n\n\nFigure 1: Graphical glimpse of our raw data\n\n\n\n\nAs Datetime is our only input variable, we’ll use the usual tricks of breaking it down into week number, months, etc. I am doing it slightly differently than in the python version here as I will first create the new time related variables then I will split it into training and testing.\n\nlibrary(lubridate)\ndf <- df0 |> \n mutate(hour = hour(Datetime), \n day_of_week = wday(Datetime), \n day_of_year = yday(Datetime), \n day_of_month = mday(Datetime), \n week_of_year = isoweek(Datetime), \n month = month(Datetime), \n quarter = quarter(Datetime), \n year = isoyear(Datetime)\n ) \n# another glimpse now. \nglimpse(df)\n\nRows: 121,273\nColumns: 10\n$ Datetime <dttm> 2004-12-31 01:00:00, 2004-12-31 02:00:00, 2004-12-31 03:…\n$ AEP_MW <dbl> 13478, 12865, 12577, 12517, 12670, 13038, 13692, 14297, 1…\n$ hour <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…\n$ day_of_week <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …\n$ day_of_year <dbl> 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 366, 36…\n$ day_of_month <int> 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 3…\n$ week_of_year <dbl> 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 5…\n$ month <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1…\n$ quarter <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …\n$ year <dbl> 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004, 200…\n\n\nAlthough, there are only 2 variables, there are over 120,000 rows of data. That’s non-negligible." 
+ }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using python", + "text": "Using python\nThis is the code from the original post.\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\npy_df = pd.read_csv(\"../../../raw_data/AEP_hourly.csv\", index_col = [0], parse_dates = [0])\npy_df.tail()\n\n AEP_MW\nDatetime \n2018-01-01 20:00:00 21089.0\n2018-01-01 21:00:00 20999.0\n2018-01-01 22:00:00 20820.0\n2018-01-01 23:00:00 20415.0\n2018-01-02 00:00:00 19993.0\n\n#plt.plot(df0)\nsplit_date = '01-jan-2016'\npy_df_train = py_df.loc[py_df.index <= split_date].copy()\npy_df_test = py_df.loc[py_df.index > split_date].copy()\n\nThe author of the python blog first created a train / test set then created a function to add the variables then applied that function to both sets. This is a very valid way of doing things when steps include normalizing and/or scaling data before applying our ML algorithms as we don’t want any leakage from our training set into our testing set.\n\n# Create features of df\ndef create_features(df, label = None): \n df['date'] = df.index \n df['hour'] = df['date'].dt.hour\n df['day_of_week'] = df['date'].dt.dayofweek\n df['day_of_year'] = df['date'].dt.dayofyear \n df['day_of_month'] = df['date'].dt.day \n df['week_of_year'] = df['date'].dt.isocalendar().week \n df['month'] = df['date'].dt.month \n df['quarter'] = df['date'].dt.quarter \n df['year'] = df['date'].dt.year\n \n X = df[['hour', 'day_of_week', 'day_of_year', 'day_of_month', 'week_of_year', 'month', 'quarter', 'year']]\n \n if label: \n y = df[label]\n return X, y\n \n return X\n\nCompare this way of constructing variables to the much easier and more elegant tidyverse’s way of cleaning and creating variables. The dplyr package really makes it painless to wrangle data." + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-1", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-1", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\nRsample is the tidymodel package that deals with creating training and testing sets. There are really many methods available to do this, but we stick to the same methods provided in the original blog post. There are out-of-the-box methods to deal with timeseries like in this case.\n\nlibrary(rsample)\nprop_split = 1 - (nrow(df |> filter(Datetime > \"2016-01-01\")) / nrow(df))\ndf_split <- initial_time_split(df |> arrange(Datetime), prop = prop_split)\ndf_train <- training(df_split)\ndf_test <- testing(df_split)" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-1", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-1", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using Python", + "text": "Using Python\n\npy_x_train, py_y_train = create_features(py_df_train, label = \"AEP_MW\")\npy_x_test, py_y_test = create_features(py_df_test, label = \"AEP_MW\")\n#When running xgboost, I got an issue with one of the type of the variable. \n# Let's fix this. 
\npy_x_train.info()\n\n<class 'pandas.core.frame.DataFrame'>\nDatetimeIndex: 98594 entries, 2004-12-31 01:00:00 to 2015-01-02 00:00:00\nData columns (total 8 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 hour 98594 non-null int32 \n 1 day_of_week 98594 non-null int32 \n 2 day_of_year 98594 non-null int32 \n 3 day_of_month 98594 non-null int32 \n 4 week_of_year 98594 non-null UInt32\n 5 month 98594 non-null int32 \n 6 quarter 98594 non-null int32 \n 7 year 98594 non-null int32 \ndtypes: UInt32(1), int32(7)\nmemory usage: 3.9 MB\n\npy_x_train = py_x_train.astype(np.int64)\npy_x_test = py_x_test.astype(np.int64)\npy_x_train.info()\n\n<class 'pandas.core.frame.DataFrame'>\nDatetimeIndex: 98594 entries, 2004-12-31 01:00:00 to 2015-01-02 00:00:00\nData columns (total 8 columns):\n # Column Non-Null Count Dtype\n--- ------ -------------- -----\n 0 hour 98594 non-null int64\n 1 day_of_week 98594 non-null int64\n 2 day_of_year 98594 non-null int64\n 3 day_of_month 98594 non-null int64\n 4 week_of_year 98594 non-null int64\n 5 month 98594 non-null int64\n 6 quarter 98594 non-null int64\n 7 year 98594 non-null int64\ndtypes: int64(8)\nmemory usage: 6.8 MB" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-2", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-2", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\nAgain this is a very straightforward xgboost application to a dataset. No fine tuning of models, recipe, etc.\n\nlibrary(parsnip)\nmodel_xgboost <- boost_tree(stop_iter = 50L, trees=1000L) |> \n set_engine(\"xgboost\") |>\n set_mode(\"regression\")\n \nfit_xgboost <- model_xgboost |> \n fit(AEP_MW ~., data = df_train %>% select(-Datetime))\nfit_xgboost\n\nparsnip model object\n\n##### xgb.Booster\nraw: 4.7 Mb \ncall:\n xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, \n colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, \n subsample = 1), data = x$data, nrounds = 1000L, watchlist = x$watchlist, \n verbose = 0, early_stopping_rounds = 50L, nthread = 1, objective = \"reg:squarederror\")\nparams (as set within xgb.train):\n eta = \"0.3\", max_depth = \"6\", gamma = \"0\", colsample_bytree = \"1\", colsample_bynode = \"1\", min_child_weight = \"1\", subsample = \"1\", nthread = \"1\", objective = \"reg:squarederror\", validate_parameters = \"TRUE\"\nxgb.attributes:\n best_iteration, best_msg, best_ntreelimit, best_score, niter\ncallbacks:\n cb.evaluation.log()\n cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, \n verbose = verbose)\n# of features: 8 \nniter: 1000\nbest_iteration : 1000 \nbest_ntreelimit : 1000 \nbest_score : 242.3155 \nbest_msg : [1000] training-rmse:242.315489 \nnfeatures : 8 \nevaluation_log:\n iter training_rmse\n <num> <num>\n 1 11175.8839\n 2 7906.5875\n--- \n 999 242.5272\n 1000 242.3155" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-2", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-2", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using Python", + "text": "Using Python\n\nfrom xgboost.sklearn import XGBRegressor\npy_xgboost_mod = XGBRegressor(n_estimator = 1000, early_stopping_rounds = 50)\npy_xgboost_mod.fit(py_x_train, py_y_train, \n eval_set = [(py_x_train, py_y_train), (py_x_test, py_y_test)], \n verbose = 
True)\n\nXGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=50,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimator=1000, n_estimators=None,\n n_jobs=None, num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. XGBRegressoriFittedXGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=50,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=None, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimator=1000, n_estimators=None,\n n_jobs=None, num_parallel_tree=None, ...)" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-3", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-3", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\n2 ways to do this … (actually more than 2 ways, but here are 2 main ways.). First one is a straight table using the xgboost library itself.\n\nlibrary(xgboost)\nxgb.importance(model = fit_xgboost$fit)\n\n Feature Gain Cover Frequency\n <char> <num> <num> <num>\n1: day_of_year 0.361826849 0.455387001 0.2800303942\n2: hour 0.336853508 0.125331328 0.2374139102\n3: year 0.120130183 0.129691117 0.2000679018\n4: day_of_week 0.105250961 0.073258066 0.1489636887\n5: week_of_year 0.047082964 0.097216236 0.0462379151\n6: day_of_month 0.027801172 0.116483820 0.0864293336\n7: month 0.001054364 0.002632432 0.0008568565\n\n#detach(xgboost)\n\nAnd also a graphic way.\n\nlibrary(vip)\nfit_xgboost %>%\n vip(geom = \"point\")" + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-3", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-python-3", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using python", + "text": "Using python\n\nfrom xgboost import plot_importance, plot_tree\n_ = plot_importance(py_xgboost_mod, height=0.9)\n\n\n\n\nI am a bit confused here in the output of the python graph with F-score vs the output of the R graph with importance." + }, + { + "objectID": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-4", + "href": "posts/machine-learning-part2/xgboost-time-series/index.html#using-r-4", + "title": "Translating Python Part 1 - Xgboost with Time-Series", + "section": "Using R", + "text": "Using R\nGraphing predicted power output vs actual power output could be a first way to see how our model fares in its predictions. 
So let’s graph our datetime vs power ouput for both actual and predicted.\n\nlibrary(tibble) # for the add_column \nlibrary(parsnip)\ndf_test1 <- add_column(df_test, predict(fit_xgboost, new_data = df_test)) \nggplot(df_test1, aes(x= Datetime, y = AEP_MW)) + \n geom_line(color = \"blue\") + \n geom_line(aes(y = .pred), color = \"yellow\", alpha = 0.5) + \n labs(title = \"Energy Consumption in 2016-2018 (in MWh)\", y = \"Hourly consumption\")\n\n\n\n\nFigure 2: Actual Vs Predicted power consumption for 2016-2018\n\n\n\n\nWe can already see that we are not really modeling well the peaks and through.\nWe could get slightly more granular and try to see whats going on.\n\nggplot(df_test1 %>% filter(Datetime > \"2016-01-01\" & Datetime < \"2016-02-28\"), aes(x= Datetime, y = AEP_MW)) + \n geom_line(color = \"blue\") + \n geom_line(aes(y = .pred), color = \"yellow3\", alpha = 0.8)\n\n\n\n\nFigure 3: Actual Vs Predicted power consumption\n\n\n\n\nWe are clearly off there on the second half of February.\nNow, we can use the yardstick package to get numerical values to assess our model on the test set.\n\nlibrary(yardstick)\n# calculating the RMSE (root mean square error)\nrmse(df_test1, truth = AEP_MW, estimate = .pred, na_rm = TRUE)\n\n# A tibble: 1 × 3\n .metric .estimator .estimate\n <chr> <chr> <dbl>\n1 rmse standard 2067.\n\n# calculating the MAE (mean absolute error)\nmae(df_test1, truth = AEP_MW, estimate = .pred)\n\n# A tibble: 1 × 3\n .metric .estimator .estimate\n <chr> <chr> <dbl>\n1 mae standard 1495.\n\n# calculating the MAPE (mean absolute percent error)\nmape(df_test1, truth = AEP_MW, estimate = .pred)\n\n# A tibble: 1 × 3\n .metric .estimator .estimate\n <chr> <chr> <dbl>\n1 mape standard 10.0\n\n# actually much easier to use the metric_set() function !\nxgboost_mod_metrics <- metric_set(rmse, mae, mape)\nxgboost_mod_metrics(df_test1, truth = AEP_MW, estimate = .pred) \n\n# A tibble: 3 × 3\n .metric .estimator .estimate\n <chr> <chr> <dbl>\n1 rmse standard 2067. \n2 mae standard 1495. \n3 mape standard 10.0" + }, + { + "objectID": "machine-learning-part2.html", + "href": "machine-learning-part2.html", + "title": "Series: Machine Learning - Part 2", + "section": "", + "text": "Xgboost\n\n\n\n\n\n\n\n\n\n\n\n\nApr 15, 2024\n\n\n5 min\n\n\n\n\n\n\n\n\nTranslating Python Part 1 - Xgboost with Time-Series\n\n\n\n\n\n\n\n\n\n\n\n\nOct 1, 2022\n\n\n7 min\n\n\n\n\n\n\nNo matching items" + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html", + "href": "posts/machine-learning-part1/metrics/index.html", + "title": "Defining Success", + "section": "", + "text": "When evaluating models for a given ML algorithm, we need to define in advance what would be our metric to measure success.\nThere are so many ways out there to measure which hyper-parameters performed best for our mode. We’ll bring up some of the most used one." + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html#accuracy", + "href": "posts/machine-learning-part1/metrics/index.html#accuracy", + "title": "Defining Success", + "section": "Accuracy", + "text": "Accuracy\nShortcomings:\n\nfor imbalanced dataset, we can have good accuracy by just predicting most observation with the most frequent class. 
For instance in the case of a rare disease or big financial meltdown, we can just predict" + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html#precision", + "href": "posts/machine-learning-part1/metrics/index.html#precision", + "title": "Defining Success", + "section": "Precision", + "text": "Precision\nIf you call it true, is it indeed true? In other words, the proportion of predicted positive that are actually positive." + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html#recall", + "href": "posts/machine-learning-part1/metrics/index.html#recall", + "title": "Defining Success", + "section": "Recall", + "text": "Recall\nIf there is a positive, did the model predict a positive." + }, + { + "objectID": "posts/machine-learning-part1/metrics/index.html#f1-score", + "href": "posts/machine-learning-part1/metrics/index.html#f1-score", + "title": "Defining Success", + "section": "F1 score", + "text": "F1 score\nIt is the harmonic mean of both precision and recall. The harmonic mean penalizes model that have very low precision or recall. Which wouldn’t be the case with arithmetic mean.\n\\[\\frac{2 \\cdot Precision \\cdot Recall}{Precision + Recall}\\]" } ] \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml index bbf1a21..1db7fd2 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -2,7 +2,7 @@ https://fderyckel.github.io/blog.html/blog.html - 2024-04-16T16:51:38.167Z + 2024-04-17T08:39:44.316Z https://fderyckel.github.io/blog.html/quant-part1.html @@ -198,7 +198,7 @@ https://fderyckel.github.io/blog.html/posts/machine-learning-part2/xgboost/index.html - 2024-04-16T11:17:59.678Z + 2024-04-17T08:39:42.002Z https://fderyckel.github.io/blog.html/posts/machine-learning-part2/xgboost-time-series/index.html @@ -206,7 +206,7 @@ https://fderyckel.github.io/blog.html/machine-learning-part2.html - 2024-04-16T11:18:02.168Z + 2024-04-17T08:39:44.636Z https://fderyckel.github.io/blog.html/posts/machine-learning-part1/metrics/index.html diff --git a/posts/machine-learning-part2/xgboost/index.qmd b/posts/machine-learning-part2/xgboost/index.qmd index cc9d0c8..dedb856 100644 --- a/posts/machine-learning-part2/xgboost/index.qmd +++ b/posts/machine-learning-part2/xgboost/index.qmd @@ -39,6 +39,8 @@ df.describe() df.isnull().sum() ``` +No missing data, we can move forward and start the feature engineering process. 
+ ## R ```{r} @@ -140,15 +142,26 @@ Let's now fit a basic model without any tuning #| label: py_xgboost_base_model from xgboost import XGBClassifier +from catboost import CatBoostClassifier -model_xgb = XGBClassifier(verbosity = 1, random_state = 41) +# using xgboost +model_xgb = XGBClassifier(random_state = 17, verbosity = 0) model_xgb.fit(x_train, y_train) +# using catboost +model_cb = CatBoostClassifier(random_state = 17, verbose = False) +model_cb.fit(x_train, y_train) + # and now go onto prediction -y_pred = model_xgb.predict(x_test) +y_pred_xgb = model_xgb.predict(x_test) +y_pred_cb = model_cb.predict(x_test) # or we can also use probability prediction -y_pred_proba = model_xgb.predict_proba(x_test) +y_pred_proba_xgb = model_xgb.predict_proba(x_test) +y_pred_proba_cb = model_cb.predict_proba(x_test) + +# for comparison purposes +#yo = pd.DataFrame({'y_test': y_test, 'y_pred_xgb': y_pred_xgb, 'y_pred_cb': y_pred_cb}) ``` And we can check our result on this basic xgboost model @@ -160,11 +173,13 @@ from sklearn.metrics import accuracy_score, f1_score from sklearn.metrics import ConfusionMatrixDisplay, classification_report from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay -acc_train = accuracy_score(y_train, model_xgb.predict(x_train)) -acc_test = accuracy_score(y_test, model_xgb.predict(x_test)) +#acc_train = accuracy_score(y_train, model_xgb.predict(x_train)) +acc_test_xgb = accuracy_score(y_test, model_xgb.predict(x_test)) +acc_test_cb = accuracy_score(y_test, model_cb.predict(x_test)) #f1_train = f1_score(y_train, model_xgb.predict(x_train)) -#f1_test = f1_score(y_test, model_xgb.predict(x_test)) +f1_test_xgb = f1_score(y_test, model_xgb.predict(x_test)) +f1_test_cb = f1_score(y_test, model_cb.predict(x_test)) disp = ConfusionMatrixDisplay.from_estimator( @@ -179,7 +194,8 @@ plt.show() ``` ```{python} -print(classification_report(y_test, y_pred)) +print(classification_report(y_test, y_pred_xgb)) +print(classification_report(y_test, y_pred_cb)) ``` And the ROC curve @@ -212,23 +228,36 @@ plt.show() from sklearn.model_selection import TimeSeriesSplit from sklearn.model_selection import RandomizedSearchCV +# defining the samples for the cross-validation process tscv = TimeSeriesSplit(n_splits = 5, gap = 23) + model_xgb.get_params() +param_grid_xgb = {'learning_rate': [0.20, 0.25, 0.30], + 'max_depth': [10, 12, 14, 16], + 'min_child_weight': [3, 5, 7], + 'gamma': [0.2 , 0.3, 0.4], + 'colsample_bytree': [0.4, 0.5 , 0.7]} -param_grid = {'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30], - 'max_depth': [3, 4, 5, 6, 8, 10, 12, 15], - 'min_child_weight': [1, 3, 5, 7], - 'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4], - 'colsample_bytree': [0.3, 0.4, 0.5 , 0.7]} - -xv_xgb = RandomizedSearchCV(model_xgb, param_grid, n_iter = 100, +xv_xgb = RandomizedSearchCV(model_xgb, param_grid_xgb, n_iter = 10, scoring = 'f1', cv = tscv, verbose = 1) - xv_xgb.fit(x_train, y_train, verbose = 1) - xv_xgb.best_params_ xv_xgb.best_score_ +#model_cb.get_params() +param_grid_cb = {'learning_rate': [0.20, 0.25, 0.30], + 'depth': [8, 10, 12, 15], + 'l2_leaf_reg': [1, 3, 5, 7], + 'border_count': [254, 300, 400 , 500], + 'bagging_temperature': [0.3, 0.7, 1.3, 1.7]} + +xv_cb = RandomizedSearchCV(model_cb, param_grid_cb, n_iter = 50, + scoring = 'f1', cv = tscv, verbose = 1) +xv_cb.fit(x_train, y_train, verbose = 1) + +xv_cb.best_params_ +xv_cb.best_score_ + ``` Now we need to train the model based on the best parameters from the cross-validation process. 
@@ -298,8 +327,8 @@ plt.show() #plt.bar(x=feat_imp['Importance Score'], height = feat_imp['Features']) #ax.set_title('Features Importance'); -plt.clf() from xgboost import plot_importance +plt.clf() plot_importance(model_xgb_tuned) plt.show() @@ -314,12 +343,6 @@ plot_importance(model_xgb_tuned, importance_type='weight', show_values=False) plt.show() ``` -```{python} -from xgboost import to_graphviz - -``` - - ```{python}
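# A minimal sketch, not part of the original patch: refitting CatBoost with the best
# parameters found by RandomizedSearchCV, mirroring the XGBoost refit shown above
# (model_xgb_tuned = XGBClassifier(**xv_xgb.best_params_)). It assumes xv_cb, tscv and
# the x_train/y_train/x_test/y_test splits exist as defined in the earlier chunks;
# model_cb_tuned, y_pred_cb_tuned and score_cb are hypothetical names.
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score

# refit on the training set with the tuned hyperparameters
model_cb_tuned = CatBoostClassifier(**xv_cb.best_params_, random_state = 17, verbose = False)
model_cb_tuned.fit(x_train, y_train, eval_set = (x_test, y_test))

# evaluate on the held-out test set with the same metrics used for the xgboost model
y_pred_cb_tuned = model_cb_tuned.predict(x_test)
print(f"Tuned CatBoost accuracy: {accuracy_score(y_test, y_pred_cb_tuned):0.4f}")
print(f"Tuned CatBoost F1 score: {f1_score(y_test, y_pred_cb_tuned):0.4f}")

# cross-validated F1 on the same time-series splits used for tuning
score_cb = cross_val_score(model_cb_tuned, x_train, y_train, cv = tscv, scoring = 'f1')
print(f"Mean CV F1 for tuned CatBoost: {score_cb.mean():0.4f}")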