3_rf_hyperparameter_tuning.py
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, log_loss, brier_score_loss
from sklearn.metrics import balanced_accuracy_score as bas
from sklearn.ensemble import RandomForestClassifier
#from sklearn.calibration import CalibratedClassifierCV  # only needed for the commented-out calibrated variants below
data_dir = '/mnt/c/Users/Shina/Documents/ml_qsar/results/preprocs_ecfp4e_c2_ne'
out_dir = '/mnt/c/Users/Shina/Documents/ml_qsar/tunning/ecfp4e_c2_ne'
train_rd = pd.read_csv(f"{data_dir}/train_fp.csv", index_col="molecule")
test_rd = pd.read_csv(f"{data_dir}/test_fp.csv", index_col="molecule")
# Encode the activity labels as integers (active=1, inactive=0)
replace_dico = {'active':1, 'inactive':0}
train_rd['activity'] = train_rd['activity'].replace(replace_dico)
test_rd['activity'] = test_rd['activity'].replace(replace_dico)
# Separate features (all but the last three columns) from the label (last column)
X_train = train_rd.iloc[:, :-3]
y_train = train_rd.iloc[:, -1]
X_test = test_rd.iloc[:, :-3]
y_test = test_rd.iloc[:, -1]
# Define the hyperparameters to tune
param_grid = {'class_weight': [None, 'balanced', 'balanced_subsample'],
              'max_depth': [None, 5, 10, 15, 20],
              'min_samples_leaf': [2, 3, 5, 7, 9],
              'min_samples_split': [3, 4, 6, 8, 10],
              'n_estimators': [30, 50, 100, 200]
              #'max_features': ['sqrt', 'log2']
              }
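# 3 * 5 * 5 * 5 * 4 = 1,500 candidate combinations; with the default 5-fold CV
# used below this amounts to 7,500 model fits (plus the final refit).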
# Define scoring
scoring = ['accuracy', 'neg_log_loss', 'neg_brier_score']
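# scikit-learn scorers follow a "higher is better" convention, so log loss and
# the Brier score are used in their negated forms (the 'neg_' prefix).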
# Create the Random Forest classifier
rf = RandomForestClassifier(random_state=42)
#rf = CalibratedClassifierCV(RandomForestClassifier(random_state=42))
# Define the search strategy (Grid Search in this case)
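# cv=None falls back to 5-fold stratified CV for classifiers; refit='neg_log_loss'
# retrains the best configuration (selected by log loss) on the whole training set.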
grid_search = GridSearchCV(rf, param_grid=param_grid, scoring=scoring, n_jobs=-1,
                           refit='neg_log_loss', cv=None, verbose=0, pre_dispatch='2*n_jobs',
                           return_train_score=False)
# Perform hyperparameter tuning on the training set
grid_search.fit(X_train, y_train)
# best hyperparameters
best_params = grid_search.best_params_
print_out = f"Tunning rf tune \n\n"
print_out += f"param_grid: {grid_search}\n\n"
print_out += f"Best hyperparameters: {best_params}\n\n"
# Applying best hyperparameters
best_rf = RandomForestClassifier(random_state=42, **best_params)
#best_rf = CalibratedClassifierCV(RandomForestClassifier(random_state=42, **best_params))
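# Note: since refit is set, this should be equivalent to grid_search.best_estimator_,
# which has already been trained on the full training set with the best parameters.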
# Retrain the model with the best hyperparameters on the full training set
best_rf.fit(X_train, y_train)
# Evaluate on the train set
y_train_pred = best_rf.predict(X_train)
y_train_pred_proba = best_rf.predict_proba(X_train)[:, 1]
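# predict_proba(...)[:, 1] keeps the probability of the positive class (activity == 1),
# which is what log_loss and brier_score_loss expect for this binary problem.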
train_accuracy = accuracy_score(y_train, y_train_pred)
train_bas = bas(y_train, y_train_pred)
train_logloss = log_loss(y_train, y_train_pred_proba)
train_brier_score = brier_score_loss(y_train, y_train_pred_proba)
train_f1 = f1_score(y_train, y_train_pred)
# Evaluate on the test set
y_test_pred = best_rf.predict(X_test)
y_test_pred_proba = best_rf.predict_proba(X_test)[:, 1]
test_accuracy = accuracy_score(y_test, y_test_pred)
test_bas = bas(y_test, y_test_pred)
test_logloss = log_loss(y_test, y_test_pred_proba)
test_brier_score = brier_score_loss(y_test, y_test_pred_proba)
test_f1 = f1_score(y_test, y_test_pred)
cv_metric = ['accuracy', 'balanced_accuracy', 'neg_log_loss', 'neg_brier_score', 'f1']
cv = 10
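# cross_val_score refits a fresh clone of best_rf on each fold; the 'neg_' metrics
# are reported as negative values (closer to 0 is better).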
for metric in cv_metric:
    cross_val = cross_val_score(best_rf, X_train, y_train, cv=cv, scoring=metric)
    print_out += f"Cross Validation {metric}: {np.mean(cross_val):.4f} ± {np.std(cross_val):.4f}\n"
print_out += f"Accuracy on train, test: {train_accuracy:.4f}, {test_accuracy:.4f}\n"
print_out += f"Balanced Accuracy on train, test: {train_bas:.4f}, {test_bas:.4f}\n"
print_out += f"Log loss on train, test: {train_logloss:.4f}, {test_logloss:.4f}\n"
print_out += f"Brier Score on train, test: {train_brier_score:.4f}, {test_brier_score:.4f}\n"
print_out += f"F1 Score on train, test and external set: {train_f1:.4f}, {test_f1:.4f}\n"
# Save the print output to a text file
with open(f"{out_dir}/RF_tuned_ecfp4e_c2_ne.txt", "w") as file:
    file.write(print_out)