-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsmote_random_data.py
35 lines (30 loc) · 1.37 KB
/
smote_random_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Seed NumPy's global RNG once. None of the estimators/samplers below is given
# an explicit random_state, so the order of the calls in this script matters
# for reproducing the exact printed numbers.
np.random.seed(42)

# Generate random data: 10000 rows of 5 uniform features, and binary labels
# drawn independently of the features with roughly 10% positives. Because the
# labels carry no relationship to X, an honest evaluation should report an
# AUC close to 0.5.
X = np.random.rand(10000, 5)
y = np.random.choice([0, 1], size=(10000, ), p=[0.9, 0.1])


def _fit_and_score(X_fit, y_fit, X_eval, y_eval):
    """Train a fresh 100-tree random forest and return its ROC AUC.

    Args:
        X_fit: Feature matrix used to fit the classifier.
        y_fit: Binary labels matching ``X_fit``.
        X_eval: Feature matrix the fitted classifier is scored on.
        y_eval: Binary labels matching ``X_eval``.

    Returns:
        The ROC AUC of the predicted class-1 probabilities on ``X_eval``.
    """
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(X_fit, y_fit)
    # Column 1 of predict_proba holds the probability of the positive class.
    positive_scores = forest.predict_proba(X_eval)[:, 1]
    return roc_auc_score(y_eval, positive_scores)


# Experiment 1: measure ROC AUC on the test set with no oversampling.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print('AUC no oversampling: {}'.format(
    _fit_and_score(X_train, y_train, X_test, y_test)))

# Experiment 2: oversample only the training partition, then score on the
# untouched test partition. `fit_resample` is the current imbalanced-learn
# API; the older `fit_sample` alias has been removed from the library.
smote = SMOTE()
X_train_s, y_train_s = smote.fit_resample(X_train, y_train)
print('AUC with oversampling after partitioning: {}'.format(
    _fit_and_score(X_train_s, y_train_s, X_test, y_test)))

# Experiment 3: oversample the whole dataset first, then partition. Synthetic
# minority samples are built from original samples, so after the split the
# training and test partitions are no longer independent; any AUC well above
# 0.5 here reflects that leakage, not real predictive signal.
smote = SMOTE()
X_s, y_s = smote.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, random_state=42)
print('AUC with oversampling before partitioning: {}'.format(
    _fit_and_score(X_train, y_train, X_test, y_test)))