-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpful_model_train.py
70 lines (51 loc) · 1.69 KB
/
helpful_model_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import pickle
import warnings
warnings.filterwarnings("ignore")
def create_new_pipeline(params, numerical_columns=None):
    """Build the unfitted preprocessing + random-forest regression pipeline.

    The pipeline has three steps, applied in order:

    1. ``preprocessing`` - a ``ColumnTransformer`` that mean-imputes the
       selected columns.
    2. ``scaling`` - a ``StandardScaler`` over the imputed values.
    3. ``model`` - a ``RandomForestRegressor``.

    Args:
        params: Mapping of extra keyword arguments forwarded to
            ``RandomForestRegressor``. ``n_jobs`` and ``random_state`` are
            fixed here, so they must not appear in ``params`` (Python would
            raise ``TypeError`` for the duplicated keyword). ``None`` is
            treated as "no extra arguments".
        numerical_columns: Columns handed to the imputer. When omitted, the
            module-level ``numerical`` variable is used, which keeps the
            original one-argument call working.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline``.

    Raises:
        NameError: If ``numerical_columns`` is omitted and no module-level
            ``numerical`` has been defined (e.g. when this module is
            imported rather than run as a script).
    """
    if numerical_columns is None:
        # Backward compatibility: the script entry point defines ``numerical``
        # at module level and calls this function with ``params`` only.
        numerical_columns = numerical

    imputer = SimpleImputer(strategy='mean')
    preprocessor = ColumnTransformer(
        transformers=[
            ('numerical', imputer, numerical_columns),
        ],
    )

    forest = RandomForestRegressor(
        n_jobs=-1,        # use all available cores
        random_state=42,  # fixed seed for reproducible training
        **(params or {}),
    )

    return Pipeline(
        steps=[
            ('preprocessing', preprocessor),
            ('scaling', StandardScaler()),
            ('model', forest),
        ],
    )
if __name__ == '__main__':
    # Script entry point: load the dataset, train the pipeline on the training
    # split, and pickle the fitted pipeline to the working directory.
    #
    # Usage: python helpful_model_train.py [path/to/wikihow.csv]
    import sys

    # The original hard-coded location stays as the default so that running
    # the script with no arguments behaves exactly as before; any other
    # machine can pass its own CSV path as the first argument.
    default_csv_path = '/home/sumedhakoranga/Downloads/archive (1)/wikihow.csv'
    csv_path = sys.argv[1] if len(sys.argv) > 1 else default_csv_path

    print('Importing data')
    df = pd.read_csv(csv_path)

    print('Spliting data')
    # df_test is held out here and not used further in this script.
    df_full_train, df_test = train_test_split(
        df, test_size=0.2, random_state=42)

    # Every column except the last is used as a feature.
    # NOTE(review): this assumes 'percent_helpful' is the last column of the
    # CSV and that all other columns are numeric - confirm against the data.
    # ``numerical`` must remain a module-level name: create_new_pipeline()
    # reads it when called with ``params`` only.
    numerical = df.columns[:-1]
    regression_target = ['percent_helpful']

    X = df_full_train[numerical]
    y = df_full_train[regression_target[0]]

    params = {
        'n_estimators': 6,
        'min_samples_split': 6,
        'max_features': 'log2',
    }

    print('Creating pipeline')
    pipeline = create_new_pipeline(params)

    print('Training model')
    pipeline.fit(X, y)

    print('Saving model')
    with open('percent_helpful_model.pickle', 'wb') as f:
        pickle.dump(pipeline, f)