-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
87 lines (87 loc) · 3.54 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sys #lead to model_path
sys.path.append('Models')
import joblib # load joblib model
import streamlit as st # Streamlit app
import pandas as pd #read file
import numpy as np # calculator
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error #import mse library
from random_forest import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# --- Page header ---
# The Font Awesome stylesheet is linked first so the `fa fa-comment` icon
# class used inside the <h1> title is available to the page.
st.markdown(
    "<link rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css'>",
    unsafe_allow_html=True,
)
st.markdown(
    "<h1><i class='fa fa-comment'></i> Comments Prediction</h1>",
    unsafe_allow_html=True,
)

# --- Sidebar: data upload ---
st.sidebar.title('Options')
uploaded_file = st.sidebar.file_uploader("CSV file", type=['csv'])

# --- Sidebar: model choice ---
# The dict keys are the labels offered in the select box; the selected key is
# also what gets passed to load_model() further down.
# NOTE(review): the value "Possion Regression" looks like a misspelling of
# "Poisson"; it is only read into `selected_model_name` below.
models = {
    "Random Forest Regressor": "Random Forest Regressor",
    "Decision Tree Regressor": "Decision Tree Regressor",
    "poisson regression": "Possion Regression",
    # add more models
}
selected_model = st.sidebar.selectbox("Choose model", list(models))
selected_model_name = models[selected_model]
# Model config
# `st.cache` is Streamlit's legacy caching decorator and is deprecated in
# favour of `st.cache_resource` for objects such as ML models. Prefer the new
# decorator when the installed Streamlit provides it, and fall back to the
# legacy one so the app keeps working on older releases.
if hasattr(st, "cache_resource"):
    _cache_model = st.cache_resource
else:
    _cache_model = st.cache(allow_output_mutation=True)


@_cache_model  # Cache (faster process): the model is loaded once per model name
def load_model(model_name):
    """Load a fitted model from the ``Models`` directory.

    The file name is derived from ``model_name`` by lower-casing it and
    replacing spaces with underscores, e.g. ``"Random Forest Regressor"``
    resolves to ``Models/random_forest_regressor_model.joblib``.

    Args:
        model_name: Human-readable model name, as offered in the sidebar.

    Returns:
        Whatever object ``joblib.load`` deserialises from the model file.
    """
    file_stem = model_name.lower().replace(" ", "_")
    # NOTE: joblib.load unpickles the file, so only trusted model files
    # should ever be placed in the Models directory.
    return joblib.load(f'Models/{file_stem}_model.joblib')
# Draw a histogram of one column (e.g. the target column)
def plot_histogram(data, column_name='', bins=20, xlim=(0, 200)):
    """Render a histogram with a KDE overlay in the Streamlit page.

    Args:
        data: Values to plot; passed straight to ``sns.histplot``.
        column_name: Label used in the plot title and on the x-axis.
        bins: Number of histogram bins.
        xlim: ``(low, high)`` limits for the x-axis, or ``None`` to keep the
            limits matplotlib chooses automatically.
    """
    # Work on an explicit figure/axes pair instead of pyplot's global state,
    # so the exact figure that was drawn is the one handed to Streamlit.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(data, bins=bins, kde=True, ax=ax)
        ax.set_title(f'Histogram of {column_name}')
        ax.set_xlabel(column_name)
        ax.set_ylabel('Frequency')
        if xlim is not None:
            ax.set_xlim(*xlim)
        st.pyplot(fig)
    finally:
        # Streamlit re-executes the script on every interaction; without
        # closing, each rerun would leave another open figure in memory.
        plt.close(fig)
# Process
if uploaded_file is not None:
    # Read the file and preprocess the data. The CSV is read without a header
    # row, so columns are addressed by position only.
    try:
        data = pd.read_csv(uploaded_file, header=None)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Show a readable message instead of a raw traceback, then end this run.
        st.error(f"Could not read the uploaded CSV file: {exc}")
        st.stop()

    data = data.fillna(1)  # replace NaN with 1
    # Drop the first 50 columns and the last column (target column).
    data_cut = data.iloc[:, 50:-1]
    if data.empty or data_cut.shape[1] == 0:
        # With no rows, or with 51 columns or fewer, there is nothing left to
        # feed the model after the slice above.
        st.error(
            "The uploaded file needs at least one row and more than 51 "
            "columns (50 dropped columns, at least one feature, and the "
            "target as the last column)."
        )
        st.stop()

    target_column = data.iloc[:, -1]  # the last column is the target

    # Print the data and visualise it.
    st.write("Uploaded Data:")
    st.write(data)
    st.sidebar.subheader("Visualization Options")
    if st.sidebar.checkbox("Histogram of Target"):
        plot_histogram(target_column, 'Target')
    # if st.sidebar.checkbox("Correlation Matrix"):
    #     plot_corr(data)
    # if st.sidebar.checkbox("Box Plot of First 10 Features"):
    #     plot_box(data)

    # Load the model chosen in the sidebar.
    model = load_model(selected_model)

    # Predict. The rounded integer predictions are for display only.
    X = data_cut.values  # input
    predictions = model.predict(X)
    predictions_rounded = np.round(predictions).astype(int)

    # MSE of the raw (unrounded) predictions, plus the MSE of a baseline that
    # always predicts the mean of the target column.
    y_mean = np.mean(target_column)
    mse = mean_squared_error(target_column, predictions)
    mse_baseline = np.mean((target_column - y_mean) ** 2)

    # Result table: the feature columns that were fed to the model, relabelled
    # Feature_1..Feature_n, followed by the rounded prediction.
    result_df = pd.DataFrame({
        **{f'Feature_{i+1}': data_cut[col] for i, col in enumerate(data_cut.columns)},
        'Predicted Label': predictions_rounded,
    })
    if st.sidebar.checkbox("Histogram of Predicted Labels"):
        plot_histogram(result_df['Predicted Label'], 'Predicted Label')

    # Print the results.
    st.write("> The first 50 columns : Average,Standard deviation, min, max and median of the Attribute 51 to 60 for the source of the current blog post")
    st.write("> We drop the first 50 columns")
    st.write(f"Result {selected_model}:")
    st.write(result_df)
    st.write("MSE:", mse)
    st.write("MSE_baseline:", mse_baseline)

    # Compare the model against the baseline. One chain, so exactly one
    # verdict is shown, including the tie that previously showed nothing.
    if mse < mse_baseline:
        st.success(":heavy_check_mark: mse < mse_baseline")
    elif mse > mse_baseline:
        st.error(":warning: mse > mse_baseline")
    elif mse == mse_baseline:
        st.info("mse == mse_baseline")