-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoinBase.py~
123 lines (92 loc) · 3.92 KB
/
coinBase.py~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#run source py3/bin/activate to activate correct python env to run this file
import pandas as pd
import quandl
import math, datetime
import numpy as np
from sklearn import preprocessing, cross_validation, svm
import matplotlib.pyplot as plt
from matplotlib import style
#pickling (serializing) a trained classifier saves time because it can be reloaded instead of retrained
import pickle
style.use('ggplot')
#svm is a support vector machine
from sklearn.linear_model import LinearRegression
# Download the WIKI/GOOGL daily price table from Quandl (free wiki dataset).
df = quandl.get('WIKI/GOOGL')
# Each column is a feature, e.g. open, high, low, close, ...
print(df.head())
# Keep only the adjusted price/volume columns. .copy() gives an independent
# frame so the derived columns added below are not written into a slice.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()
# The margin between the high and the low says something about the day's
# volatility; the open compared to the close says whether the price went up
# or down that day.
# High-low spread as a percentage of the close (% volatility for the day).
# This used to subtract the close instead of the low, which contradicted the
# stated intent of measuring the high/low margin.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
# Daily % change: close relative to open (positive means the price rose).
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
# New dataframe holding only the columns used as model inputs.
# Volume is how many trades occurred in one day.
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']].copy()
# Features are the attributes that make up the label;
# the label is the value being predicted, here a future adjusted close.
forecast_col = 'Adj. Close'
# Fill any empty spots with an outlier sentinel instead of dropping the rows.
df.fillna(-99999, inplace=True)
# Forecast horizon in rows: 1% of the dataset length, rounded up.
forecast_out = int(math.ceil(0.01 * len(df)))
print("number of days we are predicting %d" %forecast_out)
# Shift the column negatively so each row's label is the adjusted close
# forecast_out rows later. The last forecast_out rows therefore get NaN labels.
df['label'] = df[forecast_col].shift(-forecast_out)
# Features are a capital X: every column except the label, standardized.
# axis is passed by keyword because newer pandas rejects the positional form.
X = preprocessing.scale(np.array(df.drop(['label'], axis=1)))
# Take the unlabeled tail BEFORE truncating X. Slicing after the truncation
# (as before) returned the last *labeled* rows, so the "forecast" was just a
# re-prediction of rows the model had already been trained/tested on.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]
# Labels are a lowercase y: only the rows that actually have a label.
# The NaN-labeled rows are deliberately left in df (rather than dropped in
# place) so its last index entry is still the most recent observed date,
# which is the date the predictions for X_lately follow on from.
y = np.array(df['label'].dropna())
print(len(X), len(y))
# Hold out 20% of the rows for testing. The split shuffles features and labels
# but keeps each X row paired with its y value.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20 in
# favour of sklearn.model_selection; the file-level import must be migrated
# together with this call when upgrading.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
# **** Could comment out from here once the pickle is saved, to skip retraining ****
# clf is a classifier (here a regressor).
# n_jobs is how many parallel jobs to use (-1 uses as many as the CPU allows).
clfLR = LinearRegression(n_jobs=10)
clfLR.fit(X_train, y_train)
# Save the trained model so it does not have to be trained on every run.
with open('linearregression.pickle', 'wb') as f:
    pickle.dump(clfLR, f)
# *** can comment out up to here if using the pickle saved in the local dir ***
# Load the model back in. The context manager closes the file handle, which
# the previous bare open() never did.
with open('linearregression.pickle', 'rb') as pickle_in:
    clfLR = pickle.load(pickle_in)
# Score the reloaded linear model on the held-out rows.
accuracyLR = clfLR.score(X_test, y_test)
print("Linear regression test: %f"%accuracyLR)
# Same train/score cycle with a support vector regressor for comparison.
clfSVM = svm.SVR()
clfSVM.fit(X_train, y_train)
accuracySVM = clfSVM.score(X_test, y_test)
print("support vector machine: %f" %accuracySVM)
# Predict a value for every row of X_lately with the linear regression model.
forecast_set = clfLR.predict(X_lately)
print(forecast_set, accuracyLR, forecast_out)
# New column for predictions; existing rows have no forecast.
df['Forecast'] = np.nan
# The frame is indexed by date, so the last row's name is its latest date.
last_date = df.iloc[-1].name
# Step forward one calendar day per prediction using date arithmetic.
# The earlier timestamp()/fromtimestamp() round trip mixed UTC and local time,
# which shifted every forecast date by the machine's UTC offset.
one_day = datetime.timedelta(days=1)
next_date = last_date + one_day
for prediction in forecast_set:
    # df.loc[next_date] appends a row keyed by that date: NaN for every
    # column except the last one ('Forecast'), which gets the prediction.
    df.loc[next_date] = [np.nan] * (len(df.columns) - 1) + [prediction]
    next_date += one_day
# Plot the observed closes and the forecast on the same axes.
df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()
#pickling saves your trained classifier so you don't have to retrain it every time you run your program;
#it can then be loaded back in without any training time