some changes #21 (Open)

wants to merge 4 commits into base: master
56 changes: 23 additions & 33 deletions data_fetcher.py
@@ -5,39 +5,38 @@
import click
import os
import pandas as pd
import pandas_datareader.data as web
import random
import time
import urllib2
import urllib.request, urllib.error, urllib.parse

from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup
from datetime import datetime

DATA_DIR = "data"
RANDOM_SLEEP_TIMES = (1, 5)

# This repo "github.com/datasets/s-and-p-500-companies" has some other information about
# S & P 500 companies.
SP500_LIST_URL = "https://raw.githubusercontent.com/datasets/s-and-p-500-companies/master/data/constituents-financials.csv"
SP500_LIST_URL = "https://raw.githubusercontent.com/datasets/s-and-p-companies-financials/master/data/constituents-financials.csv"
SP500_LIST_PATH = os.path.join(DATA_DIR, "constituents-financials.csv")


def _download_sp500_list():
if os.path.exists(SP500_LIST_PATH):
return

f = urllib2.urlopen(SP500_LIST_URL)
print "Downloading ...", SP500_LIST_URL
with open(SP500_LIST_PATH, 'w') as fin:
print >> fin, f.read()
print("Downloading ...", SP500_LIST_URL)
f = urllib.request.urlretrieve(SP500_LIST_URL, SP500_LIST_PATH)
return


def _load_symbols():
_download_sp500_list()
df_sp500 = pd.read_csv(SP500_LIST_PATH)
df_sp500.sort('Market Cap', ascending=False, inplace=True)
df_sp500.sort_values(by='Market Cap', ascending=False, inplace=True)
stock_symbols = df_sp500['Symbol'].unique().tolist()
print "Loaded %d stock symbols" % len(stock_symbols)
print("Loaded %d stock symbols" % len(stock_symbols))
return stock_symbols


@@ -51,36 +50,27 @@ def fetch_prices(symbol, out_name):
Returns: a bool, whether the fetch is succeeded.
"""
# Format today's date to match Google's finance history api.
now_datetime = datetime.now().strftime("%b+%d,+%Y")

BASE_URL = "https://finance.google.com/finance/historical?output=csv&q={0}&startdate=Jan+1%2C+1980&enddate={1}"
symbol_url = BASE_URL.format(
urllib2.quote(symbol),
urllib2.quote(now_datetime, '+')
)
print "Fetching {} ...".format(symbol)
print symbol_url

start = datetime(1980, 1, 1)
end = datetime.now().strftime("%Y-%m-%d")
try:
f = urllib2.urlopen(symbol_url)
with open(out_name, 'w') as fin:
print >> fin, f.read()
except urllib2.HTTPError:
print "Failed when fetching {}".format(symbol)
print("Fetching {} ...".format(symbol))
web.DataReader(symbol, 'quandl', start, end).to_csv(out_name)
except:
print("Failed when fetching {}".format(symbol))
return False

data = pd.read_csv(out_name)
if data.empty:
print "Remove {} because the data set is empty.".format(out_name)
print("Remove {} because the data set is empty.".format(out_name))
os.remove(out_name)
else:
dates = data.iloc[:,0].tolist()
print "# Fetched rows: %d [%s to %s]" % (data.shape[0], dates[-1], dates[0])
dates = data.iloc[:, 0].tolist()
print("# Fetched rows: %d [%s to %s]" % (data.shape[0], dates[-1], dates[0]))

# Take a rest
sleep_time = random.randint(*RANDOM_SLEEP_TIMES)
print "Sleeping ... %ds" % sleep_time
time.sleep(sleep_time)
# sleep_time = random.randint(*RANDOM_SLEEP_TIMES)
# print("Sleeping ... %ds" % sleep_time)
# time.sleep(sleep_time)
return True


@@ -91,20 +81,20 @@ def main(continued):
num_failure = 0

# This is S&P 500 index
#fetch_prices('INDEXSP%3A.INX')
# fetch_prices('INDEXSP%3A.INX')

symbols = _load_symbols()
for idx, sym in enumerate(symbols):
out_name = os.path.join(DATA_DIR, sym + ".csv")
if continued and os.path.exists(out_name):
print "Fetched", sym
print("Fetched", sym)
continue

succeeded = fetch_prices(sym, out_name)
num_failure += int(not succeeded)

if idx % 10 == 0:
print "# Failures so far [%d/%d]: %d" % (idx + 1, len(symbols), num_failure)
print("# Failures so far [%d/%d]: %d" % (idx + 1, len(symbols), num_failure))


if __name__ == "__main__":
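For reviewers, a minimal standalone sketch of the fetch path this diff moves to: urllib.request.urlretrieve for the constituents CSV and pandas_datareader's Quandl source for prices. This is not part of the PR; the "AAPL" symbol and the output paths are illustrative only, and the Quandl source typically needs a QUANDL_API_KEY set in the environment.

import os
import urllib.request
from datetime import datetime

import pandas_datareader.data as web

DATA_DIR = "data"
SP500_LIST_URL = ("https://raw.githubusercontent.com/datasets/"
                  "s-and-p-companies-financials/master/data/constituents-financials.csv")

os.makedirs(DATA_DIR, exist_ok=True)
csv_path = os.path.join(DATA_DIR, "constituents-financials.csv")
if not os.path.exists(csv_path):
    # urlretrieve streams the response straight to disk, replacing the old
    # urllib2.urlopen + manual file write.
    urllib.request.urlretrieve(SP500_LIST_URL, csv_path)

# Illustrative symbol; DataReader with the 'quandl' source replaces the
# hand-built Google Finance CSV URL from the old code.
start, end = datetime(1980, 1, 1), datetime.now()
prices = web.DataReader("AAPL", "quandl", start, end)
prices.to_csv(os.path.join(DATA_DIR, "AAPL.csv"))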
18 changes: 10 additions & 8 deletions data_model.py
@@ -7,23 +7,26 @@
random.seed(time.time())


class StockDataSet(object):
class StockDataSet():
def __init__(self,
stock_sym,
input_size=1,
num_steps=30,
batch_size=64,
test_ratio=0.1,
normalized=True,
close_price_only=True):
self.stock_sym = stock_sym
self.input_size = input_size
self.num_steps = num_steps
self.batch_size = batch_size
self.test_ratio = test_ratio
self.close_price_only = close_price_only
self.normalized = normalized

# Read csv file
raw_df = pd.read_csv(os.path.join("data", "%s.csv" % stock_sym))
raw_df = raw_df.sort_values(by='Date').reset_index(drop=True)

# Merge into one sequence
if close_price_only:
@@ -48,22 +51,21 @@ def _prepare_data(self, seq):
curr / seq[i][-1] - 1.0 for i, curr in enumerate(seq[1:])]

# split into groups of num_steps
X = np.array([seq[i: i + self.num_steps] for i in range(len(seq) - self.num_steps)])
y = np.array([seq[i + self.num_steps] for i in range(len(seq) - self.num_steps)])
self.X_y = np.array([seq[i: i + self.num_steps+1] for i in range(len(seq) - self.num_steps)])
X = self.X_y[:, :self.num_steps, :]
y = self.X_y[:, self.num_steps, :]

train_size = int(len(X) * (1.0 - self.test_ratio))
train_size -= train_size % self.batch_size
train_X, test_X = X[:train_size], X[train_size:]
train_y, test_y = y[:train_size], y[train_size:]
return train_X, train_y, test_X, test_y

def generate_one_epoch(self, batch_size):
num_batches = int(len(self.train_X)) // batch_size
if batch_size * num_batches < len(self.train_X):
num_batches += 1

batch_indices = range(num_batches)
random.shuffle(batch_indices)
for j in batch_indices:
np.random.shuffle(self.X_y)
for j in range(num_batches):
batch_X = self.train_X[j * batch_size: (j + 1) * batch_size]
batch_y = self.train_y[j * batch_size: (j + 1) * batch_size]
assert set(map(len, batch_X)) == {self.num_steps}
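To make the new windowing easier to review, here is a toy, self-contained version of the same slicing (numbers are invented, not from the PR): each window of length num_steps + 1 holds num_steps input points plus the next point as the target.

import numpy as np

num_steps = 3
# Toy sequence of six one-dimensional points, shaped (6, 1) like the seq argument above.
seq = np.arange(6, dtype=float).reshape(-1, 1)

# One window per starting index, as in the diff.
X_y = np.array([seq[i: i + num_steps + 1] for i in range(len(seq) - num_steps)])
X = X_y[:, :num_steps, :]   # inputs: the first num_steps points of each window
y = X_y[:, num_steps, :]    # target: the point right after them

print(X_y.shape)                  # (3, 4, 1): three windows of four points
print(X[0].ravel(), "->", y[0])   # [0. 1. 2.] -> [3.]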
11 changes: 7 additions & 4 deletions main.py
@@ -38,29 +38,30 @@ def show_all_variables():
slim.model_analyzer.analyze_vars(model_vars, print_info=True)


def load_sp500(input_size, num_steps, k=None, target_symbol=None, test_ratio=0.05):
def load_sp500(input_size, num_steps, batch_size, k=None, target_symbol=None, test_ratio=0.05):
if target_symbol is not None:
return [
StockDataSet(
target_symbol,
input_size=input_size,
num_steps=num_steps,
batch_size=batch_size,
test_ratio=test_ratio)
]

# Load metadata of s & p 500 stocks
info = pd.read_csv("data/constituents-financials.csv")
info = info.rename(columns={col: col.lower().replace(' ', '_') for col in info.columns})
info['file_exists'] = info['symbol'].map(lambda x: os.path.exists("data/{}.csv".format(x)))
print info['file_exists'].value_counts().to_dict()
print(info['file_exists'].value_counts().to_dict())

info = info[info['file_exists'] == True].reset_index(drop=True)
info = info.sort('market_cap', ascending=False).reset_index(drop=True)
info = info.sort_values(by='market_cap', ascending=False).reset_index(drop=True)

if k is not None:
info = info.head(k)

print "Head of S&P 500 info:\n", info.head()
print("Head of S&P 500 info:\n", info.head())

# Generate embedding meta file
info[['symbol', 'sector']].to_csv(os.path.join("logs/metadata.tsv"), sep='\t', index=False)
@@ -69,6 +70,7 @@ def load_sp500(input_size, num_steps, k=None, target_symbol=None, test_ratio=0.0
StockDataSet(row['symbol'],
input_size=input_size,
num_steps=num_steps,
batch_size=batch_size,
test_ratio=0.05)
for _, row in info.iterrows()]

@@ -96,6 +98,7 @@ def main(_):
stock_data_list = load_sp500(
FLAGS.input_size,
FLAGS.num_steps,
FLAGS.batch_size,
k=FLAGS.stock_count,
target_symbol=FLAGS.stock_symbol,
)
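As a quick illustration of the column normalization and sort used above, here is a toy frame standing in for data/constituents-financials.csv (values invented, not from the PR):

import pandas as pd

info = pd.DataFrame({"Symbol": ["AAPL", "MSFT"],
                     "Market Cap": [1.0, 2.0],
                     "Sector": ["Tech", "Tech"]})
# Lower-case the headers and replace spaces so they can be used as plain keys.
info = info.rename(columns={col: col.lower().replace(' ', '_') for col in info.columns})
print(list(info.columns))  # ['symbol', 'market_cap', 'sector']
info = info.sort_values(by='market_cap', ascending=False).reset_index(drop=True)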
32 changes: 16 additions & 16 deletions model_rnn.py
@@ -14,7 +14,7 @@
from tensorflow.contrib.tensorboard.plugins import projector


class LstmRNN(object):
class LstmRNN():
def __init__(self, sess, stock_count,
lstm_size=128,
num_layers=1,
@@ -87,7 +87,7 @@ def _create_one_cell():
tf.random_uniform([self.stock_count, self.embed_size], -1.0, 1.0),
name="embed_matrix"
)

# stock_label_embeds.shape = (batch_size, embedding_size)
stacked_symbols = tf.tile(self.symbols, [1, self.num_steps], name='stacked_stock_labels')
stacked_embeds = tf.nn.embedding_lookup(self.embed_matrix, stacked_symbols)
@@ -100,8 +100,8 @@ def _create_one_cell():
self.inputs_with_embed = tf.identity(self.inputs)
self.embed_matrix_summ = None

print "inputs.shape:", self.inputs.shape
print "inputs_with_embed.shape:", self.inputs_with_embed.shape
print("inputs.shape:", self.inputs.shape)
print("inputs_with_embed.shape:", self.inputs_with_embed.shape)

# Run dynamic RNN
val, state_ = tf.nn.dynamic_rnn(cell, self.inputs_with_embed, dtype=tf.float32, scope="dynamic_rnn")
@@ -180,9 +180,9 @@ def train(self, dataset_list, config):
merged_test_y = np.array(merged_test_y)
merged_test_labels = np.array(merged_test_labels)

print "len(merged_test_X) =", len(merged_test_X)
print "len(merged_test_y) =", len(merged_test_y)
print "len(merged_test_labels) =", len(merged_test_labels)
print("len(merged_test_X) =", len(merged_test_X))
print("len(merged_test_y) =", len(merged_test_y))
print("len(merged_test_labels) =", len(merged_test_labels))

test_data_feed = {
self.learning_rate: 0.0,
@@ -198,18 +198,18 @@ def train(self, dataset_list, config):
random.seed(time.time())

# Select samples for plotting.
sample_labels = range(min(config.sample_size, len(dataset_list)))
sample_labels = list(range(min(config.sample_size, len(dataset_list))))
sample_indices = {}
for l in sample_labels:
sym = dataset_list[l].stock_sym
target_indices = np.array([
i for i, sym_label in enumerate(merged_test_labels)
if sym_label[0] == l])
sample_indices[sym] = target_indices
print sample_indices
print(sample_indices)

print "Start training for stocks:", [d.stock_sym for d in dataset_list]
for epoch in xrange(config.max_epoch):
print("Start training for stocks:", [d.stock_sym for d in dataset_list])
for epoch in range(config.max_epoch):
epoch_step = 0
learning_rate = config.init_learning_rate * (
config.learning_rate_decay ** max(float(epoch + 1 - config.init_epoch), 0.0)
@@ -234,11 +234,11 @@ def train(self, dataset_list, config):
if np.mod(global_step, len(dataset_list) * 200 / config.input_size) == 1:
test_loss, test_pred = self.sess.run([self.loss_test, self.pred], test_data_feed)

print "Step:%d [Epoch:%d] [Learning rate: %.6f] train_loss:%.6f test_loss:%.6f" % (
global_step, epoch, learning_rate, train_loss, test_loss)
print("Step:%d [Epoch:%d] [Learning rate: %.6f] train_loss:%.6f test_loss:%.6f" % (
global_step, epoch, learning_rate, train_loss, test_loss))

# Plot samples
for sample_sym, indices in sample_indices.iteritems():
for sample_sym, indices in sample_indices.items():
image_path = os.path.join(self.model_plots_dir, "{}_epoch{:02d}_step{:04d}.png".format(
sample_sym, epoch, epoch_step))
sample_preds = test_pred[indices]
@@ -292,7 +292,7 @@ def load(self):
ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
self.saver.restore(self.sess, os.path.join(self.model_logs_dir, ckpt_name))
counter = int(next(re.finditer("(\d+)(?!.*\d)", ckpt_name)).group(0))
print(" [*] Success to read {}".format(ckpt_name))
print((" [*] Success to read {}".format(ckpt_name)))
return True, counter

else:
@@ -305,7 +305,7 @@ def _flatten(seq):

truths = _flatten(targets)[-200:]
preds = (_flatten(preds) * multiplier)[-200:]
days = range(len(truths))[-200:]
days = list(range(len(truths)))[-200:]

plt.figure(figsize=(12, 6))
plt.plot(days, truths, label='truth')
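A compact reminder of the Python 2-to-3 idioms this file's changes rely on (illustrative snippet, not project code):

sample_indices = {"AAPL": [0, 1, 2]}

# dict.iteritems() is gone; items() returns a lazy view in Python 3.
for sym, indices in sample_indices.items():
    print(sym, indices)            # print is a function, not a statement

# xrange() is gone; range() is already lazy.
for epoch in range(3):
    pass

# range() is not a list, so materialize it where list behaviour is needed,
# e.g. before random.shuffle() or when a plain list is expected downstream.
days = list(range(10))[-5:]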
7 changes: 7 additions & 0 deletions requirements.txt
@@ -0,0 +1,7 @@
BeautifulSoup4
numpy>=1.13.1
pandas>=0.16.2
scikit-learn>=0.16.1
scipy>=0.19.1
tensorflow>=1.2.1
urllib3>=1.8
4 changes: 2 additions & 2 deletions scripts/config.py
@@ -13,7 +13,7 @@ class RNNConfig():

def to_dict(self):
dct = self.__class__.__dict__
return {k: v for k, v in dct.iteritems() if not k.startswith('__') and not callable(v)}
return {k: v for k, v in dct.items() if not k.startswith('__') and not callable(v)}

def __str__(self):
return str(self.to_dict())
@@ -23,7 +23,7 @@ def __repr__(self):


DEFAULT_CONFIG = RNNConfig()
print "Default configuration:", DEFAULT_CONFIG.to_dict()
print("Default configuration:", DEFAULT_CONFIG.to_dict())

DATA_DIR = "data"
LOG_DIR = "logs"
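For context, a toy version of the to_dict() pattern after the items() change (illustrative class, not the project's real RNNConfig):

class ToyConfig():
    lstm_size = 128
    num_layers = 1

    def to_dict(self):
        dct = self.__class__.__dict__
        return {k: v for k, v in dct.items()
                if not k.startswith('__') and not callable(v)}

print(ToyConfig().to_dict())  # {'lstm_size': 128, 'num_layers': 1}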
2 changes: 1 addition & 1 deletion scripts/restore_model.py
@@ -4,7 +4,7 @@
import os
import tensorflow as tf

from config import MODEL_DIR
from scripts.config import MODEL_DIR


def prediction_by_trained_graph(graph_name, max_epoch, test_X, test_y):