some changes #21 (Open)

wants to merge 4 commits into base: master
56 changes: 23 additions & 33 deletions data_fetcher.py
@@ -5,39 +5,38 @@
import click
import os
import pandas as pd
import pandas_datareader.data as web
import random
import time
import urllib2
import urllib.request, urllib.error, urllib.parse

from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup
from datetime import datetime

DATA_DIR = "data"
RANDOM_SLEEP_TIMES = (1, 5)

# This repo "github.com/datasets/s-and-p-500-companies" has some other information about
# S & P 500 companies.
SP500_LIST_URL = "https://raw.githubusercontent.com/datasets/s-and-p-500-companies/master/data/constituents-financials.csv"
SP500_LIST_URL = "https://raw.githubusercontent.com/datasets/s-and-p-companies-financials/master/data/constituents-financials.csv"
SP500_LIST_PATH = os.path.join(DATA_DIR, "constituents-financials.csv")


def _download_sp500_list():
if os.path.exists(SP500_LIST_PATH):
return

f = urllib2.urlopen(SP500_LIST_URL)
print "Downloading ...", SP500_LIST_URL
with open(SP500_LIST_PATH, 'w') as fin:
print >> fin, f.read()
print("Downloading ...", SP500_LIST_URL)
f = urllib.request.urlretrieve(SP500_LIST_URL, SP500_LIST_PATH)
return


def _load_symbols():
_download_sp500_list()
df_sp500 = pd.read_csv(SP500_LIST_PATH)
df_sp500.sort('Market Cap', ascending=False, inplace=True)
df_sp500.sort_values(by='Market Cap', ascending=False, inplace=True)
stock_symbols = df_sp500['Symbol'].unique().tolist()
print "Loaded %d stock symbols" % len(stock_symbols)
print("Loaded %d stock symbols" % len(stock_symbols))
return stock_symbols


@@ -51,36 +50,27 @@ def fetch_prices(symbol, out_name):
Returns: a bool, whether the fetch is succeeded.
"""
# Format today's date to match Google's finance history api.
now_datetime = datetime.now().strftime("%b+%d,+%Y")

BASE_URL = "https://finance.google.com/finance/historical?output=csv&q={0}&startdate=Jan+1%2C+1980&enddate={1}"
symbol_url = BASE_URL.format(
urllib2.quote(symbol),
urllib2.quote(now_datetime, '+')
)
print "Fetching {} ...".format(symbol)
print symbol_url

start = datetime(1980, 1, 1)
end = datetime.now().strftime("%Y-%m-%d")
try:
f = urllib2.urlopen(symbol_url)
with open(out_name, 'w') as fin:
print >> fin, f.read()
except urllib2.HTTPError:
print "Failed when fetching {}".format(symbol)
print("Fetching {} ...".format(symbol))
web.DataReader(symbol, 'quandl', start, end).to_csv(out_name)
except:
print("Failed when fetching {}".format(symbol))
return False

data = pd.read_csv(out_name)
if data.empty:
print "Remove {} because the data set is empty.".format(out_name)
print("Remove {} because the data set is empty.".format(out_name))
os.remove(out_name)
else:
dates = data.iloc[:,0].tolist()
print "# Fetched rows: %d [%s to %s]" % (data.shape[0], dates[-1], dates[0])
dates = data.iloc[:, 0].tolist()
print("# Fetched rows: %d [%s to %s]" % (data.shape[0], dates[-1], dates[0]))

# Take a rest
sleep_time = random.randint(*RANDOM_SLEEP_TIMES)
print "Sleeping ... %ds" % sleep_time
time.sleep(sleep_time)
# sleep_time = random.randint(*RANDOM_SLEEP_TIMES)
# print("Sleeping ... %ds" % sleep_time)
# time.sleep(sleep_time)
return True


@@ -91,20 +81,20 @@ def main(continued):
num_failure = 0

# This is S&P 500 index
#fetch_prices('INDEXSP%3A.INX')
# fetch_prices('INDEXSP%3A.INX')

symbols = _load_symbols()
for idx, sym in enumerate(symbols):
out_name = os.path.join(DATA_DIR, sym + ".csv")
if continued and os.path.exists(out_name):
print "Fetched", sym
print("Fetched", sym)
continue

succeeded = fetch_prices(sym, out_name)
num_failure += int(not succeeded)

if idx % 10 == 0:
print "# Failures so far [%d/%d]: %d" % (idx + 1, len(symbols), num_failure)
print("# Failures so far [%d/%d]: %d" % (idx + 1, len(symbols), num_failure))


if __name__ == "__main__":
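For reviewers, a minimal standalone sketch of the fetch path this diff moves to: urllib.request.urlretrieve for the constituents CSV and pandas_datareader's Quandl source for prices. This is not part of the PR; the "AAPL" symbol and the output paths are illustrative only, and the Quandl source typically needs a QUANDL_API_KEY set in the environment.

import os
import urllib.request
from datetime import datetime

import pandas_datareader.data as web

DATA_DIR = "data"
SP500_LIST_URL = ("https://raw.githubusercontent.com/datasets/"
                  "s-and-p-companies-financials/master/data/constituents-financials.csv")

os.makedirs(DATA_DIR, exist_ok=True)
csv_path = os.path.join(DATA_DIR, "constituents-financials.csv")
if not os.path.exists(csv_path):
    # urlretrieve streams the response straight to disk, replacing the old
    # urllib2.urlopen + manual file write.
    urllib.request.urlretrieve(SP500_LIST_URL, csv_path)

# Illustrative symbol; DataReader with the 'quandl' source replaces the
# hand-built Google Finance CSV URL from the old code.
start, end = datetime(1980, 1, 1), datetime.now()
prices = web.DataReader("AAPL", "quandl", start, end)
prices.to_csv(os.path.join(DATA_DIR, "AAPL.csv"))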
18 changes: 10 additions & 8 deletions data_model.py
@@ -7,23 +7,26 @@
random.seed(time.time())


class StockDataSet(object):
class StockDataSet():
def __init__(self,
stock_sym,
input_size=1,
num_steps=30,
batch_size=64,
test_ratio=0.1,
normalized=True,
close_price_only=True):
self.stock_sym = stock_sym
self.input_size = input_size
self.num_steps = num_steps
self.batch_size = batch_size
self.test_ratio = test_ratio
self.close_price_only = close_price_only
self.normalized = normalized

# Read csv file
raw_df = pd.read_csv(os.path.join("data", "%s.csv" % stock_sym))
raw_df = raw_df.sort_values(by='Date').reset_index(drop=True)

# Merge into one sequence
if close_price_only:
@@ -48,22 +51,21 @@ def _prepare_data(self, seq):
curr / seq[i][-1] - 1.0 for i, curr in enumerate(seq[1:])]

# split into groups of num_steps
X = np.array([seq[i: i + self.num_steps] for i in range(len(seq) - self.num_steps)])
y = np.array([seq[i + self.num_steps] for i in range(len(seq) - self.num_steps)])
self.X_y = np.array([seq[i: i + self.num_steps+1] for i in range(len(seq) - self.num_steps)])
X = self.X_y[:, :self.num_steps, :]
y = self.X_y[:, self.num_steps, :]

train_size = int(len(X) * (1.0 - self.test_ratio))
train_size -= train_size % self.batch_size
train_X, test_X = X[:train_size], X[train_size:]
train_y, test_y = y[:train_size], y[train_size:]
return train_X, train_y, test_X, test_y

def generate_one_epoch(self, batch_size):
num_batches = int(len(self.train_X)) // batch_size
if batch_size * num_batches < len(self.train_X):
num_batches += 1

batch_indices = range(num_batches)
random.shuffle(batch_indices)
for j in batch_indices:
np.random.shuffle(self.X_y)
for j in range(num_batches):
batch_X = self.train_X[j * batch_size: (j + 1) * batch_size]
batch_y = self.train_y[j * batch_size: (j + 1) * batch_size]
assert set(map(len, batch_X)) == {self.num_steps}
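To make the new windowing easier to review, here is a toy, self-contained version of the same slicing (numbers are invented, not from the PR): each window of length num_steps + 1 holds num_steps input points plus the next point as the target.

import numpy as np

num_steps = 3
# Toy sequence of six one-dimensional points, shaped (6, 1) like the seq argument above.
seq = np.arange(6, dtype=float).reshape(-1, 1)

# One window per starting index, as in the diff.
X_y = np.array([seq[i: i + num_steps + 1] for i in range(len(seq) - num_steps)])
X = X_y[:, :num_steps, :]   # inputs: the first num_steps points of each window
y = X_y[:, num_steps, :]    # target: the point right after them

print(X_y.shape)                  # (3, 4, 1): three windows of four points
print(X[0].ravel(), "->", y[0])   # [0. 1. 2.] -> [3.]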
11 changes: 7 additions & 4 deletions main.py
@@ -38,29 +38,30 @@ def show_all_variables():
slim.model_analyzer.analyze_vars(model_vars, print_info=True)


def load_sp500(input_size, num_steps, k=None, target_symbol=None, test_ratio=0.05):
def load_sp500(input_size, num_steps, batch_size, k=None, target_symbol=None, test_ratio=0.05):
if target_symbol is not None:
return [
StockDataSet(
target_symbol,
input_size=input_size,
num_steps=num_steps,
batch_size=batch_size,
test_ratio=test_ratio)
]

# Load metadata of s & p 500 stocks
info = pd.read_csv("data/constituents-financials.csv")
info = info.rename(columns={col: col.lower().replace(' ', '_') for col in info.columns})
info['file_exists'] = info['symbol'].map(lambda x: os.path.exists("data/{}.csv".format(x)))
print info['file_exists'].value_counts().to_dict()
print(info['file_exists'].value_counts().to_dict())

info = info[info['file_exists'] == True].reset_index(drop=True)
info = info.sort('market_cap', ascending=False).reset_index(drop=True)
info = info.sort_values(by='market_cap', ascending=False).reset_index(drop=True)

if k is not None:
info = info.head(k)

print "Head of S&P 500 info:\n", info.head()
print("Head of S&P 500 info:\n", info.head())

# Generate embedding meta file
info[['symbol', 'sector']].to_csv(os.path.join("logs/metadata.tsv"), sep='\t', index=False)
@@ -69,6 +70,7 @@ def load_sp500(input_size, num_steps, k=None, target_symbol=None, test_ratio=0.0
StockDataSet(row['symbol'],
input_size=input_size,
num_steps=num_steps,
batch_size=batch_size,
test_ratio=0.05)
for _, row in info.iterrows()]

@@ -96,6 +98,7 @@ def main(_):
stock_data_list = load_sp500(
FLAGS.input_size,
FLAGS.num_steps,
FLAGS.batch_size,
k=FLAGS.stock_count,
target_symbol=FLAGS.stock_symbol,
)
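As a quick illustration of the column normalization and sort used above, here is a toy frame standing in for data/constituents-financials.csv (values invented, not from the PR):

import pandas as pd

info = pd.DataFrame({"Symbol": ["AAPL", "MSFT"],
                     "Market Cap": [1.0, 2.0],
                     "Sector": ["Tech", "Tech"]})
# Lower-case the headers and replace spaces so they can be used as plain keys.
info = info.rename(columns={col: col.lower().replace(' ', '_') for col in info.columns})
print(list(info.columns))  # ['symbol', 'market_cap', 'sector']
info = info.sort_values(by='market_cap', ascending=False).reset_index(drop=True)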
32 changes: 16 additions & 16 deletions model_rnn.py
@@ -14,7 +14,7 @@
from tensorflow.contrib.tensorboard.plugins import projector


class LstmRNN(object):
class LstmRNN():
def __init__(self, sess, stock_count,
lstm_size=128,
num_layers=1,
@@ -87,7 +87,7 @@ def _create_one_cell():
tf.random_uniform([self.stock_count, self.embed_size], -1.0, 1.0),
name="embed_matrix"
)

# stock_label_embeds.shape = (batch_size, embedding_size)
stacked_symbols = tf.tile(self.symbols, [1, self.num_steps], name='stacked_stock_labels')
stacked_embeds = tf.nn.embedding_lookup(self.embed_matrix, stacked_symbols)
@@ -100,8 +100,8 @@ def _create_one_cell():
self.inputs_with_embed = tf.identity(self.inputs)
self.embed_matrix_summ = None

print "inputs.shape:", self.inputs.shape
print "inputs_with_embed.shape:", self.inputs_with_embed.shape
print("inputs.shape:", self.inputs.shape)
print("inputs_with_embed.shape:", self.inputs_with_embed.shape)

# Run dynamic RNN
val, state_ = tf.nn.dynamic_rnn(cell, self.inputs_with_embed, dtype=tf.float32, scope="dynamic_rnn")
@@ -180,9 +180,9 @@ def train(self, dataset_list, config):
merged_test_y = np.array(merged_test_y)
merged_test_labels = np.array(merged_test_labels)

print "len(merged_test_X) =", len(merged_test_X)
print "len(merged_test_y) =", len(merged_test_y)
print "len(merged_test_labels) =", len(merged_test_labels)
print("len(merged_test_X) =", len(merged_test_X))
print("len(merged_test_y) =", len(merged_test_y))
print("len(merged_test_labels) =", len(merged_test_labels))

test_data_feed = {
self.learning_rate: 0.0,
@@ -198,18 +198,18 @@ def train(self, dataset_list, config):
random.seed(time.time())

# Select samples for plotting.
sample_labels = range(min(config.sample_size, len(dataset_list)))
sample_labels = list(range(min(config.sample_size, len(dataset_list))))
sample_indices = {}
for l in sample_labels:
sym = dataset_list[l].stock_sym
target_indices = np.array([
i for i, sym_label in enumerate(merged_test_labels)
if sym_label[0] == l])
sample_indices[sym] = target_indices
print sample_indices
print(sample_indices)

print "Start training for stocks:", [d.stock_sym for d in dataset_list]
for epoch in xrange(config.max_epoch):
print("Start training for stocks:", [d.stock_sym for d in dataset_list])
for epoch in range(config.max_epoch):
epoch_step = 0
learning_rate = config.init_learning_rate * (
config.learning_rate_decay ** max(float(epoch + 1 - config.init_epoch), 0.0)
@@ -234,11 +234,11 @@ def train(self, dataset_list, config):
if np.mod(global_step, len(dataset_list) * 200 / config.input_size) == 1:
test_loss, test_pred = self.sess.run([self.loss_test, self.pred], test_data_feed)

print "Step:%d [Epoch:%d] [Learning rate: %.6f] train_loss:%.6f test_loss:%.6f" % (
global_step, epoch, learning_rate, train_loss, test_loss)
print("Step:%d [Epoch:%d] [Learning rate: %.6f] train_loss:%.6f test_loss:%.6f" % (
global_step, epoch, learning_rate, train_loss, test_loss))

# Plot samples
for sample_sym, indices in sample_indices.iteritems():
for sample_sym, indices in sample_indices.items():
image_path = os.path.join(self.model_plots_dir, "{}_epoch{:02d}_step{:04d}.png".format(
sample_sym, epoch, epoch_step))
sample_preds = test_pred[indices]
@@ -292,7 +292,7 @@ def load(self):
ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
self.saver.restore(self.sess, os.path.join(self.model_logs_dir, ckpt_name))
counter = int(next(re.finditer("(\d+)(?!.*\d)", ckpt_name)).group(0))
print(" [*] Success to read {}".format(ckpt_name))
print((" [*] Success to read {}".format(ckpt_name)))
return True, counter

else:
@@ -305,7 +305,7 @@ def _flatten(seq):

truths = _flatten(targets)[-200:]
preds = (_flatten(preds) * multiplier)[-200:]
days = range(len(truths))[-200:]
days = list(range(len(truths)))[-200:]

plt.figure(figsize=(12, 6))
plt.plot(days, truths, label='truth')
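A compact reminder of the Python 2-to-3 idioms this file's changes rely on (illustrative snippet, not project code):

sample_indices = {"AAPL": [0, 1, 2]}

# dict.iteritems() is gone; items() returns a lazy view in Python 3.
for sym, indices in sample_indices.items():
    print(sym, indices)            # print is a function, not a statement

# xrange() is gone; range() is already lazy.
for epoch in range(3):
    pass

# range() is not a list, so materialize it where list behaviour is needed,
# e.g. before random.shuffle() or when a plain list is expected downstream.
days = list(range(10))[-5:]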
7 changes: 7 additions & 0 deletions requirements.txt
@@ -0,0 +1,7 @@
BeautifulSoup4
numpy>=1.13.1
pandas>=0.16.2
scikit-learn>=0.16.1
scipy>=0.19.1
tensorflow>=1.2.1
urllib3>=1.8
4 changes: 2 additions & 2 deletions scripts/config.py
@@ -13,7 +13,7 @@ class RNNConfig():

def to_dict(self):
dct = self.__class__.__dict__
return {k: v for k, v in dct.iteritems() if not k.startswith('__') and not callable(v)}
return {k: v for k, v in dct.items() if not k.startswith('__') and not callable(v)}

def __str__(self):
return str(self.to_dict())
@@ -23,7 +23,7 @@ def __repr__(self):


DEFAULT_CONFIG = RNNConfig()
print "Default configuration:", DEFAULT_CONFIG.to_dict()
print("Default configuration:", DEFAULT_CONFIG.to_dict())

DATA_DIR = "data"
LOG_DIR = "logs"
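For context, a toy version of the to_dict() pattern after the items() change (illustrative class, not the project's real RNNConfig):

class ToyConfig():
    lstm_size = 128
    num_layers = 1

    def to_dict(self):
        dct = self.__class__.__dict__
        return {k: v for k, v in dct.items()
                if not k.startswith('__') and not callable(v)}

print(ToyConfig().to_dict())  # {'lstm_size': 128, 'num_layers': 1}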
2 changes: 1 addition & 1 deletion scripts/restore_model.py
@@ -4,7 +4,7 @@
import os
import tensorflow as tf

from config import MODEL_DIR
from scripts.config import MODEL_DIR


def prediction_by_trained_graph(graph_name, max_epoch, test_X, test_y):