Skip to content

Commit

Permalink
Merge branch 'davelester-autopep8'
Browse files Browse the repository at this point in the history
  • Loading branch information
sbenthall committed Oct 20, 2014
2 parents 1900857 + b02357e commit b271ae5
Show file tree
Hide file tree
Showing 11 changed files with 455 additions and 375 deletions.
77 changes: 45 additions & 32 deletions bigbang/archive.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from bigbang.parse import get_date
import datetime
import mailman
from . import mailman
import mailbox
import numpy as np
import pandas as pd
Expand All @@ -11,21 +11,26 @@ def load(path):
data = pd.read_csv(path)
return Archive(data)


class MissingDataException(Exception):

    """Raised when a mailing-list archive source yields no messages."""

    def __init__(self, value):
        # Pass the message up to Exception so e.args is populated as usual;
        # the original skipped this, leaving args empty.
        super(MissingDataException, self).__init__(value)
        # Kept for backward compatibility: callers read e.value directly.
        self.value = value

    def __str__(self):
        return repr(self.value)


class Archive:

"""
A representation of a mailing list archive.
"""

data = None
activity = None

def __init__(self, data,archive_dir="archives",single_file=False):
def __init__(self, data, archive_dir="archives", single_file=False):
"""
Initializes an Archive object.
Expand All @@ -41,49 +46,54 @@ def __init__(self, data,archive_dir="archives",single_file=False):
copy of the input DataFrame.
If data is a string, then it is interpreted as a path to either a
single .mbox file (if the optional argument single_file is True) or else
to a directory of .mbox files (also in .mbox format). Note that the
file extensions need not be .mbox; frequently they will be .txt.
single .mbox file (if the optional argument single_file is True) or
else to a directory of .mbox files (also in .mbox format). Note that
the file extensions need not be .mbox; frequently they will be .txt.
Upon initialization, the Archive object drops duplicate entries
and sorts its member variable *data* by Date.
"""
if type(data) is list:
if isinstance(data, list):
self.data = self.messages_to_dataframe(data)
elif type(data) is pd.core.frame.DataFrame:
elif isinstance(data, pd.core.frame.DataFrame):
self.data = data.copy()
elif type(data) is str:
elif isinstance(data, str):
messages = None

if single_file:
# treat string as the path to a file that is an mbox
box = mailbox.mbox(data, create=False)
messages = box.values()
else:
# assume string is the path to a directory with many
messages = mailman.open_list_archives(data,base_arc_dir=archive_dir)
# assume string is the path to a directory with many
messages = mailman.open_list_archives(
data,
base_arc_dir=archive_dir)

if len(messages) == 0:
raise MissingDataException("No messages in %s under %s. Did you run the collect_mail.py script?" % (archive_dir,data))
raise MissingDataException(
("No messages in %s under %s. Did you run the "
"collect_mail.py script?") %
(archive_dir, data))

self.data= self.messages_to_dataframe(messages)
self.data = self.messages_to_dataframe(messages)
self.data.drop_duplicates(inplace=True)

# Drops any entries with no Date field.
# It may be wiser to optionally
# It may be wiser to optionally
# do interpolation here.
self.data.dropna(subset=['Date'],inplace=True)
self.data.dropna(subset=['Date'], inplace=True)

self.data.sort(columns='Date',inplace=True)
self.data.sort(columns='Date', inplace=True)

# turn a list of parsed messages into
# a dataframe of message data, indexed
# by message-id, with column-names from
# headers
def messages_to_dataframe(self,messages):
def messages_to_dataframe(self, messages):
# extract data into a list of tuples -- records -- with
# the Message-ID separated out as an index
pm = [(m.get('Message-ID'),
# the Message-ID separated out as an index
pm = [(m.get('Message-ID'),
(m.get('From'),
m.get('Subject'),
get_date(m),
Expand All @@ -92,7 +102,7 @@ def messages_to_dataframe(self,messages):
m.get_payload()))
for m in messages if m.get('Message-ID')]

ids,records = zip(*pm)
ids, records = zip(*pm)

mdf = pd.DataFrame.from_records(list(records),
index=list(ids),
Expand All @@ -103,7 +113,7 @@ def messages_to_dataframe(self,messages):
'References',
'Body'])
mdf.index.name = 'Message-ID'

return mdf

def get_activity(self):
Expand All @@ -112,25 +122,28 @@ def get_activity(self):

return self.activity

def compute_activity(self, clean=True):
    """Return a DataFrame of per-sender daily message counts.

    Rows are ordinal dates (one per day, gap days filled with 0),
    columns are 'From' addresses, values are message counts.

    :param clean: if True, drop rows with no Date and rows dated in
        the future relative to now (apparently bogus timestamps).
    """
    mdf = self.data

    if clean:
        # unnecessary?  (data is already dropna'd on Date at load time)
        mdf = mdf.dropna(subset=['Date'])
        # drop messages apparently in the future
        mdf = mdf[mdf['Date'] < datetime.datetime.now(pytz.utc)]

    mdf2 = mdf[['From', 'Date']]
    # collapse timestamps to ordinal day numbers for daily bucketing
    mdf2['Date'] = mdf['Date'].apply(lambda x: x.toordinal())

    activity = mdf2.groupby(
        ['From', 'Date']).size().unstack('From').fillna(0)

    # reindex over the full day range so quiet days appear as zeros
    new_date_range = np.arange(mdf2['Date'].min(), mdf2['Date'].max())
    # activity.set_index('Date')

    activity = activity.reindex(new_date_range, fill_value=0)

    return activity

def save(self, path):
    """Write the archive's message DataFrame to *path* as CSV.

    Uses ',' as the field separator; the index (Message-ID) is
    written as the first column, so load() can round-trip it.
    """
    self.data.to_csv(path, ",")
78 changes: 41 additions & 37 deletions bigbang/graph.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import parse
from . import parse
import math
import numpy as np
import networkx as nx
import bigbang.process as process
#import matplotlib.pyplot as plt
# import matplotlib.pyplot as plt
from pprint import pprint as pp
from collections import Counter
import pandas
Expand All @@ -18,20 +18,21 @@ def messages_to_reply_graph(messages):

G.add_node(mid)
G.node[mid]['From'] = m.get('From')
#G.node[mid]['Date'] = m.get('Date')
#G.node[mid]['Message'] = m.get('Message')
# G.node[mid]['Date'] = m.get('Date')
# G.node[mid]['Message'] = m.get('Message')

# references should be recoverable from in-reply-to structure
#if 'References' in d:
# if 'References' in d:
# G.add_edge(mid,d['References'])

if 'In-Reply-To' in m:
G.add_edge(mid,parse.clean_mid(m.get('In-Reply-To')))
G.add_edge(mid, parse.clean_mid(m.get('In-Reply-To')))

return G

def messages_to_interaction_graph(messages,verbose=False):


def messages_to_interaction_graph(messages, verbose=False):

IG = nx.DiGraph()

from_dict = {}
Expand All @@ -48,20 +49,20 @@ def messages_to_interaction_graph(messages,verbose=False):
reply_counts[m_from] = reply_counts.get(m_from,{})
IG.add_node(m_from)
'''
if type(messages) is not pandas.core.frame.DataFrame:
if not isinstance(messages, pandas.core.frame.DataFrame):
df = process.messages_to_dataframe(messages)
else:
df = messages

for m in df.iterrows():
m_from = parse.clean_from(m[1]['From'])
from_dict[m[0]] = m_from
sender_counts[m_from] = sender_counts.get(m_from,0) + 1
sender_counts[m_from] = sender_counts.get(m_from, 0) + 1
# the necessity of this initialization step may be dubious
reply_counts[m_from] = reply_counts.get(m_from,{})
reply_counts[m_from] = reply_counts.get(m_from, {})
IG.add_node(m_from)

for sender,count in sender_counts.items():
for sender, count in sender_counts.items():
IG.node[sender]['sent'] = count

replies = [m for m in df.iterrows() if m[1]['In-Reply-To'] is not None]
Expand All @@ -72,56 +73,56 @@ def messages_to_interaction_graph(messages,verbose=False):

if reply_to_mid in from_dict:
m_to = from_dict[reply_to_mid]
reply_counts[m_from][m_to] = reply_counts[m_from].get(m_to,0) + 1
reply_counts[m_from][m_to] = reply_counts[m_from].get(m_to, 0) + 1
else:
if verbose:
print reply_to_mid + " not in archive"

for m_from, edges in reply_counts.items():
for m_to, count in edges.items():
IG.add_edge(m_from,m_to,weight=count)

IG.add_edge(m_from, m_to, weight=count)

return IG


# turn an interaction graph into a weighted edge matrix
def interaction_graph_to_matrix(dg):
    """Return the weighted adjacency matrix of directed graph *dg*.

    Entry [i, j] is the 'weight' attribute of the edge from node i to
    node j (node order as returned by dg.nodes()); absent edges are 0.

    NOTE(review): assumes dg.nodes() supports list.index() — true for
    networkx 1.x list returns; verify against newer NodeView APIs.
    """
    nodes = dg.nodes()

    n_nodes = len(nodes)

    # n x n where n is number of nodes
    matrix = np.zeros([n_nodes, n_nodes])

    for m_from, m_to, data in dg.edges(data=True):
        i = nodes.index(m_from)
        j = nodes.index(m_to)

        matrix[i, j] = data['weight']

    return matrix


# Ulanowicz ecosystem health measures
# input is weighted adjacency matrix
def ascendancy(am):
    """Return the Ulanowicz ascendancy of weighted adjacency matrix *am*.

    A = sum_ij t_ij * log(t_ij * T / (T_i. * T_.j)) where T is total
    system throughput, T_i. row sums and T_.j column sums; terms with
    zero flow contribute 0 (via nan_to_num).
    """
    # total system throughput
    tst = np.sum(am)

    # should these be normalized?!?!
    # output rates: column sums broadcast across rows
    s0 = np.tile(np.sum(am, 0).T, (am.shape[0], 1))
    # input rates: row sums broadcast across columns
    s1 = np.tile(np.sum(am, 1).T, (am.shape[1], 1)).T

    # nan_to_num zeroes the 0/0 terms so they drop out of the sum below
    logs = np.nan_to_num(np.log(am * np.sum(am) / (s0 * s1)))

    # ascendancy!
    A = np.sum(am * logs)

    return A


def capacity(am):
# total system throughput
tst = np.sum(am)
Expand All @@ -130,44 +131,47 @@ def capacity(am):

return - np.sum(am * logs)


def overhead(am):
    """Return system overhead: capacity minus ascendancy of matrix *am*."""
    # could be more efficient... (capacity and ascendancy both re-sum am)
    return capacity(am) - ascendancy(am)


""" copied from process.py may have bugs"""


def compute_ascendancy(messages, duration=50):
    """Compute rolling ascendancy and capacity series over *messages*.

    Messages are bucketed by ordinal day; for each day in the observed
    span, ascendancy and capacity are computed over a sliding window of
    *duration* days starting at that day.

    Returns (ascendancy, capacity): two numpy arrays indexed by day
    offset from the earliest dated message.

    NOTE(review): relies on module-level get_date, pytz, and a `graph`
    module providing messages_to_interaction_graph etc. — confirm
    imports in the enclosing file.
    """
    print('compute ascendancy')
    dated_messages = {}

    for m in messages:
        d = get_date(m)

        # skip undated messages and messages apparently from the future
        if d is not None and d < datetime.datetime.now(pytz.utc):
            o = d.toordinal()
            dated_messages[o] = dated_messages.get(o, [])
            dated_messages[o].append(m)

    days = [k for k in dated_messages.keys()]
    day_offset = min(days)
    epoch = max(days) - min(days)

    ascendancy = np.zeros([max(days) - min(days) + 1])
    capacity = np.zeros(([max(days) - min(days) + 1]))

    for i in range(epoch):
        min_d = min(days) + i
        max_d = min_d + duration

        # gather the window's messages (empty days contribute nothing)
        block_messages = []

        for d in range(min_d, max_d):
            block_messages.extend(dated_messages.get(d, []))

        b_IG = graph.messages_to_interaction_graph(block_messages)
        b_matrix = graph.interaction_graph_to_matrix(b_IG)

        ascendancy[min_d - day_offset] = graph.ascendancy(b_matrix)
        capacity[min_d - day_offset] = graph.capacity(b_matrix)

    return ascendancy, capacity
Loading

0 comments on commit b271ae5

Please sign in to comment.