Skip to content

Commit

Permalink
Merge branch 'davelester-autopep8'
Browse files Browse the repository at this point in the history
  • Loading branch information
sbenthall committed Oct 20, 2014
2 parents 1900857 + b02357e commit b271ae5
Show file tree
Hide file tree
Showing 11 changed files with 455 additions and 375 deletions.
77 changes: 45 additions & 32 deletions bigbang/archive.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from bigbang.parse import get_date
import datetime
import mailman
from . import mailman
import mailbox
import numpy as np
import pandas as pd
Expand All @@ -11,21 +11,26 @@ def load(path):
data = pd.read_csv(path)
return Archive(data)


class MissingDataException(Exception):

    """Raised when a mailing-list archive source yields no messages."""

    def __init__(self, value):
        # Pass the message up to Exception so e.args is populated as usual;
        # the original skipped this, leaving args empty.
        super(MissingDataException, self).__init__(value)
        # Kept for backward compatibility: callers read e.value directly.
        self.value = value

    def __str__(self):
        return repr(self.value)


class Archive:

"""
A representation of a mailing list archive.
"""

data = None
activity = None

def __init__(self, data,archive_dir="archives",single_file=False):
def __init__(self, data, archive_dir="archives", single_file=False):
"""
Initializes an Archive object.
Expand All @@ -41,49 +46,54 @@ def __init__(self, data,archive_dir="archives",single_file=False):
copy of the input DataFrame.
If data is a string, then it is interpreted as a path to either a
single .mbox file (if the optional argument single_file is True) or else
to a directory of .mbox files (also in .mbox format). Note that the
file extensions need not be .mbox; frequently they will be .txt.
single .mbox file (if the optional argument single_file is True) or
else to a directory of .mbox files (also in .mbox format). Note that
the file extensions need not be .mbox; frequently they will be .txt.
Upon initialization, the Archive object drops duplicate entries
and sorts its member variable *data* by Date.
"""
if type(data) is list:
if isinstance(data, list):
self.data = self.messages_to_dataframe(data)
elif type(data) is pd.core.frame.DataFrame:
elif isinstance(data, pd.core.frame.DataFrame):
self.data = data.copy()
elif type(data) is str:
elif isinstance(data, str):
messages = None

if single_file:
# treat string as the path to a file that is an mbox
box = mailbox.mbox(data, create=False)
messages = box.values()
else:
# assume string is the path to a directory with many
messages = mailman.open_list_archives(data,base_arc_dir=archive_dir)
# assume string is the path to a directory with many
messages = mailman.open_list_archives(
data,
base_arc_dir=archive_dir)

if len(messages) == 0:
raise MissingDataException("No messages in %s under %s. Did you run the collect_mail.py script?" % (archive_dir,data))
raise MissingDataException(
("No messages in %s under %s. Did you run the "
"collect_mail.py script?") %
(archive_dir, data))

self.data= self.messages_to_dataframe(messages)
self.data = self.messages_to_dataframe(messages)
self.data.drop_duplicates(inplace=True)

# Drops any entries with no Date field.
# It may be wiser to optionally
# It may be wiser to optionally
# do interpolation here.
self.data.dropna(subset=['Date'],inplace=True)
self.data.dropna(subset=['Date'], inplace=True)

self.data.sort(columns='Date',inplace=True)
self.data.sort(columns='Date', inplace=True)

# turn a list of parsed messages into
# a dataframe of message data, indexed
# by message-id, with column-names from
# headers
def messages_to_dataframe(self,messages):
def messages_to_dataframe(self, messages):
# extract data into a list of tuples -- records -- with
# the Message-ID separated out as an index
pm = [(m.get('Message-ID'),
# the Message-ID separated out as an index
pm = [(m.get('Message-ID'),
(m.get('From'),
m.get('Subject'),
get_date(m),
Expand All @@ -92,7 +102,7 @@ def messages_to_dataframe(self,messages):
m.get_payload()))
for m in messages if m.get('Message-ID')]

ids,records = zip(*pm)
ids, records = zip(*pm)

mdf = pd.DataFrame.from_records(list(records),
index=list(ids),
Expand All @@ -103,7 +113,7 @@ def messages_to_dataframe(self,messages):
'References',
'Body'])
mdf.index.name = 'Message-ID'

return mdf

def get_activity(self):
Expand All @@ -112,25 +122,28 @@ def get_activity(self):

return self.activity

def compute_activity(self, clean=True):
    """Return a DataFrame of per-sender daily message counts.

    Rows are ordinal dates (one per day, gap days filled with 0),
    columns are 'From' addresses, values are message counts.

    :param clean: if True, drop rows with no Date and rows dated in
        the future relative to now (apparently bogus timestamps).
    """
    mdf = self.data

    if clean:
        # unnecessary?  (data is already dropna'd on Date at load time)
        mdf = mdf.dropna(subset=['Date'])
        # drop messages apparently in the future
        mdf = mdf[mdf['Date'] < datetime.datetime.now(pytz.utc)]

    mdf2 = mdf[['From', 'Date']]
    # collapse timestamps to ordinal day numbers for daily bucketing
    mdf2['Date'] = mdf['Date'].apply(lambda x: x.toordinal())

    activity = mdf2.groupby(
        ['From', 'Date']).size().unstack('From').fillna(0)

    # reindex over the full day range so quiet days appear as zeros
    new_date_range = np.arange(mdf2['Date'].min(), mdf2['Date'].max())
    # activity.set_index('Date')

    activity = activity.reindex(new_date_range, fill_value=0)

    return activity

def save(self, path):
    """Write the archive's message DataFrame to *path* as CSV.

    Uses ',' as the field separator; the index (Message-ID) is
    written as the first column, so load() can round-trip it.
    """
    self.data.to_csv(path, ",")
78 changes: 41 additions & 37 deletions bigbang/graph.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import parse
from . import parse
import math
import numpy as np
import networkx as nx
import bigbang.process as process
#import matplotlib.pyplot as plt
# import matplotlib.pyplot as plt
from pprint import pprint as pp
from collections import Counter
import pandas
Expand All @@ -18,20 +18,21 @@ def messages_to_reply_graph(messages):

G.add_node(mid)
G.node[mid]['From'] = m.get('From')
#G.node[mid]['Date'] = m.get('Date')
#G.node[mid]['Message'] = m.get('Message')
# G.node[mid]['Date'] = m.get('Date')
# G.node[mid]['Message'] = m.get('Message')

# references should be recoverable from in-reply-to structure
#if 'References' in d:
# if 'References' in d:
# G.add_edge(mid,d['References'])

if 'In-Reply-To' in m:
G.add_edge(mid,parse.clean_mid(m.get('In-Reply-To')))
G.add_edge(mid, parse.clean_mid(m.get('In-Reply-To')))

return G

def messages_to_interaction_graph(messages,verbose=False):


def messages_to_interaction_graph(messages, verbose=False):

IG = nx.DiGraph()

from_dict = {}
Expand All @@ -48,20 +49,20 @@ def messages_to_interaction_graph(messages,verbose=False):
reply_counts[m_from] = reply_counts.get(m_from,{})
IG.add_node(m_from)
'''
if type(messages) is not pandas.core.frame.DataFrame:
if not isinstance(messages, pandas.core.frame.DataFrame):
df = process.messages_to_dataframe(messages)
else:
df = messages

for m in df.iterrows():
m_from = parse.clean_from(m[1]['From'])
from_dict[m[0]] = m_from
sender_counts[m_from] = sender_counts.get(m_from,0) + 1
sender_counts[m_from] = sender_counts.get(m_from, 0) + 1
# the necessity of this initialization step may be dubious
reply_counts[m_from] = reply_counts.get(m_from,{})
reply_counts[m_from] = reply_counts.get(m_from, {})
IG.add_node(m_from)

for sender,count in sender_counts.items():
for sender, count in sender_counts.items():
IG.node[sender]['sent'] = count

replies = [m for m in df.iterrows() if m[1]['In-Reply-To'] is not None]
Expand All @@ -72,56 +73,56 @@ def messages_to_interaction_graph(messages,verbose=False):

if reply_to_mid in from_dict:
m_to = from_dict[reply_to_mid]
reply_counts[m_from][m_to] = reply_counts[m_from].get(m_to,0) + 1
reply_counts[m_from][m_to] = reply_counts[m_from].get(m_to, 0) + 1
else:
if verbose:
print reply_to_mid + " not in archive"

for m_from, edges in reply_counts.items():
for m_to, count in edges.items():
IG.add_edge(m_from,m_to,weight=count)

IG.add_edge(m_from, m_to, weight=count)

return IG


# turn an interaction graph into a weighted edge matrix
def interaction_graph_to_matrix(dg):
    """Return the weighted adjacency matrix of directed graph *dg*.

    Entry [i, j] is the 'weight' attribute of the edge from node i to
    node j (node order as returned by dg.nodes()); absent edges are 0.

    NOTE(review): assumes dg.nodes() supports list.index() — true for
    networkx 1.x list returns; verify against newer NodeView APIs.
    """
    nodes = dg.nodes()

    n_nodes = len(nodes)

    # n x n where n is number of nodes
    matrix = np.zeros([n_nodes, n_nodes])

    for m_from, m_to, data in dg.edges(data=True):
        i = nodes.index(m_from)
        j = nodes.index(m_to)

        matrix[i, j] = data['weight']

    return matrix


# Ulanowicz ecosystem health measures
# input is weighted adjacency matrix
def ascendancy(am):
    """Return the Ulanowicz ascendancy of weighted adjacency matrix *am*.

    A = sum_ij t_ij * log(t_ij * T / (T_i. * T_.j)) where T is total
    system throughput, T_i. row sums and T_.j column sums; terms with
    zero flow contribute 0 (via nan_to_num).
    """
    # total system throughput
    tst = np.sum(am)

    # should these be normalized?!?!
    # output rates: column sums broadcast across rows
    s0 = np.tile(np.sum(am, 0).T, (am.shape[0], 1))
    # input rates: row sums broadcast across columns
    s1 = np.tile(np.sum(am, 1).T, (am.shape[1], 1)).T

    # nan_to_num zeroes the 0/0 terms so they drop out of the sum below
    logs = np.nan_to_num(np.log(am * np.sum(am) / (s0 * s1)))

    # ascendancy!
    A = np.sum(am * logs)

    return A


def capacity(am):
# total system throughput
tst = np.sum(am)
Expand All @@ -130,44 +131,47 @@ def capacity(am):

return - np.sum(am * logs)


def overhead(am):
    """Return system overhead: capacity minus ascendancy of matrix *am*."""
    # could be more efficient... (capacity and ascendancy both re-sum am)
    return capacity(am) - ascendancy(am)


""" copied from process.py may have bugs"""


def compute_ascendancy(messages, duration=50):
    """Compute rolling ascendancy and capacity series over *messages*.

    Messages are bucketed by ordinal day; for each day in the observed
    span, ascendancy and capacity are computed over a sliding window of
    *duration* days starting at that day.

    Returns (ascendancy, capacity): two numpy arrays indexed by day
    offset from the earliest dated message.

    NOTE(review): relies on module-level get_date, pytz, and a `graph`
    module providing messages_to_interaction_graph etc. — confirm
    imports in the enclosing file.
    """
    print('compute ascendancy')
    dated_messages = {}

    for m in messages:
        d = get_date(m)

        # skip undated messages and messages apparently from the future
        if d is not None and d < datetime.datetime.now(pytz.utc):
            o = d.toordinal()
            dated_messages[o] = dated_messages.get(o, [])
            dated_messages[o].append(m)

    days = [k for k in dated_messages.keys()]
    day_offset = min(days)
    epoch = max(days) - min(days)

    ascendancy = np.zeros([max(days) - min(days) + 1])
    capacity = np.zeros(([max(days) - min(days) + 1]))

    for i in range(epoch):
        min_d = min(days) + i
        max_d = min_d + duration

        # gather the window's messages (empty days contribute nothing)
        block_messages = []

        for d in range(min_d, max_d):
            block_messages.extend(dated_messages.get(d, []))

        b_IG = graph.messages_to_interaction_graph(block_messages)
        b_matrix = graph.interaction_graph_to_matrix(b_IG)

        ascendancy[min_d - day_offset] = graph.ascendancy(b_matrix)
        capacity[min_d - day_offset] = graph.capacity(b_matrix)

    return ascendancy, capacity
Loading

0 comments on commit b271ae5

Please sign in to comment.