-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_training.py
executable file
·49 lines (31 loc) · 1.27 KB
/
process_training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python
import os, sys
import pymysql as mdb
import numpy as np
# Build per-repository event histograms from the GitWatch `event` table.
#
# For every "repoid,event_count" row in data/training_repos.csv, the events of
# that repository are fetched, counted into a flat array of NUM_SLOTS counters
# (SLOTS_PER_MONTH counters per calendar month, months laid out consecutively
# starting at start_month), and appended to data/application_set.csv as
# "repoid,c0,c1,...,c59".

skip_lines = 0   # Number of leading lines of the input CSV to skip.
start_month = 8  # Use Aug, Sep to predict Oct

SLOTS_PER_MONTH = 20  # Stride between consecutive months in an output row.
NUM_SLOTS = 60        # Total counters per output row (3 * SLOTS_PER_MONTH).


def _bin_events(rows):
    """Count (type, timestamp) rows into a flat histogram of NUM_SLOTS ints.

    A row is counted in slot ``type + SLOTS_PER_MONTH * (month - start_month)``.
    Rows whose slot falls outside ``[0, NUM_SLOTS)`` are ignored.
    """
    counts = np.zeros(NUM_SLOTS, int)
    for event_type, stamp in rows:
        slot = event_type + SLOTS_PER_MONTH * (stamp.month - start_month)
        # NOTE(review): only the combined slot is range-checked. A type value
        # outside 0..SLOTS_PER_MONTH-1 would be counted in a neighbouring
        # month's slots -- confirm the value range of event.type.
        if 0 <= slot < NUM_SLOTS:
            counts[slot] += 1
    return counts


db = mdb.connect(user="hebda", host="localhost", db="GitWatch", charset='utf8', autocommit=True)
# Explicit try/finally instead of `with db:` followed by db.close(): the
# meaning of the connection's context manager differs between PyMySQL
# releases (newer ones close the connection on exit, which makes a second
# close() raise). This form closes exactly once, also when an error occurs.
try:
    cur = db.cursor()
    try:
        # This file is populated by:
        # SELECT id,COUNT(*) AS num FROM event GROUP BY id;
        with open('data/training_repos.csv', 'r') as f_in:
            for line_no, line in enumerate(f_in):
                if line_no < skip_lines:
                    continue
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue

                # int() ignores surrounding whitespace, including the newline.
                fields = line.split(',')
                repoid = int(fields[0])
                events = int(fields[1])

                # Single parenthesised argument: prints identically under
                # Python 2 (statement) and Python 3 (function).
                print("Processing repoid %d (%d events)" % (repoid, events))

                # Let the driver substitute the id rather than formatting it
                # into the SQL text.
                cur.execute(
                    'SELECT type,timestamp FROM event WHERE id=%s '
                    'and timestamp<"2015-10-01" and timestamp>"2015-06-01"',
                    (repoid,))
                event_info = _bin_events(cur.fetchall())

                # Append one row per repository; the file is closed again
                # right away, so each finished row is written out before the
                # next query starts.
                with open('data/application_set.csv', 'a') as f_out:
                    f_out.write(str(repoid) + ',' + ','.join(['%d' % num for num in event_info]) + '\n')
    finally:
        cur.close()
finally:
    db.close()