-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhelpers.py
149 lines (117 loc) · 5.22 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import csv
import numpy as np
import glove_module as GV
def create_corpus(inputfiles):
    """Read every line of the given files into one flat corpus.

    The files are opened in binary mode, so each corpus entry is a ``bytes``
    object with its trailing newline removed.

    Input:
        inputfiles: A list of filenames: ["filename1.txt", "filename2.txt"].
            A single file must still be wrapped in a list (["filename1.txt"]),
            not passed as the bare string "filename1.txt", because the
            argument is iterated element by element.
    Output:
        corpus: A list containing the lines of all files, in the order the
            files were given: filename1, filename2, etc.
        file_lengths: A list with the number of lines read from each file,
            in the same order as inputfiles.
    """
    file_lengths = []
    corpus = []
    for filename in inputfiles:
        lines_before = len(corpus)
        with open(filename, 'rb') as infile:
            for line in infile:
                # Strip only a real line terminator: a final line that lacks
                # a trailing newline must not lose its last character.
                corpus.append(line[:-1] if line.endswith(b'\n') else line)
        # Lines contributed by this file = growth of the corpus while reading it.
        file_lengths.append(len(corpus) - lines_before)
    return corpus, file_lengths
def create_labels(nr_pos_lines, nr_neg_lines, kaggle=False):
    """Build the label vector for positive tweets followed by negative tweets.

    Input:
        nr_pos_lines: Number of tweets from the positive training data.
        nr_neg_lines: Number of tweets from the negative training data.
        kaggle: If true, negative tweets are labelled -1 instead of 0.
    Output:
        labels: A float array of length nr_pos_lines + nr_neg_lines whose
            first nr_pos_lines entries are 1 and whose remaining entries are
            0 (or -1 when kaggle is true).
    """
    total = nr_pos_lines + nr_neg_lines
    negative_label = -1 if kaggle else 0

    labels = np.zeros(total)
    labels[:nr_pos_lines] = 1
    labels[nr_pos_lines:total] = negative_label
    return labels
def shuffle_data(X, Y):
    """Shuffle X and Y in place using the same fixed seed, then return them.

    NumPy's global random generator is re-seeded with 1337 before each
    shuffle, so that X and Y (when they have the same length) are reordered
    in the same way and samples stay aligned with their labels.

    Input:
        X: Data to shuffle along its first axis (modified in place).
        Y: Labels to shuffle along their first axis (modified in place).
    Output:
        X, Y: The same two objects, now shuffled.
    """
    for data in (X, Y):
        # Re-seed before every shuffle so each one draws the same permutation.
        np.random.seed(1337)
        np.random.shuffle(data)
    return X, Y
def split_data(X, Y, split=0.8):
    """Split X and Y into a leading training part and a trailing validation part.

    Input:
        X: Data with a ``shape`` attribute; its first dimension is the number
            of samples.
        Y: Labels, sliced at the same position as X.
        split: Fraction of the samples that goes into the training part.
    Output:
        train_x, val_x, train_y, val_y: The first int(X.shape[0] * split)
            entries of X and Y form the training part, the rest the
            validation part.
    """
    cut = int(X.shape[0] * split)
    train_x, train_y = X[:cut], Y[:cut]
    val_x, val_y = X[cut:], Y[cut:]
    return train_x, val_x, train_y, val_y
def create_csv_submission(ids, y_pred, name):
    """Create an output file in csv format for submission to kaggle.

    TAKEN FROM ML COURSE: https://github.com/epfml/ML_course/blob/master/projects/project1/scripts/proj1_helpers.py

    The file gets an "Id,Prediction" header followed by one row per
    (id, prediction) pair; both values are converted to int before writing.
    Pairs are formed with zip, so extra entries in the longer of the two
    sequences are ignored.

    Arguments:
        ids: Event ids associated with each prediction.
        y_pred: Predicted class labels.
        name: String name of the .csv output file to be created.
    """
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate '\n' on write (Windows) emit '\r\r\n' rows.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for event_id, prediction in zip(ids, y_pred):
            writer.writerow({'Id': int(event_id), 'Prediction': int(prediction)})
def get_corpus(full=False, test=False, inputfiles=None):
    """Create a corpus and its associated tweet counts.

    Input:
        full: If true (and test is false), the corpus is built from
            "train_pos_full.txt", "train_neg_full.txt" and "test_data.txt".
        test: If true, the corpus is built from "train_pos.txt",
            "train_neg.txt" and "test_data.txt". Takes precedence over full.
        inputfiles: Used only when test=False and full=False; the corpus is
            then built from the given files. Must be given as a list:
            inputfiles=['filename.txt', 'filename2.txt'] or ['file.txt'].
    Outputs:
        full_corpus: A corpus made of the desired files, in the given order.
        nr_pos_tweets: If test or full is set, or inputfiles holds exactly 3
            files, the number of tweets in the first file (which for test and
            full is the positive labeled tweets). Otherwise it is 0.
        nr_neg_tweets: If test or full is set, or inputfiles holds exactly 3
            files, the number of tweets in the second file (which for test
            and full is the negative labeled tweets). Otherwise it is 0.
        total_training_tweets: The sum of the two above.
    Raises:
        TypeError: If neither test nor full is set and inputfiles is None.
    """
    training_set_pos = "train_pos.txt"
    training_set_neg = "train_neg.txt"
    training_set_pos_full = "train_pos_full.txt"
    training_set_neg_full = "train_neg_full.txt"
    test_set = "test_data.txt"

    if test:
        inputfiles = [training_set_pos, training_set_neg, test_set]
    elif full:
        inputfiles = [training_set_pos_full, training_set_neg_full, test_set]
    elif inputfiles is None:
        # Previously this surfaced as an opaque TypeError from len(None);
        # keep the exception type but explain what is missing.
        raise TypeError(
            "inputfiles must be a list of filenames when neither test nor full is set"
        )

    # Build the corpus for every selection; it used to be created only in the
    # 3-file case, leaving full_corpus unbound at the return otherwise.
    full_corpus, file_lengths = create_corpus(inputfiles)

    if len(inputfiles) == 3:
        # Layout convention: [positive training, negative training, test].
        nr_pos_tweets = file_lengths[0]
        nr_neg_tweets = file_lengths[1]
        total_training_tweets = nr_pos_tweets + nr_neg_tweets
    else:
        nr_pos_tweets = 0
        nr_neg_tweets = 0
        total_training_tweets = 0

    return full_corpus, nr_pos_tweets, nr_neg_tweets, total_training_tweets
def get_global_vectors(dimension):
    """Load the pre-trained GloVe embeddings of the requested dimension.

    Input:
        dimension: The desired dimension of the returned global vectors.
            Can be 25, 50, 100 or 200.
    Output:
        global_vectors: The result of GV.create_glove_model for the matching
            embeddings file (presumably a gensim model object; see
            glove_module). For any other dimension a message is printed and
            -1 is returned.
    """
    # Supported dimensions paired with their embedding files, checked in order.
    embedding_files = (
        (50, "gensim_global_vectors_50dim.txt"),
        (100, "gensim_global_vectors_100dim.txt"),
        (200, "gensim_global_vectors_200dim.txt"),
        (25, "gensim_global_vectors_25dim.txt"),
    )
    for size, filename in embedding_files:
        if dimension == size:
            return GV.create_glove_model(filename)

    print('not valid dimension')
    return -1