-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
84 lines (71 loc) · 2.61 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding: utf-8 -*-
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
            (stemmed tokens with stop words and punctuation removed)
    """
    stemmer = PorterStemmer()
    # Use a set so each membership test inside the token loop is O(1)
    # instead of a linear scan over the stop-word list.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the text)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE(review): `.*` is greedy, so everything from the URL up to the end
    # of that line is removed, not just the URL itself. Kept as-is because
    # downstream results may depend on it -- confirm before tightening.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash sign is dropped, the word itself stays
    tweet = re.sub(r'#', '', tweet)

    # tokenize the cleaned text with the tweet-aware tokenizer
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `string.punctuation` is a str, so the second test is a substring check:
    # single punctuation characters (and runs such as "()") are dropped.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]
    return tweets_clean
def get_dict(file_name):
    """
    Return the English to French dictionary stored in a space-delimited file.

    The file is read with ``pandas.read_csv`` using a single space as the
    delimiter. The first column supplies the keys (English words) and the
    second column supplies the values (French words). If a key occurs more
    than once, the last occurrence wins.

    NOTE(review): no ``header`` argument is passed, so pandas treats the first
    line of the file as column names and that pair is NOT included in the
    result -- confirm this is intended for the data files in use.

    Input:
        file_name: path (or file-like object) accepted by ``pandas.read_csv``
    Output:
        etof: dict mapping each first-column entry to its second-column entry
    """
    my_file = pd.read_csv(file_name, delimiter=' ')
    # Select the columns by position explicitly; integer `[]` lookup on a
    # label-indexed row is deprecated in recent pandas releases.
    english = my_file.iloc[:, 0]
    french = my_file.iloc[:, 1]
    etof = dict(zip(english, french))  # the english to french dictionary
    return etof
def cosine_similarity(A, B):
    """Compute the cosine similarity between A and B.

    Input:
        A: a word vector (1-D array-like), or a matrix (2-D array-like)
           whose rows are word vectors
        B: a word vector (array-like)
    Output:
        cos: the cosine similarity between A and B. When A is a matrix the
             result holds one similarity per row of A.

    If A is a single vector and either A or B has zero norm, 0.0 is returned
    rather than dividing by zero; this matches the matrix branch, where the
    epsilon term makes zero-norm rows evaluate to 0.
    """
    # Accept plain lists/tuples as well as numpy arrays.
    A = np.asarray(A)
    B = np.asarray(B)

    dot = np.dot(A, B)
    normb = np.linalg.norm(B)

    if A.ndim == 1:  # A is just a vector: a single norm is enough
        denominator = np.linalg.norm(A) * normb
        if denominator == 0:
            # Similarity with a zero vector is undefined; report 0.0
            # instead of producing nan and a RuntimeWarning.
            return 0.0
        return dot / denominator

    # A is a matrix: take the norm of each row (each word vector).
    norma = np.linalg.norm(A, axis=1)
    epsilon = 1.0e-9  # to avoid division by 0
    return dot / (norma * normb + epsilon)