-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
119 lines (99 loc) · 3.77 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import requests
import json
from nltk.stem import SnowballStemmer
from enum import Enum
from typing import List, Dict
from snowballstemmer import stemmer
from transformers import XLMRobertaForMaskedLM, XLMRobertaTokenizer, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
class Language(Enum):
    """Languages handled by the lexicon pipeline.

    Member names are the capitalised language names; member values are the
    lowercase names used as keys of the dictionary built by ``to_dict`` and
    passed to the stemmer constructors.
    """

    English = "english"
    Spanish = "spanish"
    French = "french"
    Greek = "greek"
    Croatian = "croatian"
    Catalan = "catalan"
    Serbian = "serbian"

    @classmethod
    def to_dict(cls):
        """Build the per-language settings dictionary.

        Returns:
            A dict keyed by each member's value. Every entry is a dict with:
            ``'stem'`` (value of the same-named ``Stem_Language`` member),
            ``'NRC'`` (value of the same-named ``NRC_Language`` member) and
            ``'stemmer'`` (a stemmer object when stemming is enabled,
            otherwise ``None``).

        Raises:
            Exception: If stemming is enabled for a language that is neither
                listed in ``SnowballStemmer.languages`` nor Greek.
        """
        settings_by_language = {}
        for member in cls:
            use_stemming = Stem_Language[member.name].value
            nrc_column = NRC_Language[member.name].value

            stemmer_obj = None
            if use_stemming:
                if member.value in SnowballStemmer.languages:
                    # NLTK's Snowball stemmer supports this language directly.
                    stemmer_obj = SnowballStemmer(member.value)
                elif member.name == "Greek":
                    # Greek falls back to the snowballstemmer package.
                    stemmer_obj = stemmer("greek")
                else:
                    raise Exception("Language Doesnt Have Snowball Stemmer")

            settings_by_language[member.value] = {
                'stem': use_stemming,
                'NRC': nrc_column,
                'stemmer': stemmer_obj,
            }
        return settings_by_language

    @classmethod
    def has_value(cls, value):
        """Return True if ``value`` equals the value of some member."""
        for member in cls:
            if value == member.value:
                return True
        return False

    @classmethod
    def has_key(cls, value):
        """Return True if ``value`` equals the name of some member."""
        for member in cls:
            if value == member.name:
                return True
        return False
class NRC_Language(Enum):
    """Header names of the language columns in the NRC lexicon file.

    Member names mirror those of ``Language``; each value is the text that is
    matched against the header row of the lexicon file.
    """

    English = "English Word"
    Spanish = "Spanish"
    French = "French"
    Greek = "Greek"
    Croatian = "Croatian"
    Catalan = "Catalan"
    Serbian = "Serbian"

    @classmethod
    def to_list(cls) -> List[str]:
        """Return the values of all members, in definition order."""
        column_names = []
        for member in cls:
            column_names.append(member.value)
        return column_names
class Emotions(Enum):
    """Emotion and sentiment category labels."""

    ANGER = "anger"
    ANTICIPATION = "anticipation"
    DISGUST = "disgust"
    FEAR = "fear"
    JOY = "joy"
    NEGATIVE = "negative"
    POSITIVE = "positive"
    SADNESS = "sadness"
    SURPRISE = "surprise"
    TRUST = "trust"

    @classmethod
    def to_list(cls) -> List[str]:
        """Return the values of all members, in definition order."""
        labels = []
        for member in cls:
            labels.append(member.value)
        return labels
def load_lexicon(path, language_dict, output_path="data/emolex.json"):
    """Convert a tab-separated lexicon file into a word -> emotion-vector JSON file.

    The first line of ``path`` is treated as the header. Every header cell
    equal to a value of ``NRC_Language`` marks a word column. For each
    following row that has as many cells as the header, cells 1-10 are parsed
    as integers to form the emotion vector, and that vector is stored under
    the (optionally stemmed) word of every selected language column. Rows
    with a different cell count are skipped. When the same key is produced
    more than once, the last occurrence wins.

    Args:
        path: Path of the UTF-8 encoded, tab-separated lexicon file.
        language_dict: Mapping keyed by ``Language`` value (the shape built
            by ``Language.to_dict()``); each entry must provide ``"stem"``
            and, when that is truthy, a ``"stemmer"`` object.
        output_path: Path of the JSON file to write.

    Raises:
        KeyError: If ``language_dict`` has no entry for a language whose
            column is present in the header.
        ValueError: If one of cells 1-10 of an accepted row is not an integer.
    """
    nrc_columns = set(NRC_Language.to_list())
    emolex = {}

    with open(path, "r", encoding="utf-8") as f:
        header = f.readline().strip().split('\t')

        # Resolve each language column to its stemming callable once, instead
        # of repeating the enum and dict lookups for every row of the file.
        # ``None`` means "store the word unchanged".
        column_stemmers = {}
        for column, column_name in enumerate(header):
            if column_name not in nrc_columns:
                continue
            lang = Language[NRC_Language(column_name).name].value
            settings = language_dict[lang]
            if not settings["stem"]:
                column_stemmers[column] = None
            elif lang == "greek":
                # The Greek stemmer comes from a different package and
                # exposes ``stemWord`` rather than ``stem``.
                column_stemmers[column] = settings["stemmer"].stemWord
            else:
                column_stemmers[column] = settings["stemmer"].stem

        # Stream the file line by line rather than loading it whole.
        for line in f:
            values = line.strip().split('\t')
            if len(values) != len(header):
                continue  # malformed or blank row
            emotion_vector = [int(values[i]) for i in range(1, 11)]
            for column, stem in column_stemmers.items():
                word = values[column]
                if stem is not None:
                    word = stem(word)
                emolex[word] = emotion_vector

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(emolex, f, ensure_ascii=False, indent=4)
    print(f"EmoLex dictionary saved to {output_path}")
# Manually set, per language, whether lexicon words are stemmed.
class Stem_Language(Enum):
    """Per-language stemming switch, looked up by member name.

    ``Language.to_dict`` reads ``Stem_Language[<name>].value`` for every
    ``Language`` member, so the member names here must match those of
    ``Language``.

    NOTE: Enum treats members that share a value as aliases of the first
    member defined with that value, so this class has only two canonical
    members (``English`` for False and ``French`` for True). Lookup by name,
    e.g. ``Stem_Language["Greek"].value``, still returns the intended boolean,
    but iterating over the class yields only the two canonical members.
    """

    English = False
    Spanish = False
    French = True
    Greek = True
    Croatian = False
    Catalan = False
    Serbian = False
if __name__ == "__main__":
    # Build the per-language settings, then convert the raw lexicon to JSON.
    load_lexicon(
        "data/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-ForVariousLanguages.txt",
        Language.to_dict(),
        output_path="data/emolex.json",
    )