EDA_sentiment_classificationModel.py

import os
import random

import spacy
from spacy.util import minibatch, compounding
from tqdm import tqdm


def save_model(output_dir, nlp, new_model_name):
    '''Save the trained model to the given output directory.'''
    output_dir = f'../working/{output_dir}'
    if output_dir is not None:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)


# pass model=nlp if you want to train on top of an existing model
def trainme(train_data, output_dir, n_iter=20, model=None):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline;
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add the label of every annotated span to the NER component
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,        # batch of texts
                           annotations,  # batch of annotations
                           drop=0.5,     # dropout - make it harder to memorise the data
                           losses=losses,
                           )
            print("Losses", losses)

    save_model(output_dir, nlp, 'st_ner')
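

# Usage note (illustrative, not part of the original script): passing any
# non-None value for `model` makes trainme() resume training instead of
# starting from a blank pipeline. In that case spacy.load(output_dir) is
# called directly, so `output_dir` must already contain a model saved
# earlier, e.g.
#
#   trainme(train_data, '../working/models/model_pos', n_iter=5, model='resume')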


def get_model_out_path(sentiment):
    '''Returns the model output path'''
    model_out_path = None
    if sentiment == 'positive':
        model_out_path = 'models/model_pos'
    elif sentiment == 'negative':
        model_out_path = 'models/model_neg'
    return model_out_path


def get_training_data(sentiment):
    '''
    Returns training data in the format needed to train the spaCy NER.
    Assumes a global DataFrame `df_train` with `text`, `selected_text`
    and `sentiment` columns.
    '''
    train_data = []
    for index, row in df_train.iterrows():
        if row.sentiment == sentiment:
            selected_text = row.selected_text
            text = row.text
            # the selected text becomes a single 'selected_text' entity span
            start = text.find(selected_text)
            end = start + len(selected_text)
            train_data.append((text, {"entities": [[start, end, 'selected_text']]}))
    return train_data
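

# Minimal end-to-end sketch, assuming the Kaggle "Tweet Sentiment Extraction"
# data layout; the CSV path and iteration count below are illustrative
# assumptions, not part of the original script.
if __name__ == "__main__":
    import pandas as pd

    # get_training_data() reads the module-level df_train defined here
    df_train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
    df_train = df_train.dropna(subset=['text', 'selected_text'])

    # train one NER model per sentiment and save it under ../working/
    for sentiment in ['positive', 'negative']:
        train_data = get_training_data(sentiment)
        model_path = get_model_out_path(sentiment)
        trainme(train_data, model_path, n_iter=3, model=None)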