Add experimentator class #3

Open · wants to merge 2 commits into base: main
Changes from all commits
22 changes: 20 additions & 2 deletions autobrat/annotator.py
@@ -46,7 +46,7 @@ def predict(self, texts: t.List[str]) -> List[List[str]]:

    def get_classifications(self, text: str):
        parsed_sentence = [w.text for w in self.nlp(text)]
        print(parsed_sentence)
        # print(parsed_sentence)
        ans = []
        for classifier in self.models:
            prediction = classifier.predict([parsed_sentence])
@@ -66,6 +66,19 @@ def get_probs(self,

        return ans

    def final_prediction(self, texts: List[str]):
        predictions = self.predict(texts)
        probs = [self.get_probs(p) for p in predictions]

        ans = []
        for sentence in probs:
            ans.append([])
            for term in sentence:
                # keep the label with the highest probability for each token
                m = max(term.items(), key=lambda x: x[1])
                ans[-1].append(m[0])

        return ans

    def get_entropy(self, probs: t.List[Dict[str, float]]):
        return sum(-1 * sum([word * log2(word) for word in words.values()])
                   for words in probs)
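Note for reviewers: `get_entropy` sums the Shannon entropy of every token's label distribution, so longer or more ambiguous sentences score higher; also note that a hard 0.0 probability would crash `log2`. A minimal worked example with made-up distributions (not real model output):

from math import log2

# Hypothetical per-token label distributions for one sentence:
# each dict maps a class label to its predicted probability.
probs = [
    {'O': 0.9, 'Concept': 0.1},  # confident token -> low entropy
    {'O': 0.5, 'Concept': 0.5},  # uncertain token -> high entropy
]

# Same formula as get_entropy: sum of per-token Shannon entropies.
sentence_entropy = sum(-sum(p * log2(p) for p in token.values())
                       for token in probs)
print(sentence_entropy)  # ~0.469 + 1.0 = ~1.469 bits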
@@ -106,5 +119,10 @@ def fit(self, data: Collection):
        lines, classes = load_training_entities(data)
        lines = [[w.text for w in l] for l in lines]

        return self.fit_classes(lines, classes)

    def fit_classes(self, lines, classes):
        for model in self.models:
            model.fit(lines, classes)
            # put the best pipeline in training mode, fit it on the
            # given data, then switch it back to evaluation mode
            model.best_pipeline_.send('train')
            model.best_pipeline_.run((lines, classes))
            model.best_pipeline_.send('eval')
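How the new annotator API is meant to be used (a sketch, not part of the diff; it assumes `annotator` is an already-fitted SentencesAnnotator and the example sentence is a placeholder):

texts = ['El asma es una enfermedad cronica.']

# One label per token, taking the highest-probability class
# across the model ensemble.
labels = annotator.final_prediction(texts)

# Entropy over the per-token probability distributions of the first
# sentence, usable as an uncertainty score when selecting batches.
probs = annotator.get_probs(annotator.predict(texts)[0])
uncertainty = annotator.get_entropy(probs)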
63 changes: 48 additions & 15 deletions autobrat/controller.py
@@ -1,8 +1,9 @@
import collections
from functools import reduce
from typing import List, Optional
from pathlib import Path
from .annotator import SentencesAnnotator
from scripts.utils import Collection
from scripts.utils import Collection, Sentence
from tinydb import TinyDB, Query
from random import choice
from string import ascii_lowercase, digits
@@ -21,8 +22,8 @@ def generate_random_str(size: int = 10):
class AnotatorController():
    def __init__(
            self,
            sentences_files: List[Path],
            baseline_collection: Path,
            sentences: List[str],
            baseline_collection: Collection,
            generated_pack_path: Path = Path('./generated_packs'),
            closed_packs_path: Path = Path('./closed_packs'),
            db_path: Path = Path('./sentencedb.json'),
@@ -39,9 +40,8 @@ def __init__(
        self.db = TinyDB(db_path)
        saved_sentences = set(s['text'] for s in self.db.all())

        self._load_sentences(sentences_files, saved_sentences)
        collection = Collection()
        collection.load_dir(baseline_collection)
        self._load_sentences(sentences, saved_sentences)
        collection = baseline_collection.clone()
        collection.load_dir(closed_packs_path)

        self.annotator = sentence_annotator
@@ -50,15 +50,14 @@ def __init__(
            collection, self.number_of_models)

    def _load_sentences(self,
                        files: List[Path],
                        sentences: List[str],
                        ignore_sentences: List[str] = []):

        for file in files:
            for line in file.open():
                if not line or line in ignore_sentences:
                    continue
        for line in sentences:
            if not line or line in ignore_sentences:
                continue

            self.db.insert({'text': line[:-1], 'in_pack': False})
            self.db.insert({'text': line, 'in_pack': False})

    def update_selected(self, sentences):
        Senteces = Query()
@@ -80,6 +79,13 @@ def generate_pack(self,
        dest_folder = self.generated_pack_path
        pack_name = generate_random_str()

        selected = self.get_batch(pack_size)

        self.build_pack(dest_folder / pack_name, pack_name, selected)

    def get_batch(self,
                  batch_size: int,
                  set_processed: bool = True) -> List[str]:
        Sentences = Query()

        texts = [s['text'] for s in self.db.search(Sentences.in_pack == False)]
@@ -90,9 +96,15 @@

        sentences.sort(key=lambda x: x[1], reverse=True)

        selected = [s[0] for s in sentences[:pack_size]]
        self.update_selected(selected)
        self.build_pack(dest_folder / (pack_name), pack_name, selected)
        selected = [s[0] for s in sentences[:batch_size]]

        if not selected:
            return []

        if set_processed:
            self.update_selected(selected)

        return selected

    def close_pack(self, path: Path):
        collection = Collection()
@@ -104,3 +116,24 @@ def close_pack(self, path: Path):
        logger.info(
            f'Finished pack, moving to closed packs folder ({path}) -> ({self.closed_packs_path})'
        )

    @staticmethod
    def load_from_files(
            sentences_files: List[Path],
            baseline_collection: Path,
            generated_pack_path: Path = Path('./generated_packs'),
            closed_packs_path: Path = Path('./closed_packs'),
            db_path: Path = Path('./sentencedb.json'),
            sentence_annotator: Optional[SentencesAnnotator] = None
    ) -> "AnotatorController":

        sentences = []

        for file in sentences_files:
            sentences.extend(
                line.rstrip('\n') for line in file.open() if line.strip())
        collection = Collection()
        collection.load_dir(baseline_collection)
        return AnotatorController(sentences, collection, generated_pack_path,
                                  closed_packs_path, db_path,
                                  sentence_annotator)
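How the refactored constructor and the new `load_from_files` factory fit together (a sketch; the corpus paths and batch size are placeholders):

from pathlib import Path
from autobrat.controller import AnotatorController

# load_from_files reads raw sentences and the baseline collection from
# disk, then delegates to the new in-memory constructor.
controller = AnotatorController.load_from_files(
    sentences_files=[Path('corpus/unlabeled.txt')],
    baseline_collection=Path('corpus/baseline'),
)

# get_batch() exposes the entropy-ranked selection that generate_pack()
# used to do inline, so callers (e.g. the experimentator) can consume
# batches without writing pack files.
batch = controller.get_batch(batch_size=32)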
109 changes: 109 additions & 0 deletions autobrat/experimentator.py
@@ -0,0 +1,109 @@
import logging
from pathlib import Path
from typing import List

import spacy
from scripts.utils import Collection
from .controller import AnotatorController
from .utils import load_training_entities, make_sentence
from random import shuffle
from functools import reduce
from scripts.score import subtaskA, compute_metrics

logger = logging.getLogger('experimentator')

nlp = spacy.load('es')

class Experimentator(object):
    def __init__(self, corpus: Collection) -> None:
        logger.info(f'Corpus total sentences: {len(corpus.sentences)}')
        lines, classes = load_training_entities(corpus)
        self.unique_classes = reduce(lambda x, y: x | y,
                                     [set(c) for c in classes])
        logger.info(f'Unique classes: {self.unique_classes}')

        self.train_data = {
            sentence.text: ([w.text for w in line], category)
            for sentence, line, category in zip(corpus.sentences, lines,
                                                classes)
        }
        self.original_corpus = corpus.clone()
        self.training, self.test, self.sentences = self.select_training_sentences(
            corpus)

        self.test_spacy_doc = {s.text: nlp(s.text) for s in self.test.sentences}

        self.sentences_to_train: List[str] = [
            s.text for s in self.training.sentences
        ]

        super().__init__()

    def select_training_sentences(self, corpus: Collection):
        size_training = 300
        size_test = 100
        sentences = corpus.sentences[:]
        shuffle(sentences)

        return (Collection(sentences[:size_training]),
                Collection(sentences[size_training:size_training + size_test]),
                [s.text for s in sentences[size_training + size_test:]])

    def score(self, submit: Collection):
        score_data = subtaskA(self.test, submit)
        metrics = compute_metrics(score_data, skipB=True, skipC=True)
        logger.info(f'Score: {metrics}')
        return metrics['f1']

    def run_experiment(self,
                       batch_size: int,
                       db_name: str = 'experiment.json'):
        controller = AnotatorController(self.sentences,
                                        self.training,
                                        db_path=Path(db_name))

        scores = []
        sentences = controller.get_batch(batch_size)
        while sentences:
            self.sentences_to_train.extend(sentences)
            lines, classes = [], []
            for s in self.sentences_to_train:
                line, cls = self.train_data[s]
                lines.append(line)
                classes.append(cls)

            controller.annotator.fit_classes(lines, classes)

            sentences = []
            predictions = controller.annotator.final_prediction(
                list(self.test_spacy_doc))
            for (s, spacy_doc), prediction in zip(self.test_spacy_doc.items(),
                                                  predictions):
                sentence = make_sentence(spacy_doc, prediction,
                                         self.unique_classes)
                sentence.fix_ids()
                sentences.append(sentence)

            predicted_collection = Collection(sentences)

            scores.append(self.score(predicted_collection))
            sentences = controller.get_batch(batch_size)

        return scores

    def train_with_all(self):
        controller = AnotatorController(self.sentences,
                                        self.original_corpus,
                                        db_path=Path('fullcorpus.json'))
        sentences = []
        predictions = controller.annotator.final_prediction(
            list(self.test_spacy_doc))
        for (s, spacy_doc), prediction in zip(self.test_spacy_doc.items(),
                                              predictions):
            sentence = make_sentence(spacy_doc, prediction,
                                     self.unique_classes)
            sentence.fix_ids()
            sentences.append(sentence)

        predicted_collection = Collection(sentences)
        return self.score(predicted_collection)
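End to end, the new class supports learning-curve experiments like the following (a sketch; the corpus path and batch size are placeholders):

from pathlib import Path
from scripts.utils import Collection
from autobrat.experimentator import Experimentator

# Hypothetical corpus location; load_dir is the same loader used elsewhere.
corpus = Collection()
corpus.load_dir(Path('corpus/training'))

experimentator = Experimentator(corpus)

# Active-learning curve: F1 on the held-out split after each batch.
scores = experimentator.run_experiment(batch_size=50)

# Upper bound for comparison: train once on the full corpus.
ceiling = experimentator.train_with_all()

print(scores, ceiling)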