Add experimentator class #3

Open · wants to merge 2 commits into base: main
Changes from all commits
22 changes: 20 additions & 2 deletions autobrat/annotator.py
@@ -46,7 +46,7 @@ def predict(self, texts: t.List[str]) -> List[List[str]]:

    def get_classifications(self, text: str):
        parsed_sentence = [w.text for w in self.nlp(text)]
        print(parsed_sentence)
        # print(parsed_sentence)
        ans = []
        for classifier in self.models:
            prediction = classifier.predict([parsed_sentence])
@@ -66,6 +66,19 @@ def get_probs(self,

        return ans

    def final_prediction(self, texts: List[str]):
        predictions = self.predict(texts)
        probs = [self.get_probs(p) for p in predictions]

        ans = []
        for sentence in probs:
            ans.append([])
            for term in sentence:
                # keep the label with the highest probability for each token
                m = max(term.items(), key=lambda x: x[1])
                ans[-1].append(m[0])

        return ans

    def get_entropy(self, probs: t.List[Dict[str, float]]):
        return sum(-1 * sum([word * log2(word) for word in words.values()])
                   for words in probs)
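Note for reviewers: `get_entropy` sums the Shannon entropy of every token's label distribution, so longer or more ambiguous sentences score higher; also note that a hard 0.0 probability would crash `log2`. A minimal worked example with made-up distributions (not real model output):

from math import log2

# Hypothetical per-token label distributions for one sentence:
# each dict maps a class label to its predicted probability.
probs = [
    {'O': 0.9, 'Concept': 0.1},  # confident token -> low entropy
    {'O': 0.5, 'Concept': 0.5},  # uncertain token -> high entropy
]

# Same formula as get_entropy: sum of per-token Shannon entropies.
sentence_entropy = sum(-sum(p * log2(p) for p in token.values())
                       for token in probs)
print(sentence_entropy)  # ~0.469 + 1.0 = ~1.469 bits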
@@ -106,5 +119,10 @@ def fit(self, data: Collection):
        lines, classes = load_training_entities(data)
        lines = [[w.text for w in l] for l in lines]

        return self.fit_classes(lines, classes)

    def fit_classes(self, lines, classes):
        for model in self.models:
            model.fit(lines, classes)
            # put the best pipeline in training mode, fit it on the
            # given data, then switch it back to evaluation mode
            model.best_pipeline_.send('train')
            model.best_pipeline_.run((lines, classes))
            model.best_pipeline_.send('eval')
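How the new annotator API is meant to be used (a sketch, not part of the diff; it assumes `annotator` is an already-fitted SentencesAnnotator and the example sentence is a placeholder):

texts = ['El asma es una enfermedad cronica.']

# One label per token, taking the highest-probability class
# across the model ensemble.
labels = annotator.final_prediction(texts)

# Entropy over the per-token probability distributions of the first
# sentence, usable as an uncertainty score when selecting batches.
probs = annotator.get_probs(annotator.predict(texts)[0])
uncertainty = annotator.get_entropy(probs)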
63 changes: 48 additions & 15 deletions autobrat/controller.py
@@ -1,8 +1,9 @@
import collections
from functools import reduce
from typing import List, Optional
from pathlib import Path
from .annotator import SentencesAnnotator
from scripts.utils import Collection
from scripts.utils import Collection, Sentence
from tinydb import TinyDB, Query
from random import choice
from string import ascii_lowercase, digits
@@ -21,8 +22,8 @@ def generate_random_str(size: int = 10):
class AnotatorController():
    def __init__(
            self,
            sentences_files: List[Path],
            baseline_collection: Path,
            sentences: List[str],
            baseline_collection: Collection,
            generated_pack_path: Path = Path('./generated_packs'),
            closed_packs_path: Path = Path('./closed_packs'),
            db_path: Path = Path('./sentencedb.json'),
@@ -39,9 +40,8 @@ def __init__(
        self.db = TinyDB(db_path)
        saved_sentences = set(s['text'] for s in self.db.all())

        self._load_sentences(sentences_files, saved_sentences)
        collection = Collection()
        collection.load_dir(baseline_collection)
        self._load_sentences(sentences, saved_sentences)
        collection = baseline_collection.clone()
        collection.load_dir(closed_packs_path)

        self.annotator = sentence_annotator
@@ -50,15 +50,14 @@ def __init__(
            collection, self.number_of_models)

    def _load_sentences(self,
                        files: List[Path],
                        sentences: List[str],
                        ignore_sentences: List[str] = []):

        for file in files:
            for line in file.open():
                if not line or line in ignore_sentences:
                    continue
        for line in sentences:
            if not line or line in ignore_sentences:
                continue

            self.db.insert({'text': line[:-1], 'in_pack': False})
            self.db.insert({'text': line, 'in_pack': False})

    def update_selected(self, sentences):
        Senteces = Query()
@@ -80,6 +79,13 @@ def generate_pack(self,
        dest_folder = self.generated_pack_path
        pack_name = generate_random_str()

        selected = self.get_batch(pack_size)

        self.build_pack(dest_folder / pack_name, pack_name, selected)

    def get_batch(self,
                  batch_size: int,
                  set_processed: bool = True) -> List[str]:
        Sentences = Query()

        texts = [s['text'] for s in self.db.search(Sentences.in_pack == False)]
@@ -90,9 +96,15 @@

        sentences.sort(key=lambda x: x[1], reverse=True)

        selected = [s[0] for s in sentences[:pack_size]]
        self.update_selected(selected)
        self.build_pack(dest_folder / (pack_name), pack_name, selected)
        selected = [s[0] for s in sentences[:batch_size]]

        if not selected:
            return []

        if set_processed:
            self.update_selected(selected)

        return selected

    def close_pack(self, path: Path):
        collection = Collection()
@@ -104,3 +116,24 @@ def close_pack(self, path: Path):
        logger.info(
            f'Finished pack, moving to closed packs folder ({path}) -> ({self.closed_packs_path})'
        )

    @staticmethod
    def load_from_files(
            sentences_files: List[Path],
            baseline_collection: Path,
            generated_pack_path: Path = Path('./generated_packs'),
            closed_packs_path: Path = Path('./closed_packs'),
            db_path: Path = Path('./sentencedb.json'),
            sentence_annotator: Optional[SentencesAnnotator] = None
    ) -> "AnotatorController":

        sentences = []

        for file in sentences_files:
            sentences.extend(
                line.rstrip('\n') for line in file.open() if line.strip())
        collection = Collection()
        collection.load_dir(baseline_collection)
        return AnotatorController(sentences, collection, generated_pack_path,
                                  closed_packs_path, db_path,
                                  sentence_annotator)
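How the refactored constructor and the new `load_from_files` factory fit together (a sketch; the corpus paths and batch size are placeholders):

from pathlib import Path
from autobrat.controller import AnotatorController

# load_from_files reads raw sentences and the baseline collection from
# disk, then delegates to the new in-memory constructor.
controller = AnotatorController.load_from_files(
    sentences_files=[Path('corpus/unlabeled.txt')],
    baseline_collection=Path('corpus/baseline'),
)

# get_batch() exposes the entropy-ranked selection that generate_pack()
# used to do inline, so callers (e.g. the experimentator) can consume
# batches without writing pack files.
batch = controller.get_batch(batch_size=32)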
109 changes: 109 additions & 0 deletions autobrat/experimentator.py
@@ -0,0 +1,109 @@
import logging
from pathlib import Path
from typing import List

import spacy
from scripts.utils import Collection
from .controller import AnotatorController
from .utils import load_training_entities, make_sentence
from random import shuffle
from functools import reduce
from scripts.score import subtaskA, compute_metrics

logger = logging.getLogger('experimentator')

nlp = spacy.load('es')

class Experimentator(object):
    def __init__(self, corpus: Collection) -> None:
        logger.info(f'Corpus total sentences: {len(corpus.sentences)}')
        lines, classes = load_training_entities(corpus)
        self.unique_classes = reduce(lambda x, y: x | y,
                                     [set(c) for c in classes])
        logger.info(f'Unique classes: {self.unique_classes}')

        self.train_data = {
            sentence.text: ([w.text for w in line], category)
            for sentence, line, category in zip(corpus.sentences, lines,
                                                classes)
        }
        self.original_corpus = corpus.clone()
        self.training, self.test, self.sentences = self.select_training_sentences(
            corpus)

        self.test_spacy_doc = {s.text: nlp(s.text) for s in self.test.sentences}

        self.sentences_to_train: List[str] = [
            s.text for s in self.training.sentences
        ]

        super().__init__()

    def select_training_sentences(self, corpus: Collection):
        size_training = 300
        size_test = 100
        sentences = corpus.sentences[:]
        shuffle(sentences)

        return (Collection(sentences[:size_training]),
                Collection(sentences[size_training:size_training + size_test]),
                [s.text for s in sentences[size_training + size_test:]])

    def score(self, submit: Collection):
        score_data = subtaskA(self.test, submit)
        metrics = compute_metrics(score_data, skipB=True, skipC=True)
        logger.info(f'Score: {metrics}')
        return metrics['f1']

    def run_experiment(self,
                       batch_size: int,
                       db_name: str = 'experiment.json'):
        controller = AnotatorController(self.sentences,
                                        self.training,
                                        db_path=Path(db_name))

        scores = []
        sentences = controller.get_batch(batch_size)
        while sentences:
            self.sentences_to_train.extend(sentences)
            lines, classes = [], []
            for s in self.sentences_to_train:
                line, cls = self.train_data[s]
                lines.append(line)
                classes.append(cls)

            controller.annotator.fit_classes(lines, classes)

            sentences = []
            predictions = controller.annotator.final_prediction(
                list(self.test_spacy_doc))
            for (s, spacy_doc), prediction in zip(self.test_spacy_doc.items(),
                                                  predictions):
                sentence = make_sentence(spacy_doc, prediction,
                                         self.unique_classes)
                sentence.fix_ids()
                sentences.append(sentence)

            predicted_collection = Collection(sentences)

            scores.append(self.score(predicted_collection))
            sentences = controller.get_batch(batch_size)

        return scores

    def train_with_all(self):
        controller = AnotatorController(self.sentences,
                                        self.original_corpus,
                                        db_path=Path('fullcorpus.json'))
        sentences = []
        predictions = controller.annotator.final_prediction(
            list(self.test_spacy_doc))
        for (s, spacy_doc), prediction in zip(self.test_spacy_doc.items(),
                                              predictions):
            sentence = make_sentence(spacy_doc, prediction,
                                     self.unique_classes)
            sentence.fix_ids()
            sentences.append(sentence)

        predicted_collection = Collection(sentences)
        return self.score(predicted_collection)
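End to end, the new class supports learning-curve experiments like the following (a sketch; the corpus path and batch size are placeholders):

from pathlib import Path
from scripts.utils import Collection
from autobrat.experimentator import Experimentator

# Hypothetical corpus location; load_dir is the same loader used elsewhere.
corpus = Collection()
corpus.load_dir(Path('corpus/training'))

experimentator = Experimentator(corpus)

# Active-learning curve: F1 on the held-out split after each batch.
scores = experimentator.run_experiment(batch_size=50)

# Upper bound for comparison: train once on the full corpus.
ceiling = experimentator.train_with_all()

print(scores, ceiling)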