diff --git a/pyproject.toml b/pyproject.toml
index bd1b1bc..d784429 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ requires-python = ">=3.8"
 
 [tool.poetry]
 name = "spacy-partial-tagger"
-version = "0.13.0"
+version = "0.14.0"
 description = "Sequence Tagger for Partially Annotated Dataset in spaCy"
 authors = ["yasufumi "]
 license = "MIT"
@@ -27,7 +27,7 @@ transformers = {extras = ["ja"], version = "^4.25.1"}
 torch = "^2.0.1"
 spacy = {extras = ["transformers"], version = "^3.3.1"}
 spacy-alignments = "^0.8.5"
-pytorch-partial-tagger = "^0.1.5"
+pytorch-partial-tagger = "^0.1.6"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.3.0"
diff --git a/requirements.txt b/requirements.txt
index 6707c95..756b649 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -40,9 +40,9 @@ pydantic==1.10.8 ; python_version >= "3.8" and python_version < "4.0"
 pyflakes==2.4.0 ; python_version >= "3.8" and python_version < "4.0"
 pytest-cov==3.0.0 ; python_version >= "3.8" and python_version < "4.0"
 pytest==7.3.1 ; python_version >= "3.8" and python_version < "4.0"
-pytorch-partial-tagger==0.1.5 ; python_version >= "3.8" and python_version < "4.0"
+pytorch-partial-tagger==0.1.6 ; python_version >= "3.8" and python_version < "4.0"
 pyyaml==6.0 ; python_version >= "3.8" and python_version < "4.0"
-regex==2023.5.5 ; python_version >= "3.8" and python_version < "4.0"
+regex==2023.6.3 ; python_version >= "3.8" and python_version < "4.0"
 requests==2.31.0 ; python_version >= "3.8" and python_version < "4.0"
 rhoknp==1.3.1 ; python_version >= "3.8" and python_version < "4.0"
 ruff==0.0.270 ; python_version >= "3.8" and python_version < "4.0"
diff --git a/spacy_partial_tagger/pipeline.py b/spacy_partial_tagger/pipeline.py
index 43a31d8..c14be5c 100644
--- a/spacy_partial_tagger/pipeline.py
+++ b/spacy_partial_tagger/pipeline.py
@@ -52,9 +52,9 @@ def set_annotations(
         tokenized_texts = [doc.user_data["tokenized_text"] for doc in docs]
 
         tag_factory = TagFactory(tokenized_texts, self.label_set)
-        tags_collection = tag_factory.create_char_based_tags(tag_indices)
+        tags_batch = tag_factory.create_char_based_tags(tag_indices)
 
-        for doc, tags in zip(docs, tags_collection):
+        for doc, tags in zip(docs, tags_batch):
             ents = []
             for tag in tags:
                 span = doc.char_span(tag.start, tag.start + tag.length, tag.label)
@@ -115,13 +115,13 @@ def get_loss(
         ]
 
         tag_factory = TagFactory(tokenized_texts, self.label_set)
-        tags_collection = []
+        tags_batch = []
         for example in examples:
             tags = tuple(
                 create_tag(ent.start_char, len(ent.text), ent.label_)
                 for ent in example.y.ents
             )
-            tags_collection.append(CharBasedTags(tags, example.y.text))
+            tags_batch.append(CharBasedTags(tags, example.y.text))
 
         lengths = [text.num_tokens for text in tokenized_texts]
         max_length = max(lengths)
@@ -130,9 +130,7 @@
             device=scores_pt.device,
         )
 
-        tag_bitmap = tag_factory.create_tag_bitmap(
-            tuple(tags_collection), scores_pt.device
-        )
+        tag_bitmap = tag_factory.create_tag_bitmap(tuple(tags_batch), scores_pt.device)
 
         loss = expected_entity_ratio_loss(
             scores_pt, tag_bitmap, mask, self.label_set.get_outside_index()
diff --git a/spacy_partial_tagger/tagger.py b/spacy_partial_tagger/tagger.py
index 966eca7..3548208 100644
--- a/spacy_partial_tagger/tagger.py
+++ b/spacy_partial_tagger/tagger.py
@@ -45,15 +45,15 @@ def forward(
 
     tokenizer: BaseTokenizer = model.attrs["tokenizer"]
 
-    tokenized_texts = tokenizer(tuple(doc.text for doc in X))
+    text_batch = tokenizer(tuple(doc.text for doc in X))
 
-    for doc, text in zip(X, tokenized_texts.tokenized_texts):
+    for doc, text in zip(X, text_batch.tokenized_texts):
        doc.user_data["tokenized_text"] = text
 
     device = get_torch_default_device()
     (log_potentials, tag_indices), backward = model.layers[0](
-        [tokenized_texts.get_tagger_inputs(device), tokenized_texts.get_mask(device)],
+        [text_batch.get_tagger_inputs(device), text_batch.get_mask(device)],
         is_train,
     )
 
 
diff --git a/spacy_partial_tagger/tokenizer.py b/spacy_partial_tagger/tokenizer.py
index 1cac38c..37f0ba3 100644
--- a/spacy_partial_tagger/tokenizer.py
+++ b/spacy_partial_tagger/tokenizer.py
@@ -4,8 +4,8 @@
 from partial_tagger.data import Span, TokenizedText
 from partial_tagger.data.batch.text import (
     BaseTokenizer,
+    TextBatch,
     Texts,
-    TokenizedTexts,
     TransformerTokenizer,
 )
 from transformers import AutoTokenizer
@@ -31,7 +31,7 @@ def __init__(
         }
         self.__tokenizer_args["return_offsets_mapping"] = True
 
-    def __call__(self, texts: Texts) -> TokenizedTexts:
+    def __call__(self, texts: Texts) -> TextBatch:
         batch_encoding = self.__tokenizer(texts, **self.__tokenizer_args)
 
         pad_token_id = self.__tokenizer.pad_token_id
@@ -62,7 +62,7 @@ def __call__(self, texts: Texts) -> TokenizedTexts:
         mask = torch.tensor(
             [[True] * length + [False] * (max_length - length) for length in lengths]
         )
-        return TokenizedTexts(tuple(tokenized_texts), batch_encoding, mask)
+        return TextBatch(tuple(tokenized_texts), batch_encoding, mask)
 
 
 def get_tokenizer(
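
Note for reviewers: the upstream rename in pytorch-partial-tagger 0.1.6 (`TokenizedTexts` → `TextBatch`, with `tags_collection` → `tags_batch` following the same naming) is mechanical, but the sketch below shows the `TextBatch` surface the call sites above depend on. Only the attributes used in this patch (`tokenized_texts`, `get_tagger_inputs`, `get_mask`) are assumed; the `encode` and `tagger_inputs` helpers are hypothetical illustrations, not part of the change.

```python
# A minimal sketch, assuming only the TextBatch members this patch uses.
# `encode` and `tagger_inputs` are hypothetical helpers for illustration.
import torch
from partial_tagger.data.batch.text import BaseTokenizer, TextBatch, Texts


def encode(tokenizer: BaseTokenizer, texts: Texts) -> TextBatch:
    # BaseTokenizer.__call__ now returns a TextBatch (was TokenizedTexts).
    return tokenizer(texts)


def tagger_inputs(text_batch: TextBatch, device: torch.device) -> tuple:
    # The same input/mask pair that forward() in tagger.py hands to
    # model.layers[0]; per-text objects live on text_batch.tokenized_texts.
    return text_batch.get_tagger_inputs(device), text_batch.get_mask(device)
```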