Update pytorch-partial-tagger (#34)
* Bump pytorch-partial-tagger version

* Update incompatible parts

* Bump version

* Fix the import order in tokenizer.py

* Update requirements.txt
yasufumy authored Jun 4, 2023
1 parent 8b151b7 commit 2adbeab
Showing 5 changed files with 15 additions and 17 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -8,7 +8,7 @@ requires-python = ">=3.8"
 
 [tool.poetry]
 name = "spacy-partial-tagger"
-version = "0.13.0"
+version = "0.14.0"
 description = "Sequence Tagger for Partially Annotated Dataset in spaCy"
 authors = ["yasufumi <yasufumi.taniguchi@gmail.com>"]
 license = "MIT"
@@ -27,7 +27,7 @@ transformers = {extras = ["ja"], version = "^4.25.1"}
 torch = "^2.0.1"
 spacy = {extras = ["transformers"], version = "^3.3.1"}
 spacy-alignments = "^0.8.5"
-pytorch-partial-tagger = "^0.1.5"
+pytorch-partial-tagger = "^0.1.6"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "^1.3.0"
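Not part of the commit, but a quick way to confirm a checkout picked up the bumps above is to read pyproject.toml back with tomllib. This is only a sketch: tomllib is stdlib from Python 3.11, so the 3.8+ range declared here would need the tomli backport.

```python
# Sketch only: verify the bumped project version and dependency pin.
# tomllib needs Python 3.11+; on 3.8-3.10, `import tomli as tomllib` instead.
import tomllib

with open("pyproject.toml", "rb") as f:
    meta = tomllib.load(f)

assert meta["tool"]["poetry"]["version"] == "0.14.0"
assert meta["tool"]["poetry"]["dependencies"]["pytorch-partial-tagger"] == "^0.1.6"
```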
4 changes: 2 additions & 2 deletions requirements.txt
@@ -40,9 +40,9 @@ pydantic==1.10.8 ; python_version >= "3.8" and python_version < "4.0"
 pyflakes==2.4.0 ; python_version >= "3.8" and python_version < "4.0"
 pytest-cov==3.0.0 ; python_version >= "3.8" and python_version < "4.0"
 pytest==7.3.1 ; python_version >= "3.8" and python_version < "4.0"
-pytorch-partial-tagger==0.1.5 ; python_version >= "3.8" and python_version < "4.0"
+pytorch-partial-tagger==0.1.6 ; python_version >= "3.8" and python_version < "4.0"
 pyyaml==6.0 ; python_version >= "3.8" and python_version < "4.0"
-regex==2023.5.5 ; python_version >= "3.8" and python_version < "4.0"
+regex==2023.6.3 ; python_version >= "3.8" and python_version < "4.0"
 requests==2.31.0 ; python_version >= "3.8" and python_version < "4.0"
 rhoknp==1.3.1 ; python_version >= "3.8" and python_version < "4.0"
 ruff==0.0.270 ; python_version >= "3.8" and python_version < "4.0"
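The same check against an installed environment (again a sketch, not part of the commit); importlib.metadata is stdlib from Python 3.8, the floor of the version markers above.

```python
# Sketch only: confirm the installed distributions match the new pins.
from importlib.metadata import version

assert version("pytorch-partial-tagger") == "0.1.6"
assert version("regex") == "2023.6.3"
```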
12 changes: 5 additions & 7 deletions spacy_partial_tagger/pipeline.py
@@ -52,9 +52,9 @@ def set_annotations(
         tokenized_texts = [doc.user_data["tokenized_text"] for doc in docs]
         tag_factory = TagFactory(tokenized_texts, self.label_set)
 
-        tags_collection = tag_factory.create_char_based_tags(tag_indices)
+        tags_batch = tag_factory.create_char_based_tags(tag_indices)
 
-        for doc, tags in zip(docs, tags_collection):
+        for doc, tags in zip(docs, tags_batch):
             ents = []
             for tag in tags:
                 span = doc.char_span(tag.start, tag.start + tag.length, tag.label)
@@ -115,13 +115,13 @@ def get_loss(
         ]
         tag_factory = TagFactory(tokenized_texts, self.label_set)
 
-        tags_collection = []
+        tags_batch = []
         for example in examples:
             tags = tuple(
                 create_tag(ent.start_char, len(ent.text), ent.label_)
                 for ent in example.y.ents
             )
-            tags_collection.append(CharBasedTags(tags, example.y.text))
+            tags_batch.append(CharBasedTags(tags, example.y.text))
 
         lengths = [text.num_tokens for text in tokenized_texts]
         max_length = max(lengths)
@@ -130,9 +130,7 @@
             device=scores_pt.device,
         )
 
-        tag_bitmap = tag_factory.create_tag_bitmap(
-            tuple(tags_collection), scores_pt.device
-        )
+        tag_bitmap = tag_factory.create_tag_bitmap(tuple(tags_batch), scores_pt.device)
 
         loss = expected_entity_ratio_loss(
             scores_pt, tag_bitmap, mask, self.label_set.get_outside_index()
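The rename does not touch the offset convention these hunks rely on: create_tag takes a character start and a length, and set_annotations converts back to spaCy's (start, end) form via doc.char_span(tag.start, tag.start + tag.length, tag.label). A sketch of that round trip, assuming the import path for create_tag and that its result exposes the same start/length/label fields read in set_annotations (the diff only shows the call sites):

```python
# Sketch only: (start, length) tags vs. (start, end) spaCy spans.
from partial_tagger.data import create_tag  # import path is an assumption

text = "Tokyo is the capital of Japan."
tag = create_tag(0, 5, "LOC")  # start=0, length=5, i.e. text[0:5]
assert text[tag.start : tag.start + tag.length] == "Tokyo"
```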
6 changes: 3 additions & 3 deletions spacy_partial_tagger/tagger.py
@@ -45,15 +45,15 @@ def forward(
 
     tokenizer: BaseTokenizer = model.attrs["tokenizer"]
 
-    tokenized_texts = tokenizer(tuple(doc.text for doc in X))
+    text_batch = tokenizer(tuple(doc.text for doc in X))
 
-    for doc, text in zip(X, tokenized_texts.tokenized_texts):
+    for doc, text in zip(X, text_batch.tokenized_texts):
         doc.user_data["tokenized_text"] = text
 
     device = get_torch_default_device()
 
     (log_potentials, tag_indices), backward = model.layers[0](
-        [tokenized_texts.get_tagger_inputs(device), tokenized_texts.get_mask(device)],
+        [text_batch.get_tagger_inputs(device), text_batch.get_mask(device)],
         is_train,
     )
 
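For reference, the TextBatch surface that forward() exercises after the rename, collected into one sketch. It assumes only what the diff shows: per-text TokenizedText objects under .tokenized_texts, plus device-placed tagger inputs and a boolean padding mask.

```python
# Sketch only: the TextBatch members used by forward() above.
import torch
from partial_tagger.data.batch.text import BaseTokenizer, TextBatch

def encode(tokenizer: BaseTokenizer, texts: tuple, device: torch.device) -> TextBatch:
    text_batch = tokenizer(texts)
    for text in text_batch.tokenized_texts:  # one TokenizedText per input text
        print(text.num_tokens)  # token count, as used for masks in get_loss
    inputs = text_batch.get_tagger_inputs(device)  # encoder inputs on `device`
    mask = text_batch.get_mask(device)  # bool mask, True marks real tokens
    return text_batch
```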
6 changes: 3 additions & 3 deletions spacy_partial_tagger/tokenizer.py
@@ -4,8 +4,8 @@
 from partial_tagger.data import Span, TokenizedText
 from partial_tagger.data.batch.text import (
     BaseTokenizer,
+    TextBatch,
     Texts,
-    TokenizedTexts,
     TransformerTokenizer,
 )
 from transformers import AutoTokenizer
@@ -31,7 +31,7 @@ def __init__(
         }
         self.__tokenizer_args["return_offsets_mapping"] = True
 
-    def __call__(self, texts: Texts) -> TokenizedTexts:
+    def __call__(self, texts: Texts) -> TextBatch:
         batch_encoding = self.__tokenizer(texts, **self.__tokenizer_args)
 
         pad_token_id = self.__tokenizer.pad_token_id
@@ -62,7 +62,7 @@ def __call__(self, texts: Texts) -> TokenizedTexts:
         mask = torch.tensor(
             [[True] * length + [False] * (max_length - length) for length in lengths]
         )
-        return TokenizedTexts(tuple(tokenized_texts), batch_encoding, mask)
+        return TextBatch(tuple(tokenized_texts), batch_encoding, mask)
 
 
 def get_tokenizer(
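Resolved, the import block reads as below; per the "Fix the import order in tokenizer.py" message, TextBatch slots in alphabetically where TokenizedTexts was dropped.

```python
from partial_tagger.data import Span, TokenizedText
from partial_tagger.data.batch.text import (
    BaseTokenizer,
    TextBatch,
    Texts,
    TransformerTokenizer,
)
from transformers import AutoTokenizer
```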
