You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
Traceback (most recent call last):
File "retriever.py", line 159, in <module>
train()
File "retriever.py", line 148, in train
retriever.index()
File "envs/retriever/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "envs/retriever/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "envs/retriever/lib/python3.10/site-packages/goldenretriever/pytorch_modules/model.py", line 239, in index
return self.document_index.index(
File "envs/retriever/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "envs/retriever/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "envs/retriever/lib/python3.10/site-packages/goldenretriever/indexers/inmemory.py", line 222, in index
for batch in tqdm(dataloader, desc="Indexing"):
File "envs/retriever/lib/python3.10/site-packages/tqdm/std.py", line 1181, in __iter__
for obj in iterable:
File "/home/alessandroscire/miniconda3/envs/retriever/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
data = self._next_data()
File "envs/retriever/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1346, in _next_data
return self._process_data(data)
File "envs/retriever/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1372, in _process_data
data.reraise()
File "envs/retriever/lib/python3.10/site-packages/torch/_utils.py", line 722, in reraise
raise exception
OverflowError: Caught OverflowError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "envs/retriever/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
data = fetcher.fetch(index)
File "envs/retriever/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
return self.collate_fn(data)
File "envs/retriever/lib/python3.10/site-packages/goldenretriever/indexers/inmemory.py", line 181, in collate_fn
tokenizer(
File "envs/retriever/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2803, in __call__
encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
File "envs/retriever/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2889, in _call_one
return self.batch_encode_plus(
File "envs/retriever/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3080, in batch_encode_plus
return self._batch_encode_plus(
File "envs/retriever/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 496, in _batch_encode_plus
self.set_truncation_and_padding(
File "envs/retriever/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 451, in set_truncation_and_padding
self._tokenizer.enable_truncation(**target)
OverflowError: int too big to convert
The text was updated successfully, but these errors were encountered:
The retriever does not work when DeBERTa is used as the encoder model.
Code:
Error:
The text was updated successfully, but these errors were encountered: