Bump to Torch 1.7 and Transformers 4.1.1 (#665)
* bump torch version

* update to transformers 4.1.1

* move CI from azure to github actions

* modify and fix tests

Co-authored-by: Timo Moeller <timo.moeller@deepset.ai>
tholor and Timoeller authored Dec 28, 2020
1 parent 97972a3 commit 94c6b8d
Showing 25 changed files with 215 additions and 336 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,34 @@
name: Build

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-20.04

    steps:
    - uses: actions/checkout@v2

    - name: Set up Python 3.8
      uses: actions/setup-python@v2
      with:
        python-version: 3.8

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install pytest
        pip install -r requirements.txt
        pip install onnxruntime
        pip install -e .
    - name: Run pytest - only "conversion" marker
      run: cd test && pytest -m "conversion"

    - name: Run Pytest - all except conversion marker
      run: cd test && pytest -m "not conversion"
41 changes: 0 additions & 41 deletions azure-pipelines.yml

This file was deleted.

6 changes: 3 additions & 3 deletions farm/data_handler/processor.py
@@ -13,7 +13,7 @@
import torch
from numpy.random import random as random_float
from sklearn.preprocessing import StandardScaler
-from transformers.configuration_auto import AutoConfig
+from transformers import AutoConfig
from tokenizers import Encoding

from farm.data_handler.dataset import convert_features_to_dataset
@@ -111,8 +111,8 @@ def __init__(
Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
:type proxies: dict
:param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers.
-    Note: Enabling multithreading in Rust AND multiprocessing in python can cause
-    trouble incl. deadlocks.
+    Note: Enabling multithreading in Rust AND multiprocessing in python might cause
+    deadlocks.
:type multithreading_rust: bool
"""
if not multithreading_rust:
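The deadlock note above is why the updated tests further down pass multithreading_rust=False. A minimal usage sketch, not part of this commit (the model name is illustrative, and it assumes SquadProcessor forwards multithreading_rust to the base Processor):

from farm.data_handler.processor import SquadProcessor
from farm.modeling.tokenization import Tokenizer

# When preprocessing also uses Python multiprocessing, disable Rust-side
# threading in the fast tokenizer to avoid the deadlocks described above.
tokenizer = Tokenizer.load("bert-base-cased", use_fast=True)
processor = SquadProcessor(
    tokenizer=tokenizer,
    max_seq_len=384,
    doc_stride=128,
    data_dir=None,
    multithreading_rust=False,
)
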
2 changes: 1 addition & 1 deletion farm/data_handler/samples.py
@@ -1,4 +1,4 @@
-from transformers.tokenization_bert import whitespace_tokenize
+from transformers.models.bert.tokenization_bert import whitespace_tokenize
from farm.visual.ascii.images import SAMPLE
import numpy as np

28 changes: 16 additions & 12 deletions farm/infer.py
@@ -170,6 +170,7 @@ def load(
tokenizer_class=None,
use_fast=True,
tokenizer_args=None,
+multithreading_rust=True,
dummy_ph=False,
benchmarking=False,
):
@@ -218,13 +219,17 @@
:type disable_tqdm: bool
:param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
:type tokenizer_class: str
-:param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
+:param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
use the Python one (False).
:type use_fast: bool
:param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method.
See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed tokenizer documentation
on `Hugging Face Transformers <https://huggingface.co/transformers/>`_.
:type tokenizer_args: dict
:type use_fast: bool
+:param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers.
+    Note: Enabling multithreading in Rust AND multiprocessing in python might cause
+    deadlocks.
+:type multithreading_rust: bool
:param dummy_ph: If True, methods of the prediction head will be replaced
with a dummy method. This is used to isolate lm run time from ph run time.
:type dummy_ph: bool
@@ -250,14 +255,6 @@
else:
processor = Processor.load_from_dir(model_name_or_path)

-# override processor attributes loaded from config file with inferencer params
-processor.max_seq_len = max_seq_len
-if hasattr(processor, "doc_stride"):
-    assert doc_stride < max_seq_len, "doc_stride is longer than max_seq_len. This means that there will be gaps " \
-                                     "as the passage windows slide, causing the model to skip over parts of the document. "\
-                                     "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384) "
-    processor.doc_stride = doc_stride

# b) or from remote transformers model hub
else:
if not task_type:
@@ -278,8 +275,15 @@
tokenizer_args=tokenizer_args,
use_fast=use_fast)

-if not isinstance(model,ONNXAdaptiveModel):
-    model, _ = optimize_model(model=model, device=device, local_rank=-1, optimizer=None)
+# override processor attributes loaded from config or HF with inferencer params
+processor.max_seq_len = max_seq_len
+processor.multithreading_rust = multithreading_rust
+if hasattr(processor, "doc_stride"):
+    assert doc_stride < max_seq_len, "doc_stride is longer than max_seq_len. This means that there will be gaps " \
+                                     "as the passage windows slide, causing the model to skip over parts of the document. " \
+                                     "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384) "
+    processor.doc_stride = doc_stride

return cls(
model,
processor,
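With the new multithreading_rust flag and the relocated doc_stride check, a typical load call now looks roughly like the sketch below. The model name and values are illustrative; doc_stride must stay below max_seq_len or the assertion above fires:

from farm.infer import QAInferencer

inferencer = QAInferencer.load(
    "deepset/minilm-uncased-squad2",  # illustrative model choice
    task_type="question_answering",
    max_seq_len=384,                  # suggested pairing from the assert message
    doc_stride=128,
    use_fast=True,                    # fast (Rust) tokenizer, now the default
    multithreading_rust=False,        # avoid Rust-thread/Python-multiprocess deadlocks
    num_processes=0,
)
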
6 changes: 3 additions & 3 deletions farm/modeling/adaptive_model.py
@@ -8,7 +8,7 @@
import numpy
import torch
from torch import nn
-from transformers.configuration_auto import AutoConfig
+from transformers import AutoConfig
from transformers.convert_graph_to_onnx import convert, quantize as quantize_model


@@ -455,11 +455,11 @@ def forward_lm(self, **kwargs):

# Run forward pass of language model
if extraction_layer == -1:
-sequence_output, pooled_output = self.language_model(**kwargs, output_all_encoded_layers=False)
+sequence_output, pooled_output = self.language_model(**kwargs, return_dict=False, output_all_encoded_layers=False)
else:
# get output from an earlier layer
self.language_model.enable_hidden_states_output()
-sequence_output, pooled_output, all_hidden_states = self.language_model(**kwargs)
+sequence_output, pooled_output, all_hidden_states = self.language_model(**kwargs, return_dict=False)
sequence_output = all_hidden_states[extraction_layer]
pooled_output = None #not available in earlier layers
self.language_model.disable_hidden_states_output()
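The added return_dict=False is needed because transformers 4.x models return ModelOutput objects from forward() by default, so the tuple unpacking above would otherwise fail. A standalone sketch of the behavior this relies on (bert-base-uncased is just an example checkpoint):

import torch
from transformers import BertModel, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
inputs = tokenizer("FARM on transformers 4.x", return_tensors="pt")

with torch.no_grad():
    # return_dict=False restores the 3.x-style tuple output
    sequence_output, pooled_output = model(**inputs, return_dict=False)

print(sequence_output.shape)  # torch.Size([1, seq_len, hidden_size])
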
23 changes: 13 additions & 10 deletions farm/modeling/language_model.py
@@ -34,17 +34,20 @@

logger = logging.getLogger(__name__)

-from transformers.modeling_bert import BertModel, BertConfig
-from transformers.modeling_roberta import RobertaModel, RobertaConfig
-from transformers.modeling_xlnet import XLNetModel, XLNetConfig
-from transformers.modeling_albert import AlbertModel, AlbertConfig
-from transformers.modeling_xlm_roberta import XLMRobertaModel, XLMRobertaConfig
-from transformers.modeling_distilbert import DistilBertModel, DistilBertConfig
-from transformers.modeling_electra import ElectraModel, ElectraConfig
-from transformers.modeling_camembert import CamembertModel, CamembertConfig
-from transformers.modeling_auto import AutoModel, AutoConfig
+from transformers import (
+    BertModel, BertConfig,
+    RobertaModel, RobertaConfig,
+    XLNetModel, XLNetConfig,
+    AlbertModel, AlbertConfig,
+    XLMRobertaModel, XLMRobertaConfig,
+    DistilBertModel, DistilBertConfig,
+    ElectraModel, ElectraConfig,
+    CamembertModel, CamembertConfig
+)
+
+from transformers import AutoModel, AutoConfig
from transformers.modeling_utils import SequenceSummary
-from transformers.tokenization_bert import load_vocab
+from transformers.models.bert.tokenization_bert import load_vocab
import transformers

from farm.modeling import wordembedding_utils
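All of the import rewrites in this commit follow the same transformers 4.x rule: per-model modules moved under transformers.models.*, while the public classes stay importable from the top-level package. A hedged compatibility sketch, not part of this commit, for code that must run on both major versions:

try:
    # transformers >= 4.x layout
    from transformers.models.bert.tokenization_bert import load_vocab
except ImportError:
    # transformers 3.x layout
    from transformers.tokenization_bert import load_vocab

# Top-level exports work the same way in both major versions:
from transformers import BertModel, AutoModel, AutoConfig
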
4 changes: 2 additions & 2 deletions farm/modeling/prediction_head.py
@@ -4,8 +4,8 @@
import numpy as np

from pathlib import Path
-from transformers.modeling_bert import BertForPreTraining, ACT2FN
-from transformers.modeling_auto import AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoModelForSequenceClassification
+from transformers.models.bert.modeling_bert import BertForPreTraining, ACT2FN
+from transformers import AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoModelForSequenceClassification
from typing import List, Tuple

import torch
34 changes: 15 additions & 19 deletions farm/modeling/tokenization.py
@@ -23,26 +23,22 @@
from pathlib import Path

import numpy as np
-from transformers.tokenization_albert import AlbertTokenizer
-from transformers.tokenization_albert_fast import AlbertTokenizerFast
-from transformers.tokenization_bert import BertTokenizer, load_vocab
-from transformers.tokenization_bert_fast import BertTokenizerFast
-from transformers.tokenization_distilbert import DistilBertTokenizer
-from transformers.tokenization_distilbert_fast import DistilBertTokenizerFast
-from transformers.tokenization_electra import ElectraTokenizer
-from transformers.tokenization_electra_fast import ElectraTokenizerFast
-from transformers.tokenization_roberta import RobertaTokenizer
-from transformers.tokenization_roberta_fast import RobertaTokenizerFast
+from transformers import (
+    AlbertTokenizer, AlbertTokenizerFast,
+    BertTokenizer, BertTokenizerFast,
+    DistilBertTokenizer, DistilBertTokenizerFast,
+    ElectraTokenizer, ElectraTokenizerFast,
+    RobertaTokenizer, RobertaTokenizerFast,
+    XLMRobertaTokenizer, XLMRobertaTokenizerFast,
+    XLNetTokenizer, XLNetTokenizerFast,
+    CamembertTokenizer, CamembertTokenizerFast,
+    DPRContextEncoderTokenizer, DPRContextEncoderTokenizerFast,
+    DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast
+)
+from transformers.models.bert.tokenization_bert import load_vocab
from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
-from transformers.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast
-from transformers.tokenization_xlnet import XLNetTokenizer
-from transformers.tokenization_xlnet_fast import XLNetTokenizerFast
-from transformers.tokenization_camembert import CamembertTokenizer
-from transformers.tokenization_camembert_fast import CamembertTokenizerFast
-from transformers.modeling_auto import AutoConfig
-from transformers import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer
-from transformers import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast
+from transformers import AutoConfig


from farm.data_handler.samples import SampleBasket
from farm.modeling.wordembedding_utils import load_from_cache, EMBEDDING_VOCAB_FILES_MAP, run_split_on_punc
2 changes: 1 addition & 1 deletion farm/modeling/wordembedding_utils.py
@@ -10,7 +10,7 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
-from transformers.tokenization_bert import BertTokenizer
+from transformers import BertTokenizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from collections import Counter
8 changes: 5 additions & 3 deletions requirements.txt
@@ -2,8 +2,9 @@
setuptools
wheel
# PyTorch
---find-links=https://download.pytorch.org/whl/torch_stable.html
-torch>1.5,<1.7
+# Temp. disabled the next line as it gets currently resolved to https://download.pytorch.org/whl/rocm3.8/torch-1.7.1%2Brocm3.8-cp38-cp38-linux_x86_64.whl
+#--find-links=https://download.pytorch.org/whl/torch_stable.html
+torch>1.5,<1.8
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
@@ -17,7 +18,7 @@ sklearn
seqeval==0.0.12
mlflow==1.0.0
# huggingface repository
-transformers==3.5.1
+transformers==4.1.1
# accessing dictionary elements with dot notation
dotmap==1.3.0
# for inference-rest-apis
@@ -32,3 +33,4 @@ dill # pickle extension for (de-)serialization
#onnxruntime
#onnxruntime_tools
psutil
+sentencepiece
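The new sentencepiece pin reflects transformers 4.x no longer installing sentencepiece by default, while tokenizers such as XLNet, ALBERT and XLM-R still need it. A small sanity check, illustrative and not part of the repo, that an environment matches the new pins (assumes the packaging library is installed):

import torch
import transformers
from packaging import version

# torch must satisfy >1.5,<1.8; transformers is pinned exactly.
assert version.parse("1.5") < version.parse(torch.__version__) < version.parse("1.8")
assert transformers.__version__ == "4.1.1"
print(f"torch {torch.__version__}, transformers {transformers.__version__}")
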
2 changes: 1 addition & 1 deletion test/benchmarks/question_answering_components.py
@@ -160,7 +160,7 @@ def prepare_dict(sample_file, q, document_size):
if sample_file[-3:] == "txt":
text = f.read()[:document_size]
assert len(text) == document_size
dicts = [{"qas": [q], "context": text}]
dicts = [{"questions": [q], "text": text}]
elif sample_file[-4:] == "json":
data = json.load(f)
dicts = []
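The key rename swaps the SQuAD-style "qas"/"context" dict for FARM's own "questions"/"text" QA input schema. A minimal usage sketch of the format this benchmark now builds (model choice and texts are illustrative):

from farm.infer import QAInferencer

inferencer = QAInferencer.load("deepset/minilm-uncased-squad2",
                               task_type="question_answering",
                               num_processes=0)
dicts = [{
    "questions": ["What are FARM's fast tokenizers written in?"],
    "text": "FARM uses Hugging Face fast tokenizers, which are written in Rust.",
}]
result = inferencer.inference_from_dicts(dicts=dicts)
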
17 changes: 12 additions & 5 deletions test/conftest.py
@@ -33,7 +33,13 @@ def pytest_generate_tests(metafunc):
metafunc.parametrize("use_gpu", [False], scope="session")


-@pytest.fixture()
+def pytest_collection_modifyitems(items):
+    for item in items:
+        if "conversion" in item.nodeid:
+            item.add_marker(pytest.mark.conversion)
+
+
+@pytest.fixture(scope="module")
def adaptive_model_qa(use_gpu, num_processes):
"""
PyTest Fixture for a Question Answering Inferencer based on PyTorch.
@@ -61,20 +67,21 @@ def adaptive_model_qa(use_gpu, num_processes):
assert len(children) == 0


-@pytest.fixture()
+@pytest.fixture(scope="module")
def bert_base_squad2(request):
model = QAInferencer.load(
"deepset/bert-base-cased-squad2",
"deepset/minilm-uncased-squad2",
task_type="question_answering",
-    batch_size=16,
+    batch_size=4,
num_processes=0,
+    multithreading_rust=False,
use_fast=True # TODO parametrize this to test slow as well
)
return model

# TODO add other model types (roberta, xlm-r, albert) here as well

-@pytest.fixture()
+@pytest.fixture(scope="module")
def distilbert_squad(request):
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=False)
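The new pytest_collection_modifyitems hook tags tests as "conversion" by node id, so the CI workflow above can split the suite with -m "conversion" and -m "not conversion". To keep pytest from warning about an unknown marker, the marker would typically also be registered in conftest.py, e.g. (a sketch, assuming it is not already declared in a pytest.ini):

def pytest_configure(config):
    # Registering the custom marker silences PytestUnknownMarkWarning and
    # documents what `-m "conversion"` selects.
    config.addinivalue_line(
        "markers", "conversion: tests that convert/export models (run separately in CI)"
    )
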
(Diff for the remaining 12 changed files not shown.)
