diff --git a/app.py b/app.py
index 752e0c9..ed593d4 100644
--- a/app.py
+++ b/app.py
@@ -6,9 +6,9 @@
 
 app = FastAPI()
 
-vec : Vectorizer
-meta_config : Meta
-logger = getLogger('uvicorn')
+vec: Vectorizer
+meta_config: Meta
+logger = getLogger("uvicorn")
 
 
 @app.on_event("startup")
@@ -20,16 +20,22 @@ def startup_event():
     cuda_per_process_memory_fraction = 1.0
     if "CUDA_PER_PROCESS_MEMORY_FRACTION" in os.environ:
         try:
-            cuda_per_process_memory_fraction = float(os.getenv("CUDA_PER_PROCESS_MEMORY_FRACTION"))
+            cuda_per_process_memory_fraction = float(
+                os.getenv("CUDA_PER_PROCESS_MEMORY_FRACTION")
+            )
         except ValueError:
-            logger.error(f"Invalid CUDA_PER_PROCESS_MEMORY_FRACTION (should be between 0.0-1.0)")
+            logger.error(
+                f"Invalid CUDA_PER_PROCESS_MEMORY_FRACTION (should be between 0.0-1.0)"
+            )
     if 0.0 <= cuda_per_process_memory_fraction <= 1.0:
-        logger.info(f"CUDA_PER_PROCESS_MEMORY_FRACTION set to {cuda_per_process_memory_fraction}")
-    cuda_support=False
-    cuda_core=""
+        logger.info(
+            f"CUDA_PER_PROCESS_MEMORY_FRACTION set to {cuda_per_process_memory_fraction}"
+        )
+    cuda_support = False
+    cuda_core = ""
 
     if cuda_env is not None and cuda_env == "true" or cuda_env == "1":
-        cuda_support=True
+        cuda_support = True
         cuda_core = os.getenv("CUDA_CORE")
         if cuda_core is None or cuda_core == "":
             cuda_core = "cuda:0"
@@ -40,10 +46,15 @@ def startup_event():
     # Batch text tokenization enabled by default
     direct_tokenize = False
     transformers_direct_tokenize = os.getenv("T2V_TRANSFORMERS_DIRECT_TOKENIZE")
-    if transformers_direct_tokenize is not None and transformers_direct_tokenize == "true" or transformers_direct_tokenize == "1":
+    if (
+        transformers_direct_tokenize is not None
+        and transformers_direct_tokenize == "true"
+        or transformers_direct_tokenize == "1"
+    ):
         direct_tokenize = True
 
     model_dir = "./models/model"
+
     def get_model_directory() -> (str, bool):
         if os.path.exists(f"{model_dir}/model_name"):
             with open(f"{model_dir}/model_name", "r") as f:
@@ -65,17 +76,27 @@ def log_info_about_onnx(onnx_runtime: bool):
             if os.path.exists(f"{model_dir}/onnx_quantization_info"):
                 with open(f"{model_dir}/onnx_quantization_info", "r") as f:
                     onnx_quantization_info = f.read()
-            logger.info(f"Running ONNX vectorizer with quantized model for {onnx_quantization_info}")
+            logger.info(
+                f"Running ONNX vectorizer with quantized model for {onnx_quantization_info}"
+            )
 
     model_name, use_sentence_transformer_vectorizer = get_model_directory()
     onnx_runtime = get_onnx_runtime()
     log_info_about_onnx(onnx_runtime)
 
     meta_config = Meta(model_dir, model_name, use_sentence_transformer_vectorizer)
-    vec = Vectorizer(model_dir, cuda_support, cuda_core, cuda_per_process_memory_fraction,
-                     meta_config.get_model_type(), meta_config.get_architecture(),
-                     direct_tokenize, onnx_runtime, use_sentence_transformer_vectorizer,
-                     model_name)
+    vec = Vectorizer(
+        model_dir,
+        cuda_support,
+        cuda_core,
+        cuda_per_process_memory_fraction,
+        meta_config.get_model_type(),
+        meta_config.get_architecture(),
+        direct_tokenize,
+        onnx_runtime,
+        use_sentence_transformer_vectorizer,
+        model_name,
+    )
 
 
 @app.get("/.well-known/live", response_class=Response)
@@ -96,8 +117,6 @@ async def read_item(item: VectorInput, response: Response):
         vector = await vec.vectorize(item.text, item.config)
         return {"text": item.text, "vector": vector.tolist(), "dim": len(vector)}
     except Exception as e:
-        logger.exception(
-            'Something went wrong while vectorizing data.'
-        )
+        logger.exception("Something went wrong while vectorizing data.")
         response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR
         return {"error": str(e)}
diff --git a/custom_prerequisites.py b/custom_prerequisites.py
index 42a13ea..5f19d83 100755
--- a/custom_prerequisites.py
+++ b/custom_prerequisites.py
@@ -2,4 +2,4 @@
 
 import nltk
 
-nltk.download('punkt')
+nltk.download("punkt")
diff --git a/download.py b/download.py
index 3c04630..0b025a6 100755
--- a/download.py
+++ b/download.py
@@ -91,7 +91,9 @@ def quantization_config(onnx_cpu_arch: str):
     os.remove(f"{model_dir}/model.onnx")
 
     # Save information about ONNX runtime
    save_to_file(f"{model_dir}/onnx_runtime", onnx_runtime)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_name, trust_remote_code=trust_remote_code
+    )
     tokenizer.save_pretrained(onnx_path)
diff --git a/requirements-test.txt b/requirements-test.txt
index dfc5234..f8c6a0d 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,13 +1,14 @@
 requests==2.32.3
-transformers==4.42.4
-fastapi==0.112.0
-uvicorn==0.30.5
+transformers==4.44.2
+fastapi==0.115.0
+uvicorn==0.31.0
 nltk==3.9.1
-torch==2.4.0
+torch==2.4.1
 sentencepiece==0.2.0
-sentence-transformers==3.0.1
-optimum==1.21.2
-onnxruntime==1.18.1
-onnx==1.16.2
+sentence-transformers==3.1.1
+optimum==1.22.0
+onnxruntime==1.19.2
+onnx==1.17.0
 numpy==1.26.4
+einops==0.8.0
 pytest
diff --git a/requirements.txt b/requirements.txt
index d2c8e98..d3f99d0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,12 @@
-transformers==4.42.4
-fastapi==0.112.0
-uvicorn==0.30.5
+transformers==4.44.2
+fastapi==0.115.0
+uvicorn==0.31.0
 nltk==3.9.1
-torch==2.4.0
+torch==2.4.1
 sentencepiece==0.2.0
-sentence-transformers==3.0.1
-optimum==1.21.2
-onnxruntime==1.18.1
-onnx==1.16.2
+sentence-transformers==3.1.1
+optimum==1.22.0
+onnxruntime==1.19.2
+onnx==1.17.0
 numpy==1.26.4
+einops==0.8.0
diff --git a/smoke_test.py b/smoke_test.py
index 92a55a7..46f57a4 100755
--- a/smoke_test.py
+++ b/smoke_test.py
@@ -5,11 +5,11 @@
 
 class SmokeTest(unittest.TestCase):
     def setUp(self):
-        self.url = 'http://localhost:8000'
+        self.url = "http://localhost:8000"
 
        for i in range(0, 100):
            try:
-                res = requests.get(self.url + '/.well-known/ready')
+                res = requests.get(self.url + "/.well-known/ready")
                if res.status_code == 204:
                    return
                else:
@@ -21,17 +21,17 @@ def setUp(self):
         raise Exception("did not start up")
 
     def test_well_known_ready(self):
-        res = requests.get(self.url + '/.well-known/ready')
+        res = requests.get(self.url + "/.well-known/ready")
 
         self.assertEqual(res.status_code, 204)
 
     def test_well_known_live(self):
-        res = requests.get(self.url + '/.well-known/live')
+        res = requests.get(self.url + "/.well-known/live")
 
         self.assertEqual(res.status_code, 204)
 
     def test_meta(self):
-        res = requests.get(self.url + '/meta')
+        res = requests.get(self.url + "/meta")
 
         self.assertEqual(res.status_code, 200)
         self.assertIsInstance(res.json(), dict)
@@ -39,7 +39,7 @@ def test_meta(self):
 
     def test_vectorizing(self):
         def try_to_vectorize(url):
             print(f"url: {url}")
-            req_body = {'text': 'The London Eye is a ferris wheel at the River Thames.'}
+            req_body = {"text": "The London Eye is a ferris wheel at the River Thames."}
             res = requests.post(url, json=req_body)
             resBody = res.json()
@@ -49,7 +49,7 @@ def try_to_vectorize(url):
             # below tests that what we deem a reasonable vector is returned. We are
             # aware of 384 and 768 dim vectors, which should both fall in that
             # range
-            self.assertTrue(len(resBody['vector']) > 100)
+            self.assertTrue(len(resBody["vector"]) > 100)
             print(f"vector dimensions are: {len(resBody['vector'])}")
 
         try_to_vectorize(self.url + "/vectors/")
diff --git a/test_app.py b/test_app.py
index f088cb1..656cfb0 100644
--- a/test_app.py
+++ b/test_app.py
@@ -11,7 +11,7 @@
 
 
 def wait_for_uvicorn_start():
-    url = 'http://localhost:8000/.well-known/ready'
+    url = "http://localhost:8000/.well-known/ready"
 
    for i in range(0, 100):
        try:
@@ -19,8 +19,7 @@ def wait_for_uvicorn_start():
             if res.status_code == 204:
                 return
             else:
-                raise Exception(
-                    "status code is {}".format(res.status_code))
+                raise Exception("status code is {}".format(res.status_code))
         except Exception as e:
             print("Attempt {}: {}".format(i, e))
             time.sleep(2)
@@ -32,10 +31,15 @@ def run_server():
     uvicorn.run(app)
 
 
-@pytest.fixture(params=["t5-small",
-                        "distilroberta-base",
-                        "vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
-                        "vblagoje/dpr-question_encoder-single-lfqa-wiki"], scope="function")
+@pytest.fixture(
+    params=[
+        "t5-small",
+        "distilroberta-base",
+        "vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
+        "vblagoje/dpr-question_encoder-single-lfqa-wiki",
+    ],
+    scope="function",
+)
 def server(request):
     os.environ["MODEL_NAME"] = request.param
     subprocess.call("python download.py", shell=True)
@@ -48,12 +52,12 @@ def server(request):
 
 def test_vectorizing(server):
     wait_for_uvicorn_start()
 
-    url = 'http://127.0.0.1:8000/vectors/'
-    req_body = {'text': 'The London Eye is a ferris wheel at the River Thames.'}
+    url = "http://127.0.0.1:8000/vectors/"
+    req_body = {"text": "The London Eye is a ferris wheel at the River Thames."}
 
     res = requests.post(url, json=req_body)
     resBody = res.json()
-    vectorized_text = resBody['vector']
+    vectorized_text = resBody["vector"]
 
     assert 200 == res.status_code
@@ -66,14 +70,15 @@ def test_vectorizing(server):
 
     # now let's try two sentences
-    req_body = {'text': 'The London Eye is a ferris wheel at the River Thames. Here is the second sentence.'}
+    req_body = {
+        "text": "The London Eye is a ferris wheel at the River Thames. Here is the second sentence."
+    }
     res = requests.post(url, json=req_body)
     resBody = res.json()
-    vectorized_text = resBody['vector']
+    vectorized_text = resBody["vector"]
 
     assert 200 == res.status_code
     assert type(vectorized_text) is list
     assert 128 <= len(vectorized_text) <= 1024
-
diff --git a/vectorizer.py b/vectorizer.py
index 75e0a74..b284bde 100644
--- a/vectorizer.py
+++ b/vectorizer.py
@@ -11,16 +11,22 @@
 from optimum.onnxruntime import ORTModelForFeatureExtraction
 from pydantic import BaseModel
 from sentence_transformers import SentenceTransformer
-from transformers import (AutoModel, AutoTokenizer, DPRContextEncoder,
-                          DPRQuestionEncoder, T5ForConditionalGeneration,
-                          T5Tokenizer)
+from transformers import (
+    AutoModel,
+    AutoTokenizer,
+    DPRContextEncoder,
+    DPRQuestionEncoder,
+    T5ForConditionalGeneration,
+    T5Tokenizer,
+)
 
 from config import TRUST_REMOTE_CODE
 
 # limit transformer batch size to limit parallel inference, otherwise we run
 # into memory problems
 MAX_BATCH_SIZE = 25  # TODO: take from config
-DEFAULT_POOL_METHOD="masked_mean"
+DEFAULT_POOL_METHOD = "masked_mean"
+
 
 class VectorInputConfig(BaseModel):
     pooling_strategy: str
@@ -34,20 +40,42 @@ class VectorInput(BaseModel):
 class Vectorizer:
     executor: ThreadPoolExecutor
 
-    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float,
-                 model_type: str, architecture: str, direct_tokenize: bool, onnx_runtime: bool,
-                 use_sentence_transformer_vectorizer: bool, model_name: str):
+    def __init__(
+        self,
+        model_path: str,
+        cuda_support: bool,
+        cuda_core: str,
+        cuda_per_process_memory_fraction: float,
+        model_type: str,
+        architecture: str,
+        direct_tokenize: bool,
+        onnx_runtime: bool,
+        use_sentence_transformer_vectorizer: bool,
+        model_name: str,
+    ):
         self.executor = ThreadPoolExecutor()
         if onnx_runtime:
             self.vectorizer = ONNXVectorizer(model_path)
         else:
-            if model_type == 't5' or use_sentence_transformer_vectorizer:
-                self.vectorizer = SentenceTransformerVectorizer(model_path, model_name, cuda_core)
+            if model_type == "t5" or use_sentence_transformer_vectorizer:
+                self.vectorizer = SentenceTransformerVectorizer(
+                    model_path, model_name, cuda_core
+                )
             else:
-                self.vectorizer = HuggingFaceVectorizer(model_path, cuda_support, cuda_core, cuda_per_process_memory_fraction, model_type, architecture, direct_tokenize)
+                self.vectorizer = HuggingFaceVectorizer(
+                    model_path,
+                    cuda_support,
+                    cuda_core,
+                    cuda_per_process_memory_fraction,
+                    model_type,
+                    architecture,
+                    direct_tokenize,
+                )
 
     async def vectorize(self, text: str, config: VectorInputConfig):
-        return await asyncio.wrap_future(self.executor.submit(self.vectorizer.vectorize, text, config))
+        return await asyncio.wrap_future(
+            self.executor.submit(self.vectorizer.vectorize, text, config)
+        )
 
 
 class SentenceTransformerVectorizer:
@@ -56,8 +84,10 @@ class SentenceTransformerVectorizer:
 
     def __init__(self, model_path: str, model_name: str, cuda_core: str):
         self.cuda_core = cuda_core
-        self.model = SentenceTransformer(model_name, cache_folder=model_path, device=self.get_device())
-        self.model.eval() # make sure we're in inference mode, not training
+        self.model = SentenceTransformer(
+            model_name, cache_folder=model_path, device=self.get_device()
+        )
+        self.model.eval()  # make sure we're in inference mode, not training
 
     def get_device(self) -> Optional[str]:
         if self.cuda_core is not None and self.cuda_core != "":
@@ -65,7 +95,12 @@ def get_device(self) -> Optional[str]:
         return None
 
     def vectorize(self, text: str, config: VectorInputConfig):
-        embedding = self.model.encode([text], device=self.get_device(), convert_to_tensor=False, convert_to_numpy=True)
+        embedding = self.model.encode(
+            [text],
+            device=self.get_device(),
+            convert_to_tensor=False,
+            convert_to_numpy=True,
+        )
 
         return embedding[0]
 
@@ -75,23 +110,38 @@ class ONNXVectorizer:
     def __init__(self, model_path) -> None:
         onnx_path = Path(model_path)
-        self.model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model_quantized.onnx",
-                                                                  trust_remote_code=TRUST_REMOTE_CODE)
-        self.tokenizer = AutoTokenizer.from_pretrained(onnx_path, trust_remote_code=TRUST_REMOTE_CODE)
+        self.model = ORTModelForFeatureExtraction.from_pretrained(
+            onnx_path,
+            file_name="model_quantized.onnx",
+            trust_remote_code=TRUST_REMOTE_CODE,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            onnx_path, trust_remote_code=TRUST_REMOTE_CODE
+        )
 
     def mean_pooling(self, model_output, attention_mask):
-        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        token_embeddings = model_output[
+            0
+        ]  # First element of model_output contains all token embeddings
+        input_mask_expanded = (
+            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        )
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+            input_mask_expanded.sum(1), min=1e-9
+        )
 
     def vectorize(self, text: str, config: VectorInputConfig):
-        encoded_input = self.tokenizer([text], padding=True, truncation=True, return_tensors='pt')
+        encoded_input = self.tokenizer(
+            [text], padding=True, truncation=True, return_tensors="pt"
+        )
 
         # Compute token embeddings
         with torch.no_grad():
             model_output = self.model(**encoded_input)
 
         # Perform pooling
-        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
+        sentence_embeddings = self.mean_pooling(
+            model_output, encoded_input["attention_mask"]
+        )
 
         # Normalize embeddings
         sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
@@ -106,29 +156,48 @@ class HuggingFaceVectorizer:
     model_type: str
     direct_tokenize: bool
 
-    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str, direct_tokenize: bool):
+    def __init__(
+        self,
+        model_path: str,
+        cuda_support: bool,
+        cuda_core: str,
+        cuda_per_process_memory_fraction: float,
+        model_type: str,
+        architecture: str,
+        direct_tokenize: bool,
+    ):
         self.cuda = cuda_support
         self.cuda_core = cuda_core
         self.cuda_per_process_memory_fraction = cuda_per_process_memory_fraction
         self.model_type = model_type
         self.direct_tokenize = direct_tokenize
 
-        self.model_delegate: HFModel = ModelFactory.model(model_type, architecture, cuda_support, cuda_core)
+        self.model_delegate: HFModel = ModelFactory.model(
+            model_type, architecture, cuda_support, cuda_core
+        )
         self.model = self.model_delegate.create_model(model_path)
 
         if self.cuda:
             self.model.to(self.cuda_core)
             if self.cuda_per_process_memory_fraction:
-                torch.cuda.set_per_process_memory_fraction(self.cuda_per_process_memory_fraction)
-        self.model.eval() # make sure we're in inference mode, not training
+                torch.cuda.set_per_process_memory_fraction(
+                    self.cuda_per_process_memory_fraction
+                )
+        self.model.eval()  # make sure we're in inference mode, not training
 
         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
-        nltk.data.path.append('./nltk_data')
+        nltk.data.path.append("./nltk_data")
 
-    def tokenize(self, text:str):
-        return self.tokenizer(text, padding=True, truncation=True, max_length=500,
-                              add_special_tokens = True, return_tensors="pt")
+    def tokenize(self, text: str):
+        return self.tokenizer(
+            text,
+            padding=True,
+            truncation=True,
+            max_length=500,
+            add_special_tokens=True,
+            return_tensors="pt",
+        )
 
     def get_embeddings(self, batch_results):
         return self.model_delegate.get_embeddings(batch_results)
@@ -151,7 +220,11 @@ def vectorize(self, text: str, config: VectorInputConfig):
                 return batch_sum_vectors.detach()
             else:
                 # tokenize text
-                sentences = sent_tokenize(' '.join(text.split(),))
+                sentences = sent_tokenize(
+                    " ".join(
+                        text.split(),
+                    )
+                )
                 num_sentences = len(sentences)
                 number_of_batch_vectors = math.ceil(num_sentences / MAX_BATCH_SIZE)
                 batch_sum_vectors = 0
@@ -162,8 +235,12 @@ def vectorize(self, text: str, config: VectorInputConfig):
                     tokens = self.tokenize(sentences[start_index:end_index])
                     if self.cuda:
                         tokens.to(self.cuda_core)
-                    batch_results = self.get_batch_results(tokens, sentences[start_index:end_index])
-                    batch_sum_vectors += self.pool_embedding(batch_results, tokens, config)
+                    batch_results = self.get_batch_results(
+                        tokens, sentences[start_index:end_index]
+                    )
+                    batch_sum_vectors += self.pool_embedding(
+                        batch_results, tokens, config
+                    )
 
                 return batch_sum_vectors.detach() / num_sentences
 
@@ -177,11 +254,15 @@ def __init__(self, cuda_support: bool, cuda_core: str):
         self.cuda_core = cuda_core
 
     def create_tokenizer(self, model_path):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=TRUST_REMOTE_CODE
+        )
         return self.tokenizer
 
     def create_model(self, model_path):
-        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
+        self.model = AutoModel.from_pretrained(
+            model_path, trust_remote_code=TRUST_REMOTE_CODE
+        )
         return self.model
 
     def get_embeddings(self, batch_results):
@@ -195,7 +276,9 @@ def pool_embedding(self, batch_results, tokens, config: VectorInputConfig):
         if pooling_method == "cls":
             return self.get_embeddings(batch_results)[:, 0, :].sum(0)
         elif pooling_method == "masked_mean":
-            return self.pool_sum(self.get_embeddings(batch_results), tokens['attention_mask'])
+            return self.pool_sum(
+                self.get_embeddings(batch_results), tokens["attention_mask"]
+            )
         else:
             raise Exception(f"invalid pooling method '{pooling_method}'")
 
@@ -210,8 +293,12 @@ def pool_method_from_config(self, config: VectorInputConfig):
 
     def get_sum_embeddings_mask(self, embeddings, input_mask_expanded):
         if self.cuda:
-            sum_embeddings = torch.sum(embeddings * input_mask_expanded, 1).to(self.cuda_core)
-            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9).to(self.cuda_core)
+            sum_embeddings = torch.sum(embeddings * input_mask_expanded, 1).to(
+                self.cuda_core
+            )
+            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9).to(
+                self.cuda_core
+            )
             return sum_embeddings, sum_mask
         else:
             sum_embeddings = torch.sum(embeddings * input_mask_expanded, 1)
@@ -219,8 +306,12 @@ def get_sum_embeddings_mask(self, embeddings, input_mask_expanded):
             return sum_embeddings, sum_mask
 
     def pool_sum(self, embeddings, attention_mask):
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
-        sum_embeddings, sum_mask = self.get_sum_embeddings_mask(embeddings, input_mask_expanded)
+        input_mask_expanded = (
+            attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
+        )
+        sum_embeddings, sum_mask = self.get_sum_embeddings_mask(
+            embeddings, input_mask_expanded
+        )
         sentences = sum_embeddings / sum_mask
         return sentences.sum(0)
 
@@ -234,13 +325,17 @@ def __init__(self, architecture: str, cuda_support: bool, cuda_core: str):
 
     def create_model(self, model_path):
         if self.architecture == "DPRQuestionEncoder":
-            self.model = DPRQuestionEncoder.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
+            self.model = DPRQuestionEncoder.from_pretrained(
+                model_path, trust_remote_code=TRUST_REMOTE_CODE
+            )
         else:
-            self.model = DPRContextEncoder.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
+            self.model = DPRContextEncoder.from_pretrained(
+                model_path, trust_remote_code=TRUST_REMOTE_CODE
+            )
         return self.model
 
     def get_batch_results(self, tokens, text):
-        return self.model(tokens['input_ids'], tokens['attention_mask'])
+        return self.model(tokens["input_ids"], tokens["attention_mask"])
 
     def pool_embedding(self, batch_results, tokens, config: VectorInputConfig):
         # no pooling needed for DPR
@@ -257,18 +352,22 @@ def __init__(self, cuda_support: bool, cuda_core: str):
         self.cuda_core = cuda_core
 
     def create_model(self, model_path):
-        self.model = T5ForConditionalGeneration.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
+        self.model = T5ForConditionalGeneration.from_pretrained(
+            model_path, trust_remote_code=TRUST_REMOTE_CODE
+        )
         return self.model
 
     def create_tokenizer(self, model_path):
-        self.tokenizer = T5Tokenizer.from_pretrained(model_path, trust_remote_code=TRUST_REMOTE_CODE)
+        self.tokenizer = T5Tokenizer.from_pretrained(
+            model_path, trust_remote_code=TRUST_REMOTE_CODE
+        )
         return self.tokenizer
 
     def get_embeddings(self, batch_results):
         return batch_results["encoder_last_hidden_state"]
 
     def get_batch_results(self, tokens, text):
-        input_ids, attention_mask = tokens['input_ids'], tokens['attention_mask']
+        input_ids, attention_mask = tokens["input_ids"], tokens["attention_mask"]
 
         target_encoding = self.tokenizer(
             text, padding="longest", max_length=500, truncation=True
@@ -279,16 +378,18 @@ def get_batch_results(self, tokens, text):
         else:
             labels = torch.tensor(labels)
 
-        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+        return self.model(
+            input_ids=input_ids, attention_mask=attention_mask, labels=labels
+        )
 
 
 class ModelFactory:
     @staticmethod
     def model(model_type, architecture, cuda_support: bool, cuda_core: str):
-        if model_type == 't5':
+        if model_type == "t5":
             return T5Model(cuda_support, cuda_core)
-        elif model_type == 'dpr':
+        elif model_type == "dpr":
             return DPRModel(architecture, cuda_support, cuda_core)
         else:
             return HFModel(cuda_support, cuda_core)