Fix T5 model embedding generation #67

Merged · 2 commits · Oct 27, 2023

app.py (11 changes: 9 additions & 2 deletions)
@@ -43,8 +43,15 @@ def startup_event():
     if transformers_direct_tokenize is not None and transformers_direct_tokenize == "true" or transformers_direct_tokenize == "1":
         direct_tokenize = True
 
-    meta_config = Meta('./models/model')
-    vec = Vectorizer('./models/model', cuda_support, cuda_core, cuda_per_process_memory_fraction,
+    def get_model_directory() -> str:
+        if os.path.exists("./models/model/model_name"):
+            with open("./models/model/model_name", "r") as f:
+                model_name = f.read()
+                return f"./models/model/{model_name}"
+        return "./models/model"
+
+    meta_config = Meta(get_model_directory())
+    vec = Vectorizer(get_model_directory(), cuda_support, cuda_core, cuda_per_process_memory_fraction,
                      meta_config.getModelType(), meta_config.get_architecture(), direct_tokenize)


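The new get_model_directory helper bridges download time and serve time: download.py drops a model_name marker file only for T5 models, so at startup the server either descends into the sentence-transformers snapshot directory named by the marker or falls back to ./models/model as before. A minimal standalone sketch of that lookup (paths mirror the diff; the example marker value assumes a hub name like sentence-transformers/sentence-t5-base):

import os

def get_model_directory(base: str = "./models/model") -> str:
    # download.py writes this marker only for T5 models; its contents are the
    # hub model name with "/" replaced by "_", which is also the directory
    # name sentence-transformers creates under the cache folder.
    marker = os.path.join(base, "model_name")
    if os.path.exists(marker):
        with open(marker, "r") as f:
            model_name = f.read()  # e.g. "sentence-transformers_sentence-t5-base"
        return os.path.join(base, model_name)
    return base
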
cicd/docker_push.sh (13 changes: 3 additions & 10 deletions)
@@ -8,21 +8,18 @@ set -eou pipefail
 # - any commit is pushed as :<model>-latest
 # - any commit is pushed as :<model>
 git_hash=
-pr=
 remote_repo=${REMOTE_REPO?Variable REMOTE_REPO is required}
 model_name=${MODEL_NAME?Variable MODEL_NAME is required}
+original_model_name=$model_name
 docker_username=${DOCKER_USERNAME?Variable DOCKER_USERNAME is required}
 docker_password=${DOCKER_PASSWORD?Variable DOCKER_PASSWORD is required}
-git_tag=${GITHUB_REF##*/}
-original_model_name=$model_name
+git_tag=$GITHUB_REF_NAME
 
 function main() {
   init
   echo "git ref type is $GITHUB_REF_TYPE"
   echo "git ref name is $GITHUB_REF_NAME"
   echo "git branch is $GIT_BRANCH"
   echo "git tag is $git_tag"
-  echo "pr is $pr"
   push_tag
 }

@@ -35,18 +32,14 @@ function init() {
   fi
 
   git_hash="$(git rev-parse HEAD | head -c 7)"
-  pr=false
-  if [ ! -z "$GIT_PULL_REQUEST" ]; then
-    pr="$GIT_PULL_REQUEST"
-  fi
 
   docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
   docker buildx create --use
   echo "$docker_password" | docker login -u "$docker_username" --password-stdin
 }
 
 function push_tag() {
-  if [ ! -z "$git_tag" ]; then
+  if [ ! -z "$git_tag" ] && [ "$GITHUB_REF_TYPE" == "tag" ]; then
     tag_git="$remote_repo:$model_name-$git_tag"
     tag_latest="$remote_repo:$model_name-latest"
     tag="$remote_repo:$model_name"

cicd/docker_push_custom_base.sh (2 changes: 1 addition & 1 deletion)
@@ -6,7 +6,7 @@ git_hash=
 remote_repo=${REMOTE_REPO?Variable REMOTE_REPO is required}
 docker_username=${DOCKER_USERNAME?Variable DOCKER_USERNAME is required}
 docker_password=${DOCKER_PASSWORD?Variable DOCKER_PASSWORD is required}
-git_tag=${GITHUB_REF##*/}
+git_tag=$GITHUB_REF_NAME
 
 function main() {
   init
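
Both push scripts switch from the parameter expansion ${GITHUB_REF##*/} to the GITHUB_REF_NAME variable that GitHub Actions provides. The two only agree when the ref name contains no slashes; the expansion keeps just the last path segment, so a branch such as feature/t5-support would previously be mangled. A small illustration (hypothetical ref values, with the shell expansion mimicked in Python):

# Hypothetical refs: why $GITHUB_REF_NAME is safer than ${GITHUB_REF##*/}.
examples = {
    "refs/tags/1.2.3": "1.2.3",                             # both approaches agree
    "refs/heads/feature/t5-support": "feature/t5-support",  # they diverge
}

for ref, ref_name in examples.items():
    naive = ref.rsplit("/", 1)[-1]  # what ${GITHUB_REF##*/} produces
    print(f"{ref}: naive={naive!r} GITHUB_REF_NAME={ref_name!r}")

The extra [ "$GITHUB_REF_TYPE" == "tag" ] guard in push_tag has the same flavor: since GITHUB_REF_NAME is also non-empty on branch builds, the emptiness check alone would pass there too, so the push is now gated on the ref actually being a tag.
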
download.py (35 changes: 22 additions & 13 deletions)
@@ -8,8 +8,10 @@
     AutoTokenizer,
     AutoConfig,
 )
+from sentence_transformers import SentenceTransformer
 
 
+model_dir = './models/model'
 model_name = os.getenv('MODEL_NAME', None)
 force_automodel = os.getenv('FORCE_AUTOMODEL', False)
 if not model_name:
@@ -22,21 +24,28 @@
 
 print(f"Downloading model {model_name} from huggingface model hub")
 config = AutoConfig.from_pretrained(model_name)
-if config.architectures and not force_automodel:
-    print(f"Using class {config.architectures[0]} to load model weights")
-    mod = __import__('transformers', fromlist=[config.architectures[0]])
-    try:
-        klass_architecture = getattr(mod, config.architectures[0])
-        model = klass_architecture.from_pretrained(model_name)
-    except AttributeError:
-        print(f"{config.architectures[0]} not found in transformers, fallback to AutoModel")
-        model = AutoModel.from_pretrained(model_name)
+model_type = config.to_dict()['model_type']
+
+if model_type is not None and model_type == "t5":
+    SentenceTransformer(model_name, cache_folder=model_dir)
+    with open(f"{model_dir}/model_name", "w") as f:
+        f.write(model_name.replace("/", "_"))
 else:
-    model = AutoModel.from_pretrained(model_name)
+    if config.architectures and not force_automodel:
+        print(f"Using class {config.architectures[0]} to load model weights")
+        mod = __import__('transformers', fromlist=[config.architectures[0]])
+        try:
+            klass_architecture = getattr(mod, config.architectures[0])
+            model = klass_architecture.from_pretrained(model_name)
+        except AttributeError:
+            print(f"{config.architectures[0]} not found in transformers, fallback to AutoModel")
+            model = AutoModel.from_pretrained(model_name)
+    else:
+        model = AutoModel.from_pretrained(model_name)
 
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-model.save_pretrained('./models/model')
-tokenizer.save_pretrained('./models/model')
+    model.save_pretrained(model_dir)
+    tokenizer.save_pretrained(model_dir)
 
 nltk.download('punkt', download_dir='./nltk_data')
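
For T5 models the script now defers to sentence-transformers entirely: the SentenceTransformer(model_name, cache_folder=model_dir) call downloads the full snapshot, including the pooling and dense modules that the plain AutoModel path drops for sentence-t5 style models, and the marker file records which subdirectory it landed in. A sketch of the resulting layout, assuming MODEL_NAME=sentence-transformers/sentence-t5-base and sentence-transformers 2.2.2, which names the cache directory after the hub name with "/" replaced by "_":

import os

model_dir = './models/model'
model_name = "sentence-transformers/sentence-t5-base"  # assumed example

# Mirrors the marker written by download.py; app.py reads it back at startup
# to locate the snapshot that sentence-transformers cached.
snapshot_dir = os.path.join(model_dir, model_name.replace("/", "_"))
print(snapshot_dir)  # ./models/model/sentence-transformers_sentence-t5-base
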
requirements-test.txt (1 change: 1 addition & 0 deletions)
@@ -5,4 +5,5 @@ uvicorn==0.21.1
 nltk==3.8.1
 torch==2.0.0
 sentencepiece==0.1.97
+sentence-transformers==2.2.2
 pytest

requirements.txt (9 changes: 5 additions & 4 deletions)
@@ -1,6 +1,7 @@
-transformers==4.29.2
-fastapi==0.95.2
-uvicorn==0.22.0
+transformers==4.34.0
+fastapi==0.103.2
+uvicorn==0.23.2
 nltk==3.8.1
-torch==2.0.1
+torch==2.1.0
 sentencepiece==0.1.99
+sentence-transformers==2.2.2

smoke_test.py (1 change: 1 addition & 0 deletions)
@@ -50,6 +50,7 @@ def try_to_vectorize(url):
             # aware of 384 and 768 dim vectors, which should both fall in that
             # range
             self.assertTrue(len(resBody['vector']) > 100)
+            print(f"vector dimensions are: {len(resBody['vector'])}")
 
         try_to_vectorize(self.url + "/vectors/")
         try_to_vectorize(self.url + "/vectors")
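
The added print makes the smoke test log the embedding width, which is handy when switching between 384- and 768-dimensional models. The same check can be run by hand against a local container; a minimal sketch (the payload shape follows the smoke test, while the address and port are assumptions here):

import json
import urllib.request

# POST one text to the vectorizer endpoint and inspect the returned vector.
req = urllib.request.Request(
    "http://localhost:8000/vectors",  # assumed local address
    data=json.dumps({"text": "hello world"}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as res:
    resBody = json.loads(res.read())

print(f"vector dimensions are: {len(resBody['vector'])}")  # e.g. 768 for sentence-t5-base
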
vectorizer.py (43 changes: 36 additions & 7 deletions)
@@ -14,6 +14,7 @@
     DPRContextEncoder,
     DPRQuestionEncoder,
 )
+from sentence_transformers import SentenceTransformer
 
 
 # limit transformer batch size to limit parallel inference, otherwise we run
@@ -29,14 +30,47 @@ class VectorInput(BaseModel):
     text: str
     config: Optional[VectorInputConfig] = None
 
 
 class Vectorizer:
+    executor: ThreadPoolExecutor
+
+    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str, direct_tokenize: bool):
+        self.executor = ThreadPoolExecutor()
+        if model_type == 't5':
+            self.vectorizer = SentenceTransformerVectorizer(model_path, cuda_core)
+        else:
+            self.vectorizer = HuggingFaceVectorizer(model_path, cuda_support, cuda_core, cuda_per_process_memory_fraction, model_type, architecture, direct_tokenize)
+
+    async def vectorize(self, text: str, config: VectorInputConfig):
+        return await asyncio.wrap_future(self.executor.submit(self.vectorizer.vectorize, text, config))
+
+
+class SentenceTransformerVectorizer:
+    model: SentenceTransformer
+    cuda_core: str
+
+    def __init__(self, model_path: str, cuda_core: str):
+        self.cuda_core = cuda_core
+        self.model = SentenceTransformer(model_path, device=self.get_device())
+        self.model.eval()  # make sure we're in inference mode, not training
+
+    def get_device(self) -> Optional[str]:
+        if self.cuda_core is not None and self.cuda_core != "":
+            return self.cuda_core
+        return None
+
+    def vectorize(self, text: str, config: VectorInputConfig):
+        embedding = self.model.encode([text], device=self.get_device(), convert_to_tensor=False, convert_to_numpy=True)
+        return embedding[0]
+
+
+class HuggingFaceVectorizer:
     model: AutoModel
     tokenizer: AutoTokenizer
     cuda: bool
     cuda_core: str
     model_type: str
     direct_tokenize: bool
-    executor: ThreadPoolExecutor
 
     def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str, direct_tokenize: bool):
         self.cuda = cuda_support
@@ -56,8 +90,6 @@ def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per
 
         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
 
-        self.executor = ThreadPoolExecutor()
-
         nltk.data.path.append('./nltk_data')
 
     def tokenize(self, text:str):
@@ -73,7 +105,7 @@ def get_batch_results(self, tokens, text):
     def pool_embedding(self, batch_results, tokens, config):
         return self.model_delegate.pool_embedding(batch_results, tokens, config)
 
-    def _vectorize(self, text: str, config: VectorInputConfig):
+    def vectorize(self, text: str, config: VectorInputConfig):
         with torch.no_grad():
             if self.direct_tokenize:
                 # create embeddings without tokenizing text
@@ -100,9 +132,6 @@ def _vectorize(self, text: str, config: VectorInputConfig):
                 batch_sum_vectors += self.pool_embedding(batch_results, tokens, config)
             return batch_sum_vectors.detach() / num_sentences
 
-    async def vectorize(self, text: str, config: VectorInputConfig):
-        return await asyncio.wrap_future(self.executor.submit(self._vectorize, text, config))
-
 
 class HFModel:
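
The net effect of the vectorizer.py changes is that Vectorizer becomes a thin async facade: it picks SentenceTransformerVectorizer for T5 models and HuggingFaceVectorizer for everything else, and off-loads the synchronous vectorize call to its ThreadPoolExecutor via asyncio.wrap_future so the FastAPI event loop never blocks on inference. A hedged usage sketch (the model path and constructor arguments are placeholders, and config is passed as None since the sentence-transformers path ignores it):

import asyncio

from vectorizer import Vectorizer

async def main():
    # Placeholder arguments: a local T5 snapshot, CPU only.
    vec = Vectorizer(
        "./models/model/sentence-transformers_sentence-t5-base",
        cuda_support=False,
        cuda_core="",
        cuda_per_process_memory_fraction=1.0,
        model_type="t5",
        architecture=None,
        direct_tokenize=False,
    )
    embedding = await vec.vectorize("hello world", None)
    print(len(embedding))  # e.g. 768 for sentence-t5-base

asyncio.run(main())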