From f5bbf90e4b70b745514c6c1cadc2d30f797b5370 Mon Sep 17 00:00:00 2001
From: Marcin Antas
Date: Fri, 1 Dec 2023 08:18:01 +0100
Subject: [PATCH] Add quantization info

---
 README.md     |  6 ++++++
 app.py        | 13 ++++++++++++-
 download.py   | 15 ++++++++++++---
 vectorizer.py |  4 ++--
 4 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 633a7bc..a3b24ad 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,12 @@ The pre-built models include:
 |Bar-Ilan University NLP Lab Models|
 |`biu-nlp/abstract-sim-sentence` ([Info](https://huggingface.co/biu-nlp/abstract-sim-sentence))|`semitechnologies/transformers-inference:biu-nlp-abstract-sim-sentence`|
 |`biu-nlp/abstract-sim-query` ([Info](https://huggingface.co/biu-nlp/abstract-sim-query))|`semitechnologies/transformers-inference:biu-nlp-abstract-sim-query`|
+|ONNX Models|
+|`BAAI/bge-small-en` ([Info](https://huggingface.co/BAAI/bge-small-en))|`semitechnologies/transformers-inference:baai-bge-small-en-onnx`|
+|`BAAI/bge-small-en-v1.5` ([Info](https://huggingface.co/BAAI/bge-small-en-v1.5))|`semitechnologies/transformers-inference:baai-bge-small-en-v1.5-onnx`|
+|`BAAI/bge-base-en` ([Info](https://huggingface.co/BAAI/bge-base-en))|`semitechnologies/transformers-inference:baai-bge-base-en-onnx`|
+|`BAAI/bge-base-en-v1.5` ([Info](https://huggingface.co/BAAI/bge-base-en-v1.5))|`semitechnologies/transformers-inference:baai-bge-base-en-v1.5-onnx`|
+|`sentence-transformers/all-MiniLM-L6-v2` ([Info](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2))|`semitechnologies/transformers-inference:sentence-transformers-all-MiniLM-L6-v2-onnx`|
 
 The above image names always point to the latest version of the inference
 
diff --git a/app.py b/app.py
index 1fb3ab6..b8c20c3 100644
--- a/app.py
+++ b/app.py
@@ -58,10 +58,21 @@ def get_onnx_runtime() -> bool:
         return onnx_runtime == "true"
     return False
 
+def log_info_about_onnx(onnx_runtime: bool):
+    if onnx_runtime:
+        onnx_quantization_info = "missing"
+        if os.path.exists(f"{model_dir}/onnx_quantization_info"):
+            with open(f"{model_dir}/onnx_quantization_info", "r") as f:
+                onnx_quantization_info = f.read()
+        logger.info(f"Running ONNX vectorizer with quantized model for {onnx_quantization_info}")
+
+onnx_runtime = get_onnx_runtime()
+log_info_about_onnx(onnx_runtime)
+
 meta_config = Meta(get_model_directory())
 vec = Vectorizer(get_model_directory(), cuda_support, cuda_core, cuda_per_process_memory_fraction,
                  meta_config.get_model_type(), meta_config.get_architecture(),
-                 direct_tokenize, get_onnx_runtime())
+                 direct_tokenize, onnx_runtime)
 
 
 @app.get("/.well-known/live", response_class=Response)
diff --git a/download.py b/download.py
index 791fb09..0b54bf5 100755
--- a/download.py
+++ b/download.py
@@ -43,15 +43,25 @@ def download_onnx_model(model_name: str, model_dir: str):
         # Save model
         ort_model.save_pretrained(onnx_path)
 
+    def save_to_file(filepath: str, content: str):
+        with open(filepath, "w") as f:
+            f.write(content)
+
+    def save_quantization_info(arch: str):
+        save_to_file(f"{model_dir}/onnx_quantization_info", arch)
+
     def quantization_config(onnx_cpu_arch: str):
         if onnx_cpu_arch.lower() == "avx512_vnni":
             print("Quantize Model for x86_64 (amd64) (avx512_vnni)")
+            save_quantization_info("AVX-512")
             return AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
         if onnx_cpu_arch.lower() == "arm64":
             print(f"Quantize Model for ARM64")
+            save_quantization_info("ARM64")
             return AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
-        # default is AMD64
+        # default is AMD64 (AVX2)
         print(f"Quantize Model for x86_64 (amd64) (AVX2)")
+        save_quantization_info("amd64 (AVX2)")
         return AutoQuantizationConfig.avx2(is_static=False, per_channel=False)
 
     # Quantize the model / convert to ONNX
@@ -63,8 +73,7 @@ def quantization_config(onnx_cpu_arch: str):
     if os.path.isfile(f"{model_dir}/model.onnx"):
         os.remove(f"{model_dir}/model.onnx")
     # Save information about ONNX runtime
-    with open(f"{model_dir}/onnx_runtime", "w") as f:
-        f.write(onnx_runtime)
+    save_to_file(f"{model_dir}/onnx_runtime", onnx_runtime)
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     tokenizer.save_pretrained(onnx_path)
diff --git a/vectorizer.py b/vectorizer.py
index 4385dd8..8d13f34 100644
--- a/vectorizer.py
+++ b/vectorizer.py
@@ -78,12 +78,12 @@ def __init__(self, model_path) -> None:
         onnx_path = Path(model_path)
         self.model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model_quantized.onnx")
         self.tokenizer = AutoTokenizer.from_pretrained(onnx_path)
-        
+
     def mean_pooling(self, model_output, attention_mask):
         token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-        
+
     def vectorize(self, text: str, config: VectorInputConfig):
         encoded_input = self.tokenizer([text], padding=True, truncation=True, return_tensors='pt')
         # Compute token embeddings
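
For reference, a minimal standalone sketch of loading and querying the quantized model that download.py writes, using the same calls as the vectorizer.py hunk above. The ./models path and the sample sentence are assumptions for illustration; the "model_quantized.onnx" file name and the mean-pooling math come straight from the diff.

from pathlib import Path

import torch
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

# Assumption: download.py has already saved the quantized model and tokenizer here.
onnx_path = Path("./models")

model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model_quantized.onnx")
tokenizer = AutoTokenizer.from_pretrained(onnx_path)

encoded_input = tokenizer(["example sentence"], padding=True, truncation=True, return_tensors="pt")
model_output = model(**encoded_input)

# Mean pooling over token embeddings, masked by attention, as in vectorizer.py
token_embeddings = model_output[0]
input_mask_expanded = encoded_input["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
vector = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
print(vector.shape)  # (1, hidden_size)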