Skip to content

Commit

Permalink
Add quantization info
Browse files Browse the repository at this point in the history
  • Loading branch information
antas-marcin committed Dec 1, 2023
1 parent 2436eeb commit f5bbf90
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 6 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ The pre-built models include:
|Bar-Ilan University NLP Lab Models|
|`biu-nlp/abstract-sim-sentence` ([Info](https://huggingface.co/biu-nlp/abstract-sim-sentence))|`semitechnologies/transformers-inference:biu-nlp-abstract-sim-sentence`|
|`biu-nlp/abstract-sim-query` ([Info](https://huggingface.co/biu-nlp/abstract-sim-query))|`semitechnologies/transformers-inference:biu-nlp-abstract-sim-query`|
|ONNX Models|
|`BAAI/bge-small-en` ([Info](https://huggingface.co/BAAI/bge-small-en))|`semitechnologies/transformers-inference:baai-bge-small-en-onnx`|
|`BAAI/bge-small-en-v1.5` ([Info](https://huggingface.co/BAAI/bge-small-en-v1.5))|`semitechnologies/transformers-inference:baai-bge-small-en-v1.5-onnx`|
|`BAAI/bge-base-en` ([Info](https://huggingface.co/BAAI/bge-base-en))|`semitechnologies/transformers-inference:baai-bge-base-en-onnx`|
|`BAAI/bge-base-en-v1.5` ([Info](https://huggingface.co/BAAI/bge-base-en-v1.5))|`semitechnologies/transformers-inference:baai-bge-base-en-v1.5-onnx`|
|`sentence-transformers/all-MiniLM-L6-v2` ([Info](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2))|`semitechnologies/transformers-inference:sentence-transformers-all-MiniLM-L6-v2-onnx`|


The above image names always point to the latest version of the inference
Expand Down
13 changes: 12 additions & 1 deletion app.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,21 @@ def get_onnx_runtime() -> bool:
return onnx_runtime == "true"
return False

def log_info_about_onnx(onnx_runtime: bool):
    """Log which quantized model variant the ONNX vectorizer is running.

    Reads the architecture label that download.py stored in
    ``{model_dir}/onnx_quantization_info``; logs "missing" when that
    file is absent. Does nothing when the ONNX runtime is disabled.
    """
    if not onnx_runtime:
        return
    info_path = f"{model_dir}/onnx_quantization_info"
    onnx_quantization_info = "missing"
    if os.path.exists(info_path):
        with open(info_path, "r") as f:
            onnx_quantization_info = f.read()
    logger.info(f"Running ONNX vectorizer with quantized model for {onnx_quantization_info}")

onnx_runtime = get_onnx_runtime()
log_info_about_onnx(onnx_runtime)

meta_config = Meta(get_model_directory())
vec = Vectorizer(get_model_directory(), cuda_support, cuda_core, cuda_per_process_memory_fraction,
meta_config.get_model_type(), meta_config.get_architecture(),
direct_tokenize, get_onnx_runtime())
direct_tokenize, onnx_runtime)


@app.get("/.well-known/live", response_class=Response)
Expand Down
15 changes: 12 additions & 3 deletions download.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,25 @@ def download_onnx_model(model_name: str, model_dir: str):
# Save model
ort_model.save_pretrained(onnx_path)

def save_to_file(filepath: str, content: str):
    """Write ``content`` to ``filepath``, replacing any existing file."""
    with open(filepath, "w") as out:
        out.write(content)

def save_quantization_info(arch: str):
    """Record the quantization architecture label so app.py can log it at startup."""
    info_path = f"{model_dir}/onnx_quantization_info"
    save_to_file(info_path, arch)

def quantization_config(onnx_cpu_arch: str):
    """Return the dynamic quantization config for the requested CPU
    architecture and persist the chosen variant for startup logging.

    Recognized values (case-insensitive): ``avx512_vnni`` and ``arm64``.
    Anything else falls back to the x86_64 AVX2 configuration.
    """
    # Normalize once instead of calling .lower() per comparison.
    arch = onnx_cpu_arch.lower()
    if arch == "avx512_vnni":
        print("Quantize Model for x86_64 (amd64) (avx512_vnni)")
        save_quantization_info("AVX-512")
        return AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
    if arch == "arm64":
        print("Quantize Model for ARM64")
        save_quantization_info("ARM64")
        return AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
    # default is AMD64 (AVX2)
    print("Quantize Model for x86_64 (amd64) (AVX2)")
    save_quantization_info("amd64 (AVX2)")
    return AutoQuantizationConfig.avx2(is_static=False, per_channel=False)

# Quantize the model / convert to ONNX
Expand All @@ -63,8 +73,7 @@ def quantization_config(onnx_cpu_arch: str):
if os.path.isfile(f"{model_dir}/model.onnx"):
os.remove(f"{model_dir}/model.onnx")
# Save information about ONNX runtime
with open(f"{model_dir}/onnx_runtime", "w") as f:
f.write(onnx_runtime)
save_to_file(f"{model_dir}/onnx_runtime", onnx_runtime)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(onnx_path)

Expand Down
4 changes: 2 additions & 2 deletions vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,12 @@ def __init__(self, model_path) -> None:
onnx_path = Path(model_path)
self.model = ORTModelForFeatureExtraction.from_pretrained(onnx_path, file_name="model_quantized.onnx")
self.tokenizer = AutoTokenizer.from_pretrained(onnx_path)

def mean_pooling(self, model_output, attention_mask):
    """Average the token embeddings, weighted by the attention mask.

    model_output[0] holds the per-token embeddings; positions with
    mask == 0 (padding) are excluded from the mean. The clamp guards
    against division by zero for rows that are entirely padding.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = (embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

def vectorize(self, text: str, config: VectorInputConfig):
encoded_input = self.tokenizer([text], padding=True, truncation=True, return_tensors='pt')
# Compute token embeddings
Expand Down

0 comments on commit f5bbf90

Please sign in to comment.