From 4e10efbec37592405b95998bbfd9f60f5aec3f24 Mon Sep 17 00:00:00 2001
From: Marshall Wang <marshallxkwang@gmail.com>
Date: Tue, 3 Sep 2024 12:48:57 -0400
Subject: [PATCH 1/2] Add support for custom models, remove load format to
 default to auto for multi node jobs, add Llama3-OpenBioLLM-70B

---
 vec_inf/cli/_cli.py          | 30 ++++++++++++++++++++----------
 vec_inf/models/models.csv    |  3 ++-
 vec_inf/multinode_vllm.slurm |  2 --
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py
index 8dd77c5..9c7bee7 100644
--- a/vec_inf/cli/_cli.py
+++ b/vec_inf/cli/_cli.py
@@ -63,6 +63,11 @@ def cli():
     type=str,
     help='Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS'
 )
+@click.option(
+    "--vocab-size",
+    type=int,
+    help='Vocabulary size, this option is intended for custom models'
+)
 @click.option(
     "--data-type",
     type=str,
@@ -93,6 +98,7 @@ def launch(
     num_gpus: int=None,
     qos: str=None,
     time: str=None,
+    vocab_size: int=None,
     data_type: str=None,
     venv: str=None,
     log_dir: str=None,
@@ -109,16 +115,20 @@ def launch(
 
     models_df = load_models_df()
 
-    if model_name not in models_df['model_name'].values:
-        raise ValueError(f"Model name {model_name} not found in available models")
-
-    default_args = load_default_args(models_df, model_name)
-
-    for arg in default_args:
-        if arg in locals() and locals()[arg] is not None:
-            default_args[arg] = locals()[arg]
-        renamed_arg = arg.replace("_", "-")
-        launch_cmd += f" --{renamed_arg} {default_args[arg]}"    
+    if model_name in models_df['model_name'].values:
+        default_args = load_default_args(models_df, model_name)
+        for arg in default_args:
+            if arg in locals() and locals()[arg] is not None:
+                default_args[arg] = locals()[arg]
+            renamed_arg = arg.replace("_", "-")
+            launch_cmd += f" --{renamed_arg} {default_args[arg]}" 
+    else:
+        model_args = models_df.columns.tolist()
+        excluded_keys = ['model_name', 'pipeline_parallelism']
+        for arg in model_args:
+            if arg not in excluded_keys and locals()[arg] is not None:
+                renamed_arg = arg.replace("_", "-")
+                launch_cmd += f" --{renamed_arg} {locals()[arg]}"  
     
     output = run_bash_command(launch_cmd)
 
diff --git a/vec_inf/models/models.csv b/vec_inf/models/models.csv
index 160c228..bb0b9b6 100644
--- a/vec_inf/models/models.csv
+++ b/vec_inf/models/models.csv
@@ -42,4 +42,5 @@ Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,a40,m2,08:00:00,4,1,32000,
 Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
 Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
 Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,a40,m2,08:00:00,2,1,32064,131072,auto,singularity,default,false
-Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
\ No newline at end of file
+Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
+Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
\ No newline at end of file
diff --git a/vec_inf/multinode_vllm.slurm b/vec_inf/multinode_vllm.slurm
index db8f710..ee36cdb 100644
--- a/vec_inf/multinode_vllm.slurm
+++ b/vec_inf/multinode_vllm.slurm
@@ -93,7 +93,6 @@ if [ "$VENV_BASE" = "singularity" ]; then
     --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
     --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
     --dtype ${VLLM_DATA_TYPE} \
-    --load-format safetensors \
     --trust-remote-code \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
     --max-model-len ${VLLM_MAX_MODEL_LEN}
@@ -107,7 +106,6 @@ else
     --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
     --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
     --dtype ${VLLM_DATA_TYPE} \
-    --load-format safetensors \
     --trust-remote-code \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
     --max-model-len ${VLLM_MAX_MODEL_LEN}

From 9a07db820dcd2ff33f2a7ca7bf75a567faabf626 Mon Sep 17 00:00:00 2001
From: Marshall Wang <marshallxkwang@gmail.com>
Date: Tue, 3 Sep 2024 12:49:45 -0400
Subject: [PATCH 2/2] Bump version to 0.3.2

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 22838b6..6f47cbc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vec-inf"
-version = "0.3.1"
+version = "0.3.2"
 description = "Efficient LLM inference on Slurm clusters using vLLM."
 authors = ["Marshall Wang <marshall.wang@vectorinstitute.ai>"]
 license = "MIT license"