Fixes for gptq image, improve codegen mapping (to gptj) (#64)
Signed-off-by: Hung-Han (Henry) Chen <chenhungh@gmail.com>
chenhunghan authored Aug 26, 2023
1 parent 4f651e3 commit 4d1fc25
Showing 4 changed files with 70 additions and 29 deletions.
8 changes: 7 additions & 1 deletion Dockerfile.gptq
@@ -1,9 +1,15 @@
 # syntax=docker/dockerfile:1
 
-FROM python:3.11-slim
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends g++ python3-dev python3-pip \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get purge -y --auto-remove g++ python3-dev python3-pip
 WORKDIR /app
 COPY requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
+# Fixes exllama/cuda_ext.py:82: UserWarning: Failed to initialize NumPy: No module named 'numpy'
+RUN pip3 install numpy
+# https://github.com/marella/ctransformers#gptq
+RUN pip3 install ctransformers[gptq]
 COPY . .
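The image change can be sanity-checked without downloading a model: numpy must import cleanly before ctransformers loads its GPTQ/CUDA kernels, which is exactly the warning the extra "RUN pip3 install numpy" suppresses. A minimal sketch of such a check (the script name and the idea of running it inside the built image are assumptions, not part of this commit):

# smoke_test.py -- hypothetical check to run inside the built image.
# If the Dockerfile fix took effect, both imports succeed, and no
# "Failed to initialize NumPy" warning appears later when GPTQ kernels load.
import numpy

from ctransformers import AutoModelForCausalLM

print("numpy", numpy.__version__, "and ctransformers import OK")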
4 changes: 2 additions & 2 deletions charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.11.1
+appVersion: 0.11.2
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.11.1
+version: 0.11.2
11 changes: 10 additions & 1 deletion get_model_type.py
@@ -1,11 +1,11 @@
-from request_body import ChatCompletionRequestBody, CompletionRequestBody
+from get_env import get_env
 
 
 def get_model_type(
     filename: str,
 ) -> str:
     ctransformer_model_type = "llama"
     filename = filename.lower()
     # These are also in "starcoder" format
     # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
     # https://huggingface.co/TheBloke/minotaur-15B-GGML
@@ -34,6 +34,15 @@ def get_model_type(
     # matching https://huggingface.co/EleutherAI/pythia-70m
     if "pythia" in filename:
         ctransformer_model_type = "gpt_neox"
+    # the codegen family is in gptj format; codegen2 isn't, and isn't supported by ggml/ctransformers yet
+    # https://huggingface.co/Salesforce/codegen-2B-multi
+    # https://huggingface.co/ravenscroftj/CodeGen-2B-multi-ggml-quant
+    if "codegen" in filename:
+        ctransformer_model_type = "gptj"
+
+    DEFAULT_MODEL_HG_REPO_ID = get_env("DEFAULT_MODEL_HG_REPO_ID", "")
+    if "gptq" in str(DEFAULT_MODEL_HG_REPO_ID).lower() or "gptq" in filename:
+        ctransformer_model_type = "gptq"
 
     MODE_TYPE = get_env("MODE_TYPE", "")
     if len(MODE_TYPE) > 0:
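In effect, codegen checkpoints now load with the gptj loader, and a "gptq" hint in either the filename or the configured repo selects the gptq path. An illustrative sketch of the new mapping (assumes the module is importable, MODE_TYPE is unset, and get_env reads the environment at call time; the filenames and repo id are examples, not part of this commit):

# illustrate_mapping.py -- hypothetical usage of get_model_type.
import os

from get_model_type import get_model_type

# codegen family checkpoints map to the gptj loader:
print(get_model_type("CodeGen-2B-multi-ggml-quant.bin"))  # -> "gptj"

# a "gptq" hint in the repo id (or the filename itself) wins:
os.environ["DEFAULT_MODEL_HG_REPO_ID"] = "TheBloke/Llama-2-7B-GPTQ"
print(get_model_type("gptq_model-4bit-128g.safetensors"))  # -> "gptq"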
76 changes: 51 additions & 25 deletions main.py
@@ -12,8 +12,8 @@
 )
 from fastapi import FastAPI, Depends, HTTPException, Body, Request
 from fastapi.responses import StreamingResponse
-from ctransformers import LLM, Config
-from huggingface_hub import hf_hub_download
+from ctransformers import LLM, AutoModelForCausalLM, Config
+from huggingface_hub import hf_hub_download, snapshot_download
 from get_config import get_config
 from get_model_type import get_model_type

@@ -70,22 +70,37 @@ async def startup_event():
     Starts up the server, setting log level, downloading the default model if necessary.
     """
     log.info("Starting up...")
-    if DEFAULT_MODEL_FILE and DEFAULT_MODEL_HG_REPO_ID:
+    model_type = get_model_type(DEFAULT_MODEL_FILE)
+    if DEFAULT_MODEL_HG_REPO_ID:
         set_downloading_model(True)
-        log.info(
-            "Downloading model... %s/%s to %s/models",
-            DEFAULT_MODEL_HG_REPO_ID,
-            DEFAULT_MODEL_FILE,
-            os.getcwd(),
-        )
+
         try:
-            hf_hub_download(
-                repo_id=DEFAULT_MODEL_HG_REPO_ID,
-                cache_dir="models/.cache",
-                local_dir="models",
-                filename=DEFAULT_MODEL_FILE,
-                resume_download=True,
-            )
+            if model_type == "gptq":
+                log.info(
+                    "Downloading repo %s to %s/models",
+                    DEFAULT_MODEL_HG_REPO_ID,
+                    os.getcwd(),
+                )
+                snapshot_download(
+                    repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                    cache_dir="models/.cache",
+                    local_dir="models",
+                    resume_download=True,
+                )
+            elif DEFAULT_MODEL_FILE:
+                log.info(
+                    "Downloading model... %s/%s to %s/models",
+                    DEFAULT_MODEL_HG_REPO_ID,
+                    DEFAULT_MODEL_FILE,
+                    os.getcwd(),
+                )
+                hf_hub_download(
+                    repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                    cache_dir="models/.cache",
+                    local_dir="models",
+                    filename=DEFAULT_MODEL_FILE,
+                    resume_download=True,
+                )
         except Exception as exception:
             log.error("Error downloading model: %s", exception)
         finally:
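The shape of the new logic: a GPTQ model is a multi-file repository (weights, config, tokenizer), so the whole snapshot is mirrored; a GGML model stays a single-file download. A condensed, standalone sketch of that dispatch (the cache and target directories come from the diff; the function name and signature are illustrative, not the app's actual API):

# Condensed sketch of the download branch above.
from huggingface_hub import hf_hub_download, snapshot_download

def download_default_model(repo_id: str, filename: str, model_type: str) -> None:
    if model_type == "gptq":
        # GPTQ repos ship several files, so mirror the whole repository.
        snapshot_download(
            repo_id=repo_id,
            cache_dir="models/.cache",
            local_dir="models",
            resume_download=True,
        )
    elif filename:
        # GGML models are a single file; fetch only that file.
        hf_hub_download(
            repo_id=repo_id,
            cache_dir="models/.cache",
            local_dir="models",
            filename=filename,
            resume_download=True,
        )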
@@ -103,20 +118,29 @@ async def startup_event():
         context_length=CONTEXT_LENGTH,
         gpu_layers=GPU_LAYERS,
     )
-    model_type = get_model_type(DEFAULT_MODEL_FILE)
 
     log.info(
-        "Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s",
+        "Creating llm singleton with model_type: %s",
         model_type,
-        DEFAULT_MODEL_FILE,
     )
     set_loading_model(True)
-    llm = LLM(
-        model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
-        config=config,
-        model_type=model_type,
-    )
+    if model_type == "gptq":
+        log.debug("Creating llm/gptq instance...")
+        llm = AutoModelForCausalLM.from_pretrained(
+            model_path_or_repo_id=f"{os.getcwd()}/models",
+            model_type="gptq",
+            local_files_only=True,
+        )
+        app.state.llm = llm
+    else:
+        log.debug("Creating llm/ggml instance...")
+        llm = LLM(
+            model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
+            config=config,
+            model_type=model_type,
+        )
+        app.state.llm = llm
     log.info("llm singleton created.")
-    app.state.llm = llm
     set_loading_model(False)

@@ -143,6 +167,7 @@ async def models():
         "object": "list",
     }
 
+
 @app.post("/v1/completions", response_model=CompletionResponseBody)
 async def completions(
     body: Annotated[CompletionRequestBody, Body()],
@@ -182,6 +207,7 @@ async def completions(
     )
     return model_generate(prompt, model_name, llm, config)
 
+
 @app.post("/v1/engines/{engine}/completions")
 async def engine_completions(
     # Can't use body as FastAPI requires the correct content-type header
