Fixes for gptq image, improve codegen mapping (to gptj) (#64)
Signed-off-by: Hung-Han (Henry) Chen <chenhungh@gmail.com>
chenhunghan authored Aug 26, 2023
1 parent 4f651e3 commit 4d1fc25
Showing 4 changed files with 70 additions and 29 deletions.
8 changes: 7 additions & 1 deletion Dockerfile.gptq
@@ -1,9 +1,15 @@
 # syntax=docker/dockerfile:1
 
-FROM python:3.11-slim
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends g++ python3-dev python3-pip \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get purge -y --auto-remove g++ python3-dev python3-pip
 WORKDIR /app
 COPY requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
+# Fixes exllama/cuda_ext.py:82: UserWarning: Failed to initialize NumPy: No module named 'numpy'
+RUN pip3 install numpy
+# https://github.com/marella/ctransformers#gptq
+RUN pip3 install ctransformers[gptq]
 COPY . .
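The image change can be sanity-checked without downloading a model: numpy must import cleanly before ctransformers loads its GPTQ/CUDA kernels, which is exactly the warning the extra "RUN pip3 install numpy" suppresses. A minimal sketch of such a check (the script name and the idea of running it inside the built image are assumptions, not part of this commit):

# smoke_test.py -- hypothetical check to run inside the built image.
# If the Dockerfile fix took effect, both imports succeed, and no
# "Failed to initialize NumPy" warning appears later when GPTQ kernels load.
import numpy

from ctransformers import AutoModelForCausalLM

print("numpy", numpy.__version__, "and ctransformers import OK")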
4 changes: 2 additions & 2 deletions charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.11.1
+appVersion: 0.11.2
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.11.1
+version: 0.11.2
11 changes: 10 additions & 1 deletion get_model_type.py
@@ -1,11 +1,11 @@
-from request_body import ChatCompletionRequestBody, CompletionRequestBody
+from get_env import get_env
 
 
 def get_model_type(
     filename: str,
 ) -> str:
     ctransformer_model_type = "llama"
     filename = filename.lower()
     # These are also in "starcoder" format
     # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
     # https://huggingface.co/TheBloke/minotaur-15B-GGML
@@ -34,6 +34,15 @@ def get_model_type(
     # matching https://huggingface.co/EleutherAI/pythia-70m
     if "pythia" in filename:
         ctransformer_model_type = "gpt_neox"
+    # the codegen family is in gptj format; codegen2 isn't, and isn't supported by ggml/ctransformers yet
+    # https://huggingface.co/Salesforce/codegen-2B-multi
+    # https://huggingface.co/ravenscroftj/CodeGen-2B-multi-ggml-quant
+    if "codegen" in filename:
+        ctransformer_model_type = "gptj"
+
+    DEFAULT_MODEL_HG_REPO_ID = get_env("DEFAULT_MODEL_HG_REPO_ID", "")
+    if "gptq" in str(DEFAULT_MODEL_HG_REPO_ID).lower() or "gptq" in filename:
+        ctransformer_model_type = "gptq"
 
     MODE_TYPE = get_env("MODE_TYPE", "")
     if len(MODE_TYPE) > 0:
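In effect, codegen checkpoints now load with the gptj loader, and a "gptq" hint in either the filename or the configured repo selects the gptq path. An illustrative sketch of the new mapping (assumes the module is importable, MODE_TYPE is unset, and get_env reads the environment at call time; the filenames and repo id are examples, not part of this commit):

# illustrate_mapping.py -- hypothetical usage of get_model_type.
import os

from get_model_type import get_model_type

# codegen family checkpoints map to the gptj loader:
print(get_model_type("CodeGen-2B-multi-ggml-quant.bin"))  # -> "gptj"

# a "gptq" hint in the repo id (or the filename itself) wins:
os.environ["DEFAULT_MODEL_HG_REPO_ID"] = "TheBloke/Llama-2-7B-GPTQ"
print(get_model_type("gptq_model-4bit-128g.safetensors"))  # -> "gptq"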
76 changes: 51 additions & 25 deletions main.py
@@ -12,8 +12,8 @@
 )
 from fastapi import FastAPI, Depends, HTTPException, Body, Request
 from fastapi.responses import StreamingResponse
-from ctransformers import LLM, Config
-from huggingface_hub import hf_hub_download
+from ctransformers import LLM, AutoModelForCausalLM, Config
+from huggingface_hub import hf_hub_download, snapshot_download
 from get_config import get_config
 from get_model_type import get_model_type

@@ -70,22 +70,37 @@ async def startup_event():
     Starts up the server, setting log level, downloading the default model if necessary.
     """
     log.info("Starting up...")
-    if DEFAULT_MODEL_FILE and DEFAULT_MODEL_HG_REPO_ID:
+    model_type = get_model_type(DEFAULT_MODEL_FILE)
+    if DEFAULT_MODEL_HG_REPO_ID:
         set_downloading_model(True)
-        log.info(
-            "Downloading model... %s/%s to %s/models",
-            DEFAULT_MODEL_HG_REPO_ID,
-            DEFAULT_MODEL_FILE,
-            os.getcwd(),
-        )
+
         try:
-            hf_hub_download(
-                repo_id=DEFAULT_MODEL_HG_REPO_ID,
-                cache_dir="models/.cache",
-                local_dir="models",
-                filename=DEFAULT_MODEL_FILE,
-                resume_download=True,
-            )
+            if model_type == "gptq":
+                log.info(
+                    "Downloading repo %s to %s/models",
+                    DEFAULT_MODEL_HG_REPO_ID,
+                    os.getcwd(),
+                )
+                snapshot_download(
+                    repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                    cache_dir="models/.cache",
+                    local_dir="models",
+                    resume_download=True,
+                )
+            elif DEFAULT_MODEL_FILE:
+                log.info(
+                    "Downloading model... %s/%s to %s/models",
+                    DEFAULT_MODEL_HG_REPO_ID,
+                    DEFAULT_MODEL_FILE,
+                    os.getcwd(),
+                )
+                hf_hub_download(
+                    repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                    cache_dir="models/.cache",
+                    local_dir="models",
+                    filename=DEFAULT_MODEL_FILE,
+                    resume_download=True,
+                )
         except Exception as exception:
             log.error("Error downloading model: %s", exception)
         finally:
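The shape of the new logic: a GPTQ model is a multi-file repository (weights, config, tokenizer), so the whole snapshot is mirrored; a GGML model stays a single-file download. A condensed, standalone sketch of that dispatch (the cache and target directories come from the diff; the function name and signature are illustrative, not the app's actual API):

# Condensed sketch of the download branch above.
from huggingface_hub import hf_hub_download, snapshot_download

def download_default_model(repo_id: str, filename: str, model_type: str) -> None:
    if model_type == "gptq":
        # GPTQ repos ship several files, so mirror the whole repository.
        snapshot_download(
            repo_id=repo_id,
            cache_dir="models/.cache",
            local_dir="models",
            resume_download=True,
        )
    elif filename:
        # GGML models are a single file; fetch only that file.
        hf_hub_download(
            repo_id=repo_id,
            cache_dir="models/.cache",
            local_dir="models",
            filename=filename,
            resume_download=True,
        )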
@@ -103,20 +118,29 @@ async def startup_event():
         context_length=CONTEXT_LENGTH,
         gpu_layers=GPU_LAYERS,
     )
-    model_type = get_model_type(DEFAULT_MODEL_FILE)
 
     log.info(
-        "Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s",
+        "Creating llm singleton with model_type: %s",
         model_type,
-        DEFAULT_MODEL_FILE,
     )
     set_loading_model(True)
-    llm = LLM(
-        model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
-        config=config,
-        model_type=model_type,
-    )
+    if model_type == "gptq":
+        log.debug("Creating llm/gptq instance...")
+        llm = AutoModelForCausalLM.from_pretrained(
+            model_path_or_repo_id=f"{os.getcwd()}/models",
+            model_type="gptq",
+            local_files_only=True,
+        )
+        app.state.llm = llm
+    else:
+        log.debug("Creating llm/ggml instance...")
+        llm = LLM(
+            model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
+            config=config,
+            model_type=model_type,
+        )
+        app.state.llm = llm
     log.info("llm singleton created.")
-    app.state.llm = llm
     set_loading_model(False)

@@ -143,6 +167,7 @@ async def models():
         "object": "list",
     }
 
+
 @app.post("/v1/completions", response_model=CompletionResponseBody)
 async def completions(
     body: Annotated[CompletionRequestBody, Body()],
@@ -182,6 +207,7 @@ async def completions(
     )
     return model_generate(prompt, model_name, llm, config)
 
+
 @app.post("/v1/engines/{engine}/completions")
 async def engine_completions(
     # Can't use body as FastAPI requires the correct content-type header
