This repository has been archived by the owner on Mar 30, 2024. It is now read-only.

Add support for downloading a model from a specific revision (#77)
Signed-off-by: Hung-Han (Henry) Chen <chenhungh@gmail.com>
chenhunghan authored Oct 16, 2023
1 parent 13d4334 commit e22bd33
Showing 6 changed files with 138 additions and 25 deletions.
113 changes: 108 additions & 5 deletions .github/workflows/smoke_test.yaml
@@ -1,14 +1,12 @@
name: Smoke Test

on:
pull_request:
branches:
- main
on: pull_request

env:
REGISTRY: quay.io
REPO_ORG_NAME: ialacol
IMAGE_NAME: ialacol-smoke-test
GPTQ_IMAGE_TAG: gptq
HELM_NAMESPACE: default
LOGGING_LEVEL: DEBUG
# for testing llama base models
@@ -26,6 +24,12 @@ env:
STARCODER_MODEL_HG_REPO_ID: mike-ravkine/tiny_starcoder_py-GGML
STARCODER_MODEL_FILE: tiny_starcoder_py-q8_0.bin
STARCODER_SVC_PORT: 8002
# for testing gptq models
GPTQ_HELM_RELEASE_NAME: stablecode-instruct-alpha-3b-gptq
GPTQ_MODEL_HG_REPO_ID: TheBloke/stablecode-instruct-alpha-3b-GPTQ
GPTQ_MODEL_HG_REVISION: gptq-4bit-32g-actorder_True
GPTQ_MODEL_FILE: model.safetensors
GPTQ_SVC_PORT: 8003

jobs:
build-image:
@@ -36,7 +40,7 @@ jobs:
with:
fetch-depth: 0

- name: Login to Github Container Registry
- name: Login to Registry
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
@@ -51,6 +55,29 @@ jobs:
push: true
tags: |
${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
build-gptq-cuda12-image:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Login to Registry
uses: docker/login-action@v2
with:
registry: ${{ env.REGISTRY }}
username: ${{ secrets.QUAY_ROBOT_USERNAME }}
password: ${{ secrets.QUAY_ROBOT_PASSWORD }}

- name: Build and push Docker image
uses: docker/build-push-action@v4
with:
context: .
file: ./Dockerfile.cuda12
push: true
tags: |
${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}
llama-smoke-test:
runs-on: ubuntu-latest
needs: build-image
@@ -274,3 +301,79 @@ jobs:
- if: always()
run: |
kubectl logs --tail=200 --selector app.kubernetes.io/name=$STARCODER_HELM_RELEASE_NAME -n $HELM_NAMESPACE
gptq-smoke-test:
runs-on: ubuntu-latest
needs: build-gptq-cuda12-image
steps:
- name: Create k8s Kind Cluster
uses: helm/kind-action@v1.7.0

- name: Set up Helm
uses: azure/setup-helm@v3
with:
version: v3.12.0

- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Install ialacol with a GPTQ model from a specific revision and wait for pods to be ready
run: |
cat > values.yaml <<EOF
replicas: 1
deployment:
image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}
env:
DEFAULT_MODEL_HG_REPO_ID: $GPTQ_MODEL_HG_REPO_ID
DEFAULT_MODEL_HG_REPO_REVISION: $GPTQ_MODEL_HG_REVISION
DEFAULT_MODEL_FILE: $GPTQ_MODEL_FILE
MODEL_TYPE: "gptq"
LOGGING_LEVEL: $LOGGING_LEVEL
resources:
{}
model:
persistence:
size: 3Gi
accessModes:
- ReadWriteOnce
service:
type: ClusterIP
port: $GPTQ_SVC_PORT
annotations: {}
nodeSelector: {}
tolerations: []
affinity: {}
EOF
helm install $GPTQ_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
sleep 40
- if: always()
run: |
kubectl get pods -n $HELM_NAMESPACE
- if: always()
run: |
kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE
- name: Port forward to the GPTQ model service
run: |
kubectl port-forward svc/$GPTQ_HELM_RELEASE_NAME $GPTQ_SVC_PORT:$GPTQ_SVC_PORT &
echo "Wait for port-forward to be ready"
sleep 5
- name: Check the GET /v1/models endpoint
run: |
curl http://localhost:$GPTQ_SVC_PORT/v1/models
- uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install OpenAI CLI
run: |
pip install --upgrade openai --quiet
# We can only test if download works and if GET /models returns something on CPU CI workers
- name: Test the OpenAI CLI with default parameters
run: |
openai -k "sk-fake" -b http://localhost:$GPTQ_SVC_PORT/v1 api models.list
- if: always()
run: |
kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE
37 changes: 19 additions & 18 deletions README.md
@@ -85,24 +85,25 @@ openai -k "sk-fake" \

All configuration is done via environment variables.

| Parameter | Description | Default | Example |
| :----------------------------| :------------------------------------------------------------------- | :------ | :--------------------------------------------------------------------------- |
| `DEFAULT_MODEL_HG_REPO_ID` | The Hugging Face repo id to download the model | `None` | `TheBloke/orca_mini_3B-GGML` |
| `DEFAULT_MODEL_FILE` | The file name to download from the repo, optional for GPTQ models | `None` | `orca-mini-3b.ggmlv3.q4_0.bin` |
| `MODE_TYPE` | Model type to override the auto model type detection | `None` | `gptq`, `gpt_bigcode`, `llama`, `mpt`, `replit`, `falcon`, `gpt_neox` `gptj` |
| `LOGGING_LEVEL` | Logging level | `INFO` | `DEBUG` |
| `TOP_K` | top-k for sampling. | `40 ` | Integers |
| `TOP_P` | top-p for sampling. | `1.0` | Floats |
| `REPETITION_PENALTY` | rp for sampling. | `1.1` | Floats |
| `LAST_N_TOKENS` | The last n tokens for repetition penalty. | `1.1` | Integers |
| `SEED` | The seed for sampling. | `-1` | Integers |
| `BATCH_SIZE` | The batch size for evaluating tokens, only for GGUF/GGML models | `8` | Integers |
| `THREADS` | Thread number override auto detect by CPU/2, set `1` for GPTQ models | `Auto` | Integers |
| `MAX_TOKENS` | The max number of token to generate | `512` | Integers |
| `STOP` | The token to stop the generation | `None` | `<|endoftext>` |
| `CONTEXT_LENGTH` | Override the auto detect context length | `512` | Integers |
| `GPU_LAYERS` | The number of layers to off load to GPU | `0` | Integers |
| `TRUNCATE_PROMPT_LENGTH` | Truncate the prompt if set | `0` | Integers |
| Parameter                          | Description                                                            | Default | Example                                                                        |
| :--------------------------------- | :--------------------------------------------------------------------- | :------ | :----------------------------------------------------------------------------- |
| `DEFAULT_MODEL_HG_REPO_ID`          | The Hugging Face repo id to download the model                          | `None`  | `TheBloke/orca_mini_3B-GGML`                                                    |
| `DEFAULT_MODEL_HG_REPO_REVISION`    | The Hugging Face repo revision                                          | `main`  | `gptq-4bit-32g-actorder_True`                                                   |
| `DEFAULT_MODEL_FILE`                | The file name to download from the repo, optional for GPTQ models       | `None`  | `orca-mini-3b.ggmlv3.q4_0.bin`                                                  |
| `MODE_TYPE`                         | Model type to override the auto model type detection                    | `None`  | `gptq`, `gpt_bigcode`, `llama`, `mpt`, `replit`, `falcon`, `gpt_neox`, `gptj`   |
| `LOGGING_LEVEL`                     | Logging level                                                           | `INFO`  | `DEBUG`                                                                         |
| `TOP_K`                             | top-k for sampling                                                      | `40`    | Integers                                                                        |
| `TOP_P`                             | top-p for sampling                                                      | `1.0`   | Floats                                                                          |
| `REPETITION_PENALTY`                | Repetition penalty for sampling                                         | `1.1`   | Floats                                                                          |
| `LAST_N_TOKENS`                     | The last n tokens for repetition penalty                                | `1.1`   | Integers                                                                        |
| `SEED`                              | The seed for sampling                                                   | `-1`    | Integers                                                                        |
| `BATCH_SIZE`                        | The batch size for evaluating tokens, only for GGUF/GGML models         | `8`     | Integers                                                                        |
| `THREADS`                           | Thread count, overrides auto-detect (CPU/2); set `1` for GPTQ models    | `Auto`  | Integers                                                                        |
| `MAX_TOKENS`                        | The max number of tokens to generate                                    | `512`   | Integers                                                                        |
| `STOP`                              | The token to stop the generation                                        | `None`  | `<|endoftext|>`                                                                 |
| `CONTEXT_LENGTH`                    | Override the auto-detected context length                               | `512`   | Integers                                                                        |
| `GPU_LAYERS`                        | The number of layers to offload to GPU                                  | `0`     | Integers                                                                        |
| `TRUNCATE_PROMPT_LENGTH`            | Truncate the prompt if set                                              | `0`     | Integers                                                                        |

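For illustration, here is a minimal sketch of how such environment-variable configuration could be read and type-coerced on the application side; the helper name and coercions mirror the table's defaults but are illustrative assumptions, not ialacol's actual code:

```python
# Illustrative sketch only: read configuration from environment variables
# using the defaults documented in the table above. Names and coercions are
# assumptions for illustration, not ialacol's actual implementation.
import os

def env(name: str, default: str) -> str:
    return os.environ.get(name, default)

DEFAULT_MODEL_HG_REPO_ID = env("DEFAULT_MODEL_HG_REPO_ID", "")
DEFAULT_MODEL_HG_REPO_REVISION = env("DEFAULT_MODEL_HG_REPO_REVISION", "main")
DEFAULT_MODEL_FILE = env("DEFAULT_MODEL_FILE", "")
TOP_K = int(env("TOP_K", "40"))
TOP_P = float(env("TOP_P", "1.0"))
REPETITION_PENALTY = float(env("REPETITION_PENALTY", "1.1"))
MAX_TOKENS = int(env("MAX_TOKENS", "512"))
CONTEXT_LENGTH = int(env("CONTEXT_LENGTH", "512"))
GPU_LAYERS = int(env("GPU_LAYERS", "0"))
```
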
Sampling parameters including `TOP_K`, `TOP_P`, `REPETITION_PENALTY`, `LAST_N_TOKENS`, `SEED`, `MAX_TOKENS`, `STOP` can be overridden per request via the request body, for example:

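A hedged sketch of such a request, assuming the OpenAI-compatible `/v1/chat/completions` endpoint on a local port-forward; the lower-cased body field names such as `top_k` are illustrative assumptions:

```python
# Hedged sketch: override sampling parameters per request via the JSON body.
# The endpoint path follows the OpenAI-compatible API; the lower-cased field
# names (top_k, top_p, max_tokens, stop) are assumptions for illustration.
import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
        "messages": [{"role": "user", "content": "Say hello"}],
        "top_k": 20,
        "top_p": 0.9,
        "max_tokens": 64,
        "stop": "<|endoftext|>",
    },
    timeout=60,
)
print(response.json())
```
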
4 changes: 2 additions & 2 deletions charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
apiVersion: v2
appVersion: 0.11.5
appVersion: 0.12.0
description: A Helm chart for ialacol
name: ialacol
type: application
version: 0.11.5
version: 0.12.0
2 changes: 2 additions & 0 deletions charts/ialacol/templates/deployment.yaml
@@ -27,6 +27,8 @@ spec:
env:
- name: DEFAULT_MODEL_HG_REPO_ID
value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_ID | quote }}
- name: DEFAULT_MODEL_HG_REPO_REVISION
value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_REVISION | quote }}
- name: DEFAULT_MODEL_FILE
value: {{ (.Values.deployment.env).DEFAULT_MODEL_FILE | quote }}
- name: MODE_TYPE
1 change: 1 addition & 0 deletions charts/ialacol/values.yaml
@@ -5,6 +5,7 @@ deployment:
# or use CUDA image `ghcr.io/chenhunghan/ialacol-cuda12:latest`
# env:
# DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
# DEFAULT_MODEL_HG_REPO_REVISION: main
# DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
# LOGGING_LEVEL: DEBUG
resources:
6 changes: 6 additions & 0 deletions main.py
@@ -31,9 +31,13 @@
DEFAULT_MODEL_HG_REPO_ID = get_env(
"DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
)
DEFAULT_MODEL_HG_REPO_REVISION = get_env(
"DEFAULT_MODEL_HG_REPO_REVISION", "main"
)
DEFAULT_MODEL_FILE = get_env("DEFAULT_MODEL_FILE", "llama-2-7b-chat.ggmlv3.q4_0.bin")

log.info("DEFAULT_MODEL_HG_REPO_ID: %s", DEFAULT_MODEL_HG_REPO_ID)
log.info("DEFAULT_MODEL_HG_REPO_REVISION: %s", DEFAULT_MODEL_HG_REPO_REVISION)
log.info("DEFAULT_MODEL_FILE: %s", DEFAULT_MODEL_FILE)

DOWNLOADING_MODEL = False
@@ -93,6 +97,7 @@ async def startup_event():
)
snapshot_download(
repo_id=DEFAULT_MODEL_HG_REPO_ID,
revision=DEFAULT_MODEL_HG_REPO_REVISION,
cache_dir="models/.cache",
local_dir="models",
resume_download=True,
@@ -106,6 +111,7 @@ async def startup_event():
)
hf_hub_download(
repo_id=DEFAULT_MODEL_HG_REPO_ID,
revision=DEFAULT_MODEL_HG_REPO_REVISION,
cache_dir="models/.cache",
local_dir="models",
filename=DEFAULT_MODEL_FILE,
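For context, `revision` in huggingface_hub accepts a branch name, tag, or commit SHA. Below is a minimal standalone sketch of the two download calls above, using the repo id and revision that the smoke test pins; the cache/output paths and file name are illustrative:

```python
# Minimal sketch of pinning Hugging Face downloads to a revision with
# huggingface_hub, mirroring the two call sites in main.py. The repo id and
# revision match the smoke-test values; paths and file name are illustrative.
from huggingface_hub import hf_hub_download, snapshot_download

REPO_ID = "TheBloke/stablecode-instruct-alpha-3b-GPTQ"
REVISION = "gptq-4bit-32g-actorder_True"  # branch name, tag, or commit SHA

# Fetch the whole repository snapshot at that revision.
snapshot_download(
    repo_id=REPO_ID,
    revision=REVISION,
    cache_dir="models/.cache",
    local_dir="models",
)

# Or download a single file from that revision.
path = hf_hub_download(
    repo_id=REPO_ID,
    revision=REVISION,
    filename="model.safetensors",
    cache_dir="models/.cache",
    local_dir="models",
)
print(path)
```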
