Release 0.1.0 (#12)
* fix gh actions

* remove arm64 from gpu build

* cleanup

* enable cache for docker build on gha

* refine docker build cache ref

* refine gpu dependencies

* quiet conda installations

* free disk space on gha

* fix free space command on gha

* fix free space command on gha
1b5d authored Jul 23, 2023
1 parent 4e43983 commit 928ec4a
Showing 4 changed files with 45 additions and 36 deletions.
19 changes: 16 additions & 3 deletions .github/workflows/publish-release.yml
@@ -3,6 +3,10 @@ name: Publish Docker image
 on:
   release:
     types: [published]
+  pull_request:
+    branches:
+      - main
+      - 'release/**'
 
 jobs:
   push_to_dockerhub:
@@ -37,10 +41,18 @@ jobs:
           platforms: linux/amd64,linux/arm64
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=registry,ref=1b5d/llm-api:latest
+          cache-to: type=inline
 
   push_gpu_to_dockerhub:
     runs-on: ubuntu-latest
     steps:
+      - name: Free disk space
+        run: |
+          sudo apt-get autoremove -y >/dev/null 2>&1
+          sudo apt-get autoclean -y >/dev/null 2>&1
+          docker rmi $(docker image ls -aq) >/dev/null 2>&1
      - name: Checkout
        uses: actions/checkout@v3
 
@@ -62,15 +74,16 @@ jobs:
         with:
           images: 1b5d/llm-api
           flavor: |
-            suffix=-gpu
+            latest=false
+            suffix=-gpu,onlatest=true
      - name: Build and push
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.gpu
          push: ${{ github.event_name != 'pull_request' }}
-          platforms: linux/amd64,linux/arm64
+          platforms: linux/amd64
          tags: ${{ steps.meta-gpu.outputs.tags }}
          labels: ${{ steps.meta-gpu.outputs.labels }}
+          cache-from: type=registry,ref=1b5d/llm-api:latest-gpu
+          cache-to: type=inline
 
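The `cache-from`/`cache-to` pair added to both jobs uses buildx's registry-backed inline cache: the previously pushed image itself carries the layer-cache metadata, so a fresh GHA runner can skip unchanged layers. A minimal sketch of the same mechanism invoked by hand, assuming the same image name as above:

```
# Inline cache sketch: read layer-cache metadata from the last pushed image
# and embed fresh cache metadata into the new one (type=inline).
docker buildx build \
  --cache-from type=registry,ref=1b5d/llm-api:latest \
  --cache-to type=inline \
  --platform linux/amd64,linux/arm64 \
  --tag 1b5d/llm-api:latest \
  --push .
```
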
33 changes: 19 additions & 14 deletions Dockerfile.gpu
@@ -3,7 +3,7 @@ FROM debian:bullseye-slim as pytorch-install
 ARG PYTORCH_VERSION=2.0.0
 ARG PYTHON_VERSION=3.9
 ARG CUDA_VERSION=11.7.1
-ARG MAMBA_VERSION=23.1.0-1
+ARG MAMBA_VERSION=23.1.0-4
 ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=pytorch
 # Automatically set by buildx
@@ -25,19 +25,19 @@ RUN case ${TARGETPLATFORM} in \
     "linux/arm64") MAMBA_ARCH=aarch64 ;; \
     *) MAMBA_ARCH=x86_64 ;; \
     esac && \
-    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
-RUN chmod +x ~/mambaforge.sh && \
-    bash ~/mambaforge.sh -b -p /opt/conda && \
-    rm ~/mambaforge.sh
+    curl -fsSL -v -o ~/miniforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Miniforge3-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/miniforge.sh && \
+    bash ~/miniforge.sh -b -p /opt/conda && \
+    rm ~/miniforge.sh
 
 # Install pytorch
 # On arm64 we exit with an error code
 RUN case ${TARGETPLATFORM} in \
     "linux/arm64") exit 1 ;; \
-    *) /opt/conda/bin/conda update -y conda && \
-    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION torchvision torchaudio "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+    *) /opt/conda/bin/conda update -qy conda && \
+    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -yq "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION torchvision torchaudio "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
     esac && \
-    /opt/conda/bin/conda clean -ya
+    /opt/conda/bin/conda clean -yqa
 
 # CUDA kernels builder image
 FROM pytorch-install as kernel-builder
@@ -46,9 +46,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     ninja-build \
     && rm -rf /var/lib/apt/lists/*
 
-RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.7.1" cuda==11.7.1 && \
-    /opt/conda/bin/conda clean -ya
-
+RUN /opt/conda/bin/conda install -q -c "nvidia/label/cuda-11.7.1" cuda==11.7.1 && \
+    /opt/conda/bin/conda clean -yqa
 
 FROM debian:bullseye-slim as base
 
@@ -60,14 +59,20 @@ LABEL com.nvidia.volumes.needed="nvidia_driver"
 # Copy conda with PyTorch installed
 COPY --from=kernel-builder /opt/conda /opt/conda
 
-RUN apt-get update && apt-get install -y build-essential git zlib1g-dev
+RUN apt-get update && \
+    apt-get install -y build-essential git zlib1g-dev cmake && \
+    apt-get autoremove && \
+    apt-get clean
 
 WORKDIR /llm-api
 
 COPY ./requirements.txt /llm-api/requirements.txt
 RUN pip3 install --no-cache-dir --upgrade -r requirements.txt && \
-    pip3 install --no-cache-dir accelerate==0.20.3 && \
-    pip3 install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python
+    pip3 install --no-cache-dir accelerate==0.20.3 packaging==23.0 ninja==1.11.1 && \
+    pip3 install --no-cache-dir --no-build-isolation flash-attn==v1.0.9 && \
+    pip3 install --no-cache-dir triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python && rm -fr $HOME/.triton && \
+    pip3 cache purge && \
+    /opt/conda/bin/conda clean -ya
 
 COPY ./app /llm-api/app
 ENV PYTHONPATH "/llm-api"
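A note on the reworked pip block: `flash-attn` compiles CUDA extensions at install time against the PyTorch already present in the image, which is why `packaging` and `ninja` are installed first and `--no-build-isolation` is passed — otherwise pip would build in an isolated environment where torch is not importable. A standalone sketch of that ordering, mirroring the Dockerfile:

```
# flash-attn builds CUDA kernels during installation, so torch and its build
# deps must already be importable; --no-build-isolation makes pip reuse the
# current environment instead of an isolated build venv.
pip3 install --no-cache-dir packaging==23.0 ninja==1.11.1
pip3 install --no-cache-dir --no-build-isolation flash-attn==v1.0.9
```
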
4 changes: 3 additions & 1 deletion README.md
@@ -134,6 +134,8 @@ To be able to accelerate inference using GPU, the `1b5d/llm-api:x.x.x-gpu` image
 docker compose -f docker-compose.gpu.yaml up
 ```
 
+Note: currently only the `linux/amd64` architecture is supported for GPU images
+
 ## Llama on CPU - using llama.cpp
 
 You can configure the model usage in a local `config.yaml` file; here is an example:
@@ -266,7 +268,7 @@ This app was tested with the following models:
 - Llama and models based on it (Alpaca, Vicuna, Koala, etc.) using the ggml format
 - Llama and models based on it (Alpaca, Vicuna, Koala, etc.) using the GPTQ format (4bit-128g)
 - Popular models on huggingface (MPT, GPT2, Falcon) using PT format
-- Llama 2 using the ggml format
+- Llama 2 using ggml and gptq formats
 
 # Credits
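For the amd64-only GPU image noted above, a rough `docker run` equivalent of the compose setup — the port and mount paths here are illustrative assumptions, not read from `docker-compose.gpu.yaml`:

```
# Hypothetical direct-run sketch; adjust the port and mounts to match
# docker-compose.gpu.yaml in the repo.
docker run --gpus all \
  -v "$PWD/config.yaml:/llm-api/config.yaml" \
  -v "$PWD/models:/models" \
  -p 8000:8000 \
  1b5d/llm-api:latest-gpu
```
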
25 changes: 7 additions & 18 deletions config.yaml
@@ -1,21 +1,10 @@
 models_dir: /models
-model_family: llama
+model_family: gptq_llama
 setup_params:
-  repo_id: TheBloke/Llama-2-7B-Chat-GGML
-  filename: llama-2-7b-chat.ggmlv3.q4_0.bin
+  repo_id: <repo id>
+  filename: gptq_model-4bit-128g.safetensors
 model_params:
-  n_ctx: 512
-  n_parts: -1
-  n_gpu_layers: 0
-  seed: -1
-  use_mmap: True
-  n_threads: 8
-  n_batch: 2048
-  last_n_tokens_size: 64
-  lora_base: null
-  lora_path: null
-  low_vram: False
-  tensor_split: null
-  rope_freq_base: 10000.0
-  rope_freq_scale: 1.0
-  verbose: True
+  group_size: 128
+  wbits: 4
+  cuda_visible_devices: "0"
+  device: "cuda:0"
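The new default config points at a GPTQ Llama checkpoint and deliberately leaves `repo_id` as a placeholder. A filled-in sketch — the repo id below is a hypothetical stand-in, not taken from this commit:

```
# Illustrative only: the repo id is a made-up placeholder for any
# 4bit-128g GPTQ Llama checkpoint hosted on Hugging Face.
cat > config.yaml <<'EOF'
models_dir: /models
model_family: gptq_llama
setup_params:
  repo_id: someuser/llama-7b-gptq        # hypothetical
  filename: gptq_model-4bit-128g.safetensors
model_params:
  group_size: 128
  wbits: 4
  cuda_visible_devices: "0"
  device: "cuda:0"
EOF
```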
