Release 0.1.0 (#12)
* fix gh actions

* remove arm64 from gpu build

* cleanup

* enable cache for docker build on gha

* refine docker build cache ref

* refine gpu dependencies

* quiet conda installations

* free disk space on gha

* fix free space command on gha

* fix free space command on gha
1b5d authored Jul 23, 2023
1 parent 4e43983 commit 928ec4a
Showing 4 changed files with 45 additions and 36 deletions.
19 changes: 16 additions & 3 deletions .github/workflows/publish-release.yml
@@ -3,6 +3,10 @@ name: Publish Docker image
 on:
   release:
     types: [published]
+  pull_request:
+    branches:
+      - main
+      - 'release/**'
 
 jobs:
   push_to_dockerhub:
@@ -37,10 +41,18 @@ jobs:
           platforms: linux/amd64,linux/arm64
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=registry,ref=1b5d/llm-api:latest
+          cache-to: type=inline
 
   push_gpu_to_dockerhub:
     runs-on: ubuntu-latest
     steps:
+      - name: Free disk space
+        run: |
+          sudo apt-get autoremove -y >/dev/null 2>&1
+          sudo apt-get autoclean -y >/dev/null 2>&1
+          docker rmi $(docker image ls -aq) >/dev/null 2>&1
      - name: Checkout
        uses: actions/checkout@v3
 
@@ -62,15 +74,16 @@ jobs:
         with:
           images: 1b5d/llm-api
           flavor: |
-            suffix=-gpu
+            latest=false
+            suffix=-gpu,onlatest=true
      - name: Build and push
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.gpu
          push: ${{ github.event_name != 'pull_request' }}
-          platforms: linux/amd64,linux/arm64
+          platforms: linux/amd64
          tags: ${{ steps.meta-gpu.outputs.tags }}
          labels: ${{ steps.meta-gpu.outputs.labels }}
+          cache-from: type=registry,ref=1b5d/llm-api:latest-gpu
+          cache-to: type=inline
 
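The `cache-from`/`cache-to` pair added to both jobs uses buildx's registry-backed inline cache: the previously pushed image itself carries the layer-cache metadata, so a fresh GHA runner can skip unchanged layers. A minimal sketch of the same mechanism invoked by hand, assuming the same image name as above:

```
# Inline cache sketch: read layer-cache metadata from the last pushed image
# and embed fresh cache metadata into the new one (type=inline).
docker buildx build \
  --cache-from type=registry,ref=1b5d/llm-api:latest \
  --cache-to type=inline \
  --platform linux/amd64,linux/arm64 \
  --tag 1b5d/llm-api:latest \
  --push .
```
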
33 changes: 19 additions & 14 deletions Dockerfile.gpu
@@ -3,7 +3,7 @@ FROM debian:bullseye-slim as pytorch-install
 ARG PYTORCH_VERSION=2.0.0
 ARG PYTHON_VERSION=3.9
 ARG CUDA_VERSION=11.7.1
-ARG MAMBA_VERSION=23.1.0-1
+ARG MAMBA_VERSION=23.1.0-4
 ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=pytorch
 # Automatically set by buildx
@@ -25,19 +25,19 @@ RUN case ${TARGETPLATFORM} in \
     "linux/arm64") MAMBA_ARCH=aarch64 ;; \
     *) MAMBA_ARCH=x86_64 ;; \
     esac && \
-    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
-RUN chmod +x ~/mambaforge.sh && \
-    bash ~/mambaforge.sh -b -p /opt/conda && \
-    rm ~/mambaforge.sh
+    curl -fsSL -v -o ~/miniforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Miniforge3-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/miniforge.sh && \
+    bash ~/miniforge.sh -b -p /opt/conda && \
+    rm ~/miniforge.sh
 
 # Install pytorch
 # On arm64 we exit with an error code
 RUN case ${TARGETPLATFORM} in \
     "linux/arm64") exit 1 ;; \
-    *) /opt/conda/bin/conda update -y conda && \
-    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION torchvision torchaudio "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+    *) /opt/conda/bin/conda update -qy conda && \
+    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -yq "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION torchvision torchaudio "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
     esac && \
-    /opt/conda/bin/conda clean -ya
+    /opt/conda/bin/conda clean -yqa
 
 # CUDA kernels builder image
 FROM pytorch-install as kernel-builder
@@ -46,9 +46,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     ninja-build \
     && rm -rf /var/lib/apt/lists/*
 
-RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.7.1" cuda==11.7.1 && \
-    /opt/conda/bin/conda clean -ya
-
+RUN /opt/conda/bin/conda install -q -c "nvidia/label/cuda-11.7.1" cuda==11.7.1 && \
+    /opt/conda/bin/conda clean -yqa
 
 FROM debian:bullseye-slim as base
 
@@ -60,14 +59,20 @@ LABEL com.nvidia.volumes.needed="nvidia_driver"
 # Copy conda with PyTorch installed
 COPY --from=kernel-builder /opt/conda /opt/conda
 
-RUN apt-get update && apt-get install -y build-essential git zlib1g-dev
+RUN apt-get update && \
+    apt-get install -y build-essential git zlib1g-dev cmake && \
+    apt-get autoremove && \
+    apt-get clean
 
 WORKDIR /llm-api
 
 COPY ./requirements.txt /llm-api/requirements.txt
 RUN pip3 install --no-cache-dir --upgrade -r requirements.txt && \
-    pip3 install --no-cache-dir accelerate==0.20.3 && \
-    pip3 install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python
+    pip3 install --no-cache-dir accelerate==0.20.3 packaging==23.0 ninja==1.11.1 && \
+    pip3 install --no-cache-dir --no-build-isolation flash-attn==v1.0.9 && \
+    pip3 install --no-cache-dir triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python && rm -fr $HOME/.triton && \
+    pip3 cache purge && \
+    /opt/conda/bin/conda clean -ya
 
 COPY ./app /llm-api/app
 ENV PYTHONPATH "/llm-api"
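A note on the reworked pip block: `flash-attn` compiles CUDA extensions at install time against the PyTorch already present in the image, which is why `packaging` and `ninja` are installed first and `--no-build-isolation` is passed — otherwise pip would build in an isolated environment where torch is not importable. A standalone sketch of that ordering, mirroring the Dockerfile:

```
# flash-attn builds CUDA kernels during installation, so torch and its build
# deps must already be importable; --no-build-isolation makes pip reuse the
# current environment instead of an isolated build venv.
pip3 install --no-cache-dir packaging==23.0 ninja==1.11.1
pip3 install --no-cache-dir --no-build-isolation flash-attn==v1.0.9
```
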
4 changes: 3 additions & 1 deletion README.md
@@ -134,6 +134,8 @@ To be able to accelerate inference using GPU, the `1b5d/llm-api:x.x.x-gpu` image
 docker compose -f docker-compose.gpu.yaml up
 ```
 
+Note: currently only the `linux/amd64` architecture is supported for GPU images
+
 ## Llama on CPU - using llama.cpp
 
 You can configure the model usage in a local `config.yaml` file; here is an example:
@@ -266,7 +268,7 @@ This app was tested with the following models:
 - Llama and models based on it (Alpaca, Vicuna, Koala, etc.) using the ggml format
 - Llama and models based on it (Alpaca, Vicuna, Koala, etc.) using the GPTQ format (4bit-128g)
 - Popular models on huggingface (MPT, GPT2, Falcon) using PT format
-- Llama 2 using the ggml format
+- Llama 2 using ggml and gptq formats
 
 # Credits
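For the amd64-only GPU image noted above, a rough `docker run` equivalent of the compose setup — the port and mount paths here are illustrative assumptions, not read from `docker-compose.gpu.yaml`:

```
# Hypothetical direct-run sketch; adjust the port and mounts to match
# docker-compose.gpu.yaml in the repo.
docker run --gpus all \
  -v "$PWD/config.yaml:/llm-api/config.yaml" \
  -v "$PWD/models:/models" \
  -p 8000:8000 \
  1b5d/llm-api:latest-gpu
```
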
25 changes: 7 additions & 18 deletions config.yaml
@@ -1,21 +1,10 @@
 models_dir: /models
-model_family: llama
+model_family: gptq_llama
 setup_params:
-  repo_id: TheBloke/Llama-2-7B-Chat-GGML
-  filename: llama-2-7b-chat.ggmlv3.q4_0.bin
+  repo_id: <repo id>
+  filename: gptq_model-4bit-128g.safetensors
 model_params:
-  n_ctx: 512
-  n_parts: -1
-  n_gpu_layers: 0
-  seed: -1
-  use_mmap: True
-  n_threads: 8
-  n_batch: 2048
-  last_n_tokens_size: 64
-  lora_base: null
-  lora_path: null
-  low_vram: False
-  tensor_split: null
-  rope_freq_base: 10000.0
-  rope_freq_scale: 1.0
-  verbose: True
+  group_size: 128
+  wbits: 4
+  cuda_visible_devices: "0"
+  device: "cuda:0"
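The new default config points at a GPTQ Llama checkpoint and deliberately leaves `repo_id` as a placeholder. A filled-in sketch — the repo id below is a hypothetical stand-in, not taken from this commit:

```
# Illustrative only: the repo id is a made-up placeholder for any
# 4bit-128g GPTQ Llama checkpoint hosted on Hugging Face.
cat > config.yaml <<'EOF'
models_dir: /models
model_family: gptq_llama
setup_params:
  repo_id: someuser/llama-7b-gptq        # hypothetical
  filename: gptq_model-4bit-128g.safetensors
model_params:
  group_size: 128
  wbits: 4
  cuda_visible_devices: "0"
  device: "cuda:0"
EOF
```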
