Add support for OpenChat 3.5/Zephyr 7B β, improve fallbacks of repetition_penalty, support multiple messages in request body #192

Workflow file for this run
name: Smoke Test
on: pull_request
env:
  REGISTRY: quay.io
  REPO_ORG_NAME: ialacol
  IMAGE_NAME: ialacol-smoke-test
  GPTQ_IMAGE_TAG: gptq
  HELM_NAMESPACE: default
  LOGGING_LEVEL: DEBUG
  # for testing llama base models
  LLAMA_HELM_RELEASE_NAME: orca-mini-3b
  LLAMA_MODEL_HG_REPO_ID: TheBloke/orca_mini_3B-GGML
  LLAMA_MODEL_FILE: orca-mini-3b.ggmlv3.q4_0.bin
  LLAMA_SVC_PORT: 8000
  # for testing gpt-neox base models
  GPT_NEOX_HELM_RELEASE_NAME: stablecode-instruct-alpha-3b
  GPT_NEOX_MODEL_HG_REPO_ID: TheBloke/stablecode-instruct-alpha-3b-GGML
  GPT_NEOX_MODEL_FILE: stablecode-instruct-alpha-3b.ggmlv1.q4_0.bin
  GPT_NEOX_SVC_PORT: 8001
  # for testing starcoder base models
  STARCODER_HELM_RELEASE_NAME: tiny-starcoder-py
  STARCODER_MODEL_HG_REPO_ID: mike-ravkine/tiny_starcoder_py-GGML
  STARCODER_MODEL_FILE: tiny_starcoder_py-q8_0.bin
  STARCODER_SVC_PORT: 8002
  # for testing gptq models
  GPTQ_HELM_RELEASE_NAME: stablecode-instruct-alpha-3b-gptq
  GPTQ_MODEL_HG_REPO_ID: TheBloke/stablecode-instruct-alpha-3b-GPTQ
  GPTQ_MODEL_HG_REVISION: gptq-4bit-32g-actorder_True
  GPTQ_MODEL_FILE: model.safetensors
  GPTQ_SVC_PORT: 8003
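# Each *_HELM_RELEASE_NAME / *_MODEL_HG_REPO_ID / *_MODEL_FILE / *_SVC_PORT group
# above parameterizes one smoke-test job below (llama, gpt-neox, starcoder, gptq).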
jobs:
  build-image:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Login to Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ secrets.QUAY_ROBOT_USERNAME }}
          password: ${{ secrets.QUAY_ROBOT_PASSWORD }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ./Dockerfile
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
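  # The GGML smoke-test jobs below pull this image by its commit-SHA tag, so
  # each pull request exercises the image built from its own changes.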
  build-gptq-cuda12-image:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Login to Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ secrets.QUAY_ROBOT_USERNAME }}
          password: ${{ secrets.QUAY_ROBOT_PASSWORD }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ./Dockerfile.cuda12
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}
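  # Note: unlike the SHA-tagged GGML image, this CUDA 12 image is pushed under
  # the fixed "gptq" tag, so concurrent workflow runs overwrite one another.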
  llama-smoke-test:
    runs-on: ubuntu-latest
    needs: build-image
    steps:
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.7.0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.12.0
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install ialacol with LLaMa based model and wait for pods to be ready
        run: |
          cat > values.yaml <<EOF
          replicas: 1
          deployment:
            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
            env:
              DEFAULT_MODEL_HG_REPO_ID: $LLAMA_MODEL_HG_REPO_ID
              DEFAULT_MODEL_FILE: $LLAMA_MODEL_FILE
              LOGGING_LEVEL: $LOGGING_LEVEL
              TOP_K: 0
          resources:
            {}
          model:
            persistence:
              size: 2Gi
              accessModes:
                - ReadWriteOnce
          service:
            type: ClusterIP
            port: $LLAMA_SVC_PORT
            annotations: {}
          nodeSelector: {}
          tolerations: []
          affinity: {}
          EOF
          helm install $LLAMA_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
          echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
          sleep 40
      - if: always()
        run: |
          kubectl get pods -n $HELM_NAMESPACE
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$LLAMA_HELM_RELEASE_NAME -n $HELM_NAMESPACE
      - name: Port forward to the LLaMa model service
        run: |
          kubectl port-forward svc/$LLAMA_HELM_RELEASE_NAME $LLAMA_SVC_PORT:$LLAMA_SVC_PORT &
          echo "Wait for port-forward to be ready"
          sleep 5
      - name: Check the GET /v1/models endpoint
        run: |
          curl http://localhost:$LLAMA_SVC_PORT/v1/models
      - uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install OpenAI CLI
        run: |
          pip install --upgrade openai --quiet
      - name: Test the OpenAI CLI with default parameters
        run: |
          openai -k "sk-fake" -b http://localhost:$LLAMA_SVC_PORT/v1 api models.list
          openai -k "sk-fake" -b http://localhost:$LLAMA_SVC_PORT/v1 -vvvvv api chat_completions.create -m $LLAMA_MODEL_FILE -g user "Hello world!"
          openai -k "sk-fake" -b http://localhost:$LLAMA_SVC_PORT/v1 -vvvvv api completions.create -m $LLAMA_MODEL_FILE -p "Who are"
      - name: Ask the AI for a joke
        run: |
          REPLY=$(openai -k "sk-fake" -b http://localhost:$LLAMA_SVC_PORT/v1 api chat_completions.create -m $LLAMA_MODEL_FILE -g user "Tell me a joke." --max-tokens 4096 --temperature 2 --top_p 1.0)
          REPLY=$(echo $REPLY | tr -d '\n')
          echo "$REPLY"
          if [ -z "$REPLY" ]; then
            echo "No reply from AI"
            exit 1
          fi
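          # Persist the reply for later steps via GitHub Actions' per-job env file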
echo "REPLY=$REPLY" >> $GITHUB_ENV
- if: always()
run: |
kubectl logs --tail=200 --selector app.kubernetes.io/name=$LLAMA_HELM_RELEASE_NAME -n $HELM_NAMESPACE
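  # The jobs below repeat this pattern for the other back-ends: install the
  # chart, wait for the model download, port-forward the service, exercise the
  # OpenAI-compatible API, and dump pod logs for debugging.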
  gpt-neox-smoke-test:
    runs-on: ubuntu-latest
    needs: build-image
    steps:
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.7.0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.12.0
      - uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install OpenAI CLI
        run: |
          pip install --upgrade openai --quiet
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install ialacol with gpt-neox based model and wait for pods to be ready
        run: |
          cat > values.yaml <<EOF
          replicas: 1
          deployment:
            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
            env:
              DEFAULT_MODEL_HG_REPO_ID: $GPT_NEOX_MODEL_HG_REPO_ID
              DEFAULT_MODEL_FILE: $GPT_NEOX_MODEL_FILE
              LOGGING_LEVEL: $LOGGING_LEVEL
              TOP_K: 40
              REPETITION_PENALTY: 1.176
          resources:
            {}
          model:
            persistence:
              size: 0.5Gi
              accessModes:
                - ReadWriteOnce
          service:
            type: ClusterIP
            port: $GPT_NEOX_SVC_PORT
            annotations: {}
          nodeSelector: {}
          tolerations: []
          affinity: {}
          EOF
          helm install $GPT_NEOX_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
          echo "Wait for the pod to be ready while the model downloads (~50MB/s)"
          sleep 40
      - if: always()
        run: |
          kubectl get pods -n $HELM_NAMESPACE
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPT_NEOX_HELM_RELEASE_NAME -n $HELM_NAMESPACE
      - name: Port forward to the gpt-neox model service
        run: |
          kubectl port-forward svc/$GPT_NEOX_HELM_RELEASE_NAME $GPT_NEOX_SVC_PORT:$GPT_NEOX_SVC_PORT &
          echo "Wait for port-forward to be ready"
          sleep 5
      - name: Check model response
        run: |
          openai -k "sk-fake" -b http://localhost:$GPT_NEOX_SVC_PORT/v1 -vvvvv api completions.create -m $GPT_NEOX_MODEL_FILE -p "A function adding 1 to 1 in Python."
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPT_NEOX_HELM_RELEASE_NAME -n $HELM_NAMESPACE
  starcoder-smoke-test:
    runs-on: ubuntu-latest
    needs: build-image
    steps:
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.7.0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.12.0
      - uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install OpenAI CLI
        run: |
          pip install --upgrade openai --quiet
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install ialacol with starcoder based model and wait for pods to be ready
        run: |
          cat > values.yaml <<EOF
          replicas: 1
          deployment:
            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
            env:
              DEFAULT_MODEL_HG_REPO_ID: $STARCODER_MODEL_HG_REPO_ID
              DEFAULT_MODEL_FILE: $STARCODER_MODEL_FILE
              LOGGING_LEVEL: $LOGGING_LEVEL
              TOP_K: 40
              REPETITION_PENALTY: 1.176
          resources:
            {}
          model:
            persistence:
              size: 2Gi
              accessModes:
                - ReadWriteOnce
          service:
            type: ClusterIP
            port: $STARCODER_SVC_PORT
            annotations: {}
          nodeSelector: {}
          tolerations: []
          affinity: {}
          EOF
          helm install $STARCODER_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
          echo "Wait for the pod to be ready"
          sleep 40
      - if: always()
        run: |
          kubectl get pods -n $HELM_NAMESPACE
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$STARCODER_HELM_RELEASE_NAME -n $HELM_NAMESPACE
      - name: Port forward to the starcoder model service
        run: |
          kubectl port-forward svc/$STARCODER_HELM_RELEASE_NAME $STARCODER_SVC_PORT:$STARCODER_SVC_PORT &
          echo "Wait for port-forward to be ready"
          sleep 5
      - name: Check model response
        run: |
          openai -k "sk-fake" -b http://localhost:$STARCODER_SVC_PORT/v1 -vvvvv api completions.create -m $STARCODER_MODEL_FILE -p "def fibonacci"
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$STARCODER_HELM_RELEASE_NAME -n $HELM_NAMESPACE
  gptq-smoke-test:
    runs-on: ubuntu-latest
    needs: build-gptq-cuda12-image
    steps:
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.7.0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.12.0
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install ialacol with GPTQ model from a revision and wait for pods to be ready
        run: |
          cat > values.yaml <<EOF
          replicas: 1
          deployment:
            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}
            env:
              DEFAULT_MODEL_HG_REPO_ID: $GPTQ_MODEL_HG_REPO_ID
              DEFAULT_MODEL_HG_REPO_REVISION: $GPTQ_MODEL_HG_REVISION
              DEFAULT_MODEL_FILE: $GPTQ_MODEL_FILE
              MODEL_TYPE: "gptq"
              LOGGING_LEVEL: $LOGGING_LEVEL
          resources:
            {}
          model:
            persistence:
              size: 3Gi
              accessModes:
                - ReadWriteOnce
          service:
            type: ClusterIP
            port: $GPTQ_SVC_PORT
            annotations: {}
          nodeSelector: {}
          tolerations: []
          affinity: {}
          EOF
          helm install $GPTQ_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
          echo "Wait for the pod to be ready; the GPTQ safetensors model is larger, so give it longer to download"
          sleep 120
      - if: always()
        run: |
          kubectl get pods -n $HELM_NAMESPACE
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE
      - name: Port forward to the GPTQ model service
        run: |
          kubectl port-forward svc/$GPTQ_HELM_RELEASE_NAME $GPTQ_SVC_PORT:$GPTQ_SVC_PORT &
          echo "Wait for port-forward to be ready"
          sleep 5
      - name: Check the GET /v1/models endpoint
        run: |
          curl http://localhost:$GPTQ_SVC_PORT/v1/models
      - uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install OpenAI CLI
        run: |
          pip install --upgrade openai --quiet
      # We can only test if download works and if GET /models returns something on CPU CI workers
      - name: Test the OpenAI CLI with default parameters
        run: |
          openai -k "sk-fake" -b http://localhost:$GPTQ_SVC_PORT/v1 api models.list
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE
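
To debug a failing run, the same sequence can be replayed against a local kind cluster. Below is a minimal sketch of the llama-smoke-test flow, assuming kind, helm, kubectl, and the v0.x openai CLI are installed; the <commit-sha> placeholder must be replaced with the SHA of the run under inspection, and kubectl wait stands in for the workflow's fixed sleep:

# Replay the llama-smoke-test job locally (a sketch, not part of the workflow).
set -euo pipefail
IMAGE="quay.io/ialacol/ialacol-smoke-test:<commit-sha>"

kind create cluster   # local equivalent of helm/kind-action@v1.7.0
cat > values.yaml <<EOF
replicas: 1
deployment:
  image: $IMAGE
  env:
    DEFAULT_MODEL_HG_REPO_ID: TheBloke/orca_mini_3B-GGML
    DEFAULT_MODEL_FILE: orca-mini-3b.ggmlv3.q4_0.bin
    LOGGING_LEVEL: DEBUG
    TOP_K: 0
EOF
helm install orca-mini-3b -f values.yaml --namespace default ./charts/ialacol

# Wait on pod readiness instead of a fixed sleep
kubectl wait pod --selector app.kubernetes.io/name=orca-mini-3b \
  --for=condition=Ready --timeout=600s --namespace default

kubectl port-forward svc/orca-mini-3b 8000:8000 --namespace default &
sleep 5
curl http://localhost:8000/v1/models
openai -k "sk-fake" -b http://localhost:8000/v1 api models.list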