This repository has been archived by the owner on Mar 30, 2024. It is now read-only.
Add support for OpenChat 3.5/Zephyr 7B β, improve fallbacks of repetition_penalty, support multiple messages in request body
#189
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smoke-test workflow: runs on every pull request.
name: Smoke Test
on: pull_request
# Workflow-level environment shared by every job below.
# Port numbers are quoted: environment variables are always strings.
env:
  REGISTRY: quay.io
  REPO_ORG_NAME: ialacol
  IMAGE_NAME: ialacol-smoke-test
  GPTQ_IMAGE_TAG: gptq
  HELM_NAMESPACE: default
  LOGGING_LEVEL: DEBUG
  # for testing llama base models
  LLAMA_HELM_RELEASE_NAME: orca-mini-3b
  LLAMA_MODEL_HG_REPO_ID: TheBloke/orca_mini_3B-GGML
  LLAMA_MODEL_FILE: orca-mini-3b.ggmlv3.q4_0.bin
  LLAMA_SVC_PORT: "8000"
  # for testing gpt-neox base models
  GPT_NEOX_HELM_RELEASE_NAME: stablecode-instruct-alpha-3b
  GPT_NEOX_MODEL_HG_REPO_ID: TheBloke/stablecode-instruct-alpha-3b-GGML
  GPT_NEOX_MODEL_FILE: stablecode-instruct-alpha-3b.ggmlv1.q4_0.bin
  GPT_NEOX_SVC_PORT: "8001"
  # for testing starcoder base models
  STARCODER_HELM_RELEASE_NAME: tiny-starcoder-py
  STARCODER_MODEL_HG_REPO_ID: mike-ravkine/tiny_starcoder_py-GGML
  STARCODER_MODEL_FILE: tiny_starcoder_py-q8_0.bin
  STARCODER_SVC_PORT: "8002"
  # for testing gptq models
  GPTQ_HELM_RELEASE_NAME: stablecode-instruct-alpha-3b-gptq
  GPTQ_MODEL_HG_REPO_ID: TheBloke/stablecode-instruct-alpha-3b-GPTQ
  GPTQ_MODEL_HG_REVISION: gptq-4bit-32g-actorder_True
  GPTQ_MODEL_FILE: model.safetensors
  GPTQ_SVC_PORT: "8003"
# Two image-build jobs feed four smoke-test jobs. Each smoke test creates a
# Kind cluster, installs the ./charts/ialacol Helm chart with a generated
# values.yaml, port-forwards the service and calls its OpenAI-style API.
jobs:
  # Builds ./Dockerfile and pushes the image tagged with the commit SHA.
  build-image:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Login to Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ secrets.QUAY_ROBOT_USERNAME }}
          password: ${{ secrets.QUAY_ROBOT_PASSWORD }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ./Dockerfile
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}

  # Builds ./Dockerfile.cuda12 and pushes it under the fixed GPTQ_IMAGE_TAG.
  # NOTE(review): the tag is not unique per commit, so concurrent runs
  # overwrite each other's image - confirm this is acceptable.
  build-gptq-cuda12-image:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Login to Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ secrets.QUAY_ROBOT_USERNAME }}
          password: ${{ secrets.QUAY_ROBOT_PASSWORD }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ./Dockerfile.cuda12
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}

  # Deploys the LLaMa-based model and exercises the models, chat completion
  # and completion endpoints through the OpenAI CLI.
  llama-smoke-test:
    runs-on: ubuntu-latest
    needs: build-image
    steps:
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.7.0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.12.0
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install ialacol with LLaMa based model and wait for pods to be ready
        run: |
          cat > values.yaml <<EOF
          replicas: 1
          deployment:
            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
            env:
              DEFAULT_MODEL_HG_REPO_ID: $LLAMA_MODEL_HG_REPO_ID
              DEFAULT_MODEL_FILE: $LLAMA_MODEL_FILE
              LOGGING_LEVEL: $LOGGING_LEVEL
              TOP_K: 0
          resources:
            {}
          model:
            persistence:
              size: 2Gi
              accessModes:
                - ReadWriteOnce
          service:
            type: ClusterIP
            port: $LLAMA_SVC_PORT
            annotations: {}
          nodeSelector: {}
          tolerations: []
          affinity: {}
          EOF
          helm install $LLAMA_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
          echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
          sleep 40
      # Diagnostics run even when an earlier step failed.
      - name: Show pod status
        if: always()
        run: |
          kubectl get pods -n $HELM_NAMESPACE
      - name: Show pod logs after install
        if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$LLAMA_HELM_RELEASE_NAME -n $HELM_NAMESPACE
      - name: Port forward to the LLaMa model service
        run: |
          kubectl port-forward svc/$LLAMA_HELM_RELEASE_NAME $LLAMA_SVC_PORT:$LLAMA_SVC_PORT &
          echo "Wait for port-forward to be ready"
          sleep 5
      - name: Check the GET /v1/models endpoint
        run: |
          # --fail turns an HTTP error status into a non-zero exit code.
          curl --fail http://localhost:$LLAMA_SVC_PORT/v1/models
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: "3.11"
      - name: Install OpenAI CLI
        run: |
          # The subcommands used below (chat_completions.create) are the
          # pre-1.0 CLI syntax, so stay on the 0.x release line.
          pip install --upgrade "openai<1" --quiet
      - name: Test the OpenAI CLI with default parameters
        run: |
          openai -k "sk-fake" -b http://localhost:$LLAMA_SVC_PORT/v1 api models.list
          openai -k "sk-fake" -b http://localhost:$LLAMA_SVC_PORT/v1 -vvvvv api chat_completions.create -m $LLAMA_MODEL_FILE -g user "Hello world!"
          openai -k "sk-fake" -b http://localhost:$LLAMA_SVC_PORT/v1 -vvvvv api completions.create -m $LLAMA_MODEL_FILE -p "Who are"
      - name: Ask the AI for a joke
        run: |
          REPLY=$(openai -k "sk-fake" -b http://localhost:$LLAMA_SVC_PORT/v1 api chat_completions.create -m $LLAMA_MODEL_FILE -g user "Tell me a joke." --max-tokens 4096 --temperature 2 --top_p 1.0)
          # Collapse the reply onto one line. Pathname expansion is disabled
          # so glob characters in the model output are not expanded.
          set -f
          REPLY=$(echo $REPLY | tr -d '\n')
          set +f
          echo "$REPLY"
          if [ -z "$REPLY" ]; then
            echo "No reply from AI"
            exit 1
          fi
          echo "REPLY=$REPLY" >> "$GITHUB_ENV"
      - name: Show pod logs after tests
        if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$LLAMA_HELM_RELEASE_NAME -n $HELM_NAMESPACE

  # Deploys the gpt-neox-based model and requests one completion.
  gpt-neox-smoke-test:
    runs-on: ubuntu-latest
    needs: build-image
    steps:
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.7.0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.12.0
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: "3.11"
      - name: Install OpenAI CLI
        run: |
          # The CLI invocation below uses the pre-1.0 command set.
          pip install --upgrade "openai<1" --quiet
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install ialacol with gpt-neox based model and wait for pods to be ready
        run: |
          cat > values.yaml <<EOF
          replicas: 1
          deployment:
            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
            env:
              DEFAULT_MODEL_HG_REPO_ID: $GPT_NEOX_MODEL_HG_REPO_ID
              DEFAULT_MODEL_FILE: $GPT_NEOX_MODEL_FILE
              LOGGING_LEVEL: $LOGGING_LEVEL
              TOP_K: 40
              REPETITION_PENALTY: 1.176
          resources:
            {}
          model:
            persistence:
              size: 0.5Gi
              accessModes:
                - ReadWriteOnce
          service:
            type: ClusterIP
            port: $GPT_NEOX_SVC_PORT
            annotations: {}
          nodeSelector: {}
          tolerations: []
          affinity: {}
          EOF
          helm install $GPT_NEOX_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
          echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
          sleep 40
      # Diagnostics run even when an earlier step failed.
      - name: Show pod status
        if: always()
        run: |
          kubectl get pods -n $HELM_NAMESPACE
      - name: Show pod logs after install
        if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPT_NEOX_HELM_RELEASE_NAME -n $HELM_NAMESPACE
      - name: Port forward to the gpt-neox model service
        run: |
          kubectl port-forward svc/$GPT_NEOX_HELM_RELEASE_NAME $GPT_NEOX_SVC_PORT:$GPT_NEOX_SVC_PORT &
          echo "Wait for port-forward to be ready"
          sleep 5
      - name: Check model response
        run: |
          openai -k "sk-fake" -b http://localhost:$GPT_NEOX_SVC_PORT/v1 -vvvvv api completions.create -m $GPT_NEOX_MODEL_FILE -p "A function adding 1 to 1 in Python."
      - name: Show pod logs after tests
        if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPT_NEOX_HELM_RELEASE_NAME -n $HELM_NAMESPACE

  # Deploys the starcoder-based model and requests one completion.
  starcoder-smoke-test:
    runs-on: ubuntu-latest
    needs: build-image
    steps:
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.7.0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.12.0
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: "3.11"
      - name: Install OpenAI CLI
        run: |
          # The CLI invocation below uses the pre-1.0 command set.
          pip install --upgrade "openai<1" --quiet
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install ialacol with starcoder based model and wait for pods to be ready
        run: |
          cat > values.yaml <<EOF
          replicas: 1
          deployment:
            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
            env:
              DEFAULT_MODEL_HG_REPO_ID: $STARCODER_MODEL_HG_REPO_ID
              DEFAULT_MODEL_FILE: $STARCODER_MODEL_FILE
              LOGGING_LEVEL: $LOGGING_LEVEL
              TOP_K: 40
              REPETITION_PENALTY: 1.176
          resources:
            {}
          model:
            persistence:
              size: 2Gi
              accessModes:
                - ReadWriteOnce
          service:
            type: ClusterIP
            port: $STARCODER_SVC_PORT
            annotations: {}
          nodeSelector: {}
          tolerations: []
          affinity: {}
          EOF
          helm install $STARCODER_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
          echo "Wait for the pod to be ready"
          sleep 20
      # Diagnostics run even when an earlier step failed.
      - name: Show pod status
        if: always()
        run: |
          kubectl get pods -n $HELM_NAMESPACE
      - name: Show pod logs after install
        if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$STARCODER_HELM_RELEASE_NAME -n $HELM_NAMESPACE
      - name: Port forward to the starcoder model service
        run: |
          kubectl port-forward svc/$STARCODER_HELM_RELEASE_NAME $STARCODER_SVC_PORT:$STARCODER_SVC_PORT &
          echo "Wait for port-forward to be ready"
          sleep 5
      - name: Check model response
        run: |
          openai -k "sk-fake" -b http://localhost:$STARCODER_SVC_PORT/v1 -vvvvv api completions.create -m $STARCODER_MODEL_FILE -p "def fibonnaci"
      - name: Show pod logs after tests
        if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$STARCODER_HELM_RELEASE_NAME -n $HELM_NAMESPACE

  # Deploys the GPTQ image with a model pulled from a specific revision.
  # Only the models endpoint is queried in this job.
  gptq-smoke-test:
    runs-on: ubuntu-latest
    needs: build-gptq-cuda12-image
    steps:
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.7.0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.12.0
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install ialacol with GPTQ model from a revision and wait for pods to be ready
        run: |
          cat > values.yaml <<EOF
          replicas: 1
          deployment:
            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}
            env:
              DEFAULT_MODEL_HG_REPO_ID: $GPTQ_MODEL_HG_REPO_ID
              DEFAULT_MODEL_HG_REPO_REVISION: $GPTQ_MODEL_HG_REVISION
              DEFAULT_MODEL_FILE: $GPTQ_MODEL_FILE
              MODEL_TYPE: "gptq"
              LOGGING_LEVEL: $LOGGING_LEVEL
          resources:
            {}
          model:
            persistence:
              size: 3Gi
              accessModes:
                - ReadWriteOnce
          service:
            type: ClusterIP
            port: $GPTQ_SVC_PORT
            annotations: {}
          nodeSelector: {}
          tolerations: []
          affinity: {}
          EOF
          helm install $GPTQ_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
          echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
          sleep 40
      # Diagnostics run even when an earlier step failed.
      - name: Show pod status
        if: always()
        run: |
          kubectl get pods -n $HELM_NAMESPACE
      - name: Show pod logs after install
        if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE
      - name: Port forward to the GPTQ model service
        run: |
          kubectl port-forward svc/$GPTQ_HELM_RELEASE_NAME $GPTQ_SVC_PORT:$GPTQ_SVC_PORT &
          echo "Wait for port-forward to be ready"
          sleep 5
      - name: Check the GET /v1/models endpoint
        run: |
          # --fail turns an HTTP error status into a non-zero exit code.
          curl --fail http://localhost:$GPTQ_SVC_PORT/v1/models
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: "3.11"
      - name: Install OpenAI CLI
        run: |
          # Pinned to the same 0.x release line as the other jobs.
          pip install --upgrade "openai<1" --quiet
      # We can only test if download works and if GET /models returns something on CPU CI workers
      - name: Test the OpenAI CLI with default parameters
        run: |
          openai -k "sk-fake" -b http://localhost:$GPTQ_SVC_PORT/v1 api models.list
      # Selects the GPTQ release, the one this job installed.
      - name: Show pod logs after tests
        if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE