diff --git a/.github/workflows/smoke_test.yaml b/.github/workflows/smoke_test.yaml
index fbd4faf..3a4414a 100644
--- a/.github/workflows/smoke_test.yaml
+++ b/.github/workflows/smoke_test.yaml
@@ -137,7 +137,7 @@ jobs:
           echo "REPLY=$REPLY" >> $GITHUB_ENV
       - if: always()
         run: |
-          kubectl logs --tail=20 --selector app.kubernetes.io/name=$LLAMA_HELM_RELEASE_NAME -n $HELM_NAMESPACE
+          kubectl logs --tail=200 --selector app.kubernetes.io/name=$LLAMA_HELM_RELEASE_NAME -n $HELM_NAMESPACE
   gpt-neox-smoke-test:
     runs-on: ubuntu-latest
     needs: build-image
@@ -205,7 +205,7 @@ jobs:
           openai -k "sk-fake" -b http://localhost:$GPT_NEOX_SVC_PORT/v1 -vvvvv api completions.create -m $GPT_NEOX_MODEL_FILE -p "A function adding 1 to 1 in Python."
       - if: always()
         run: |
-          kubectl logs --tail=20 --selector app.kubernetes.io/name=$GPT_NEOX_HELM_RELEASE_NAME -n $HELM_NAMESPACE
+          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPT_NEOX_HELM_RELEASE_NAME -n $HELM_NAMESPACE
   starcoder-smoke-test:
     runs-on: ubuntu-latest
     needs: build-image
@@ -273,4 +273,4 @@ jobs:
           openai -k "sk-fake" -b http://localhost:$STARCODER_SVC_PORT/v1 -vvvvv api completions.create -m $STARCODER_MODEL_FILE -p "def fibonnaci"
       - if: always()
         run: |
-          kubectl logs --tail=20 --selector app.kubernetes.io/name=$STARCODER_HELM_RELEASE_NAME -n $HELM_NAMESPACE
+          kubectl logs --tail=200 --selector app.kubernetes.io/name=$STARCODER_HELM_RELEASE_NAME -n $HELM_NAMESPACE
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index 1de887f..4f65fb0 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.11.3
+appVersion: 0.11.4
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.11.3
+version: 0.11.4
diff --git a/const.py b/const.py
index de15110..13b561f 100644
--- a/const.py
+++ b/const.py
@@ -1,2 +1,3 @@
 DEFAULT_MAX_TOKENS = "512"
-DEFAULT_CONTEXT_LENGTH = "512"
\ No newline at end of file
+DEFAULT_CONTEXT_LENGTH = "4096"
+DEFAULT_LOG_LEVEL = "INFO"
\ No newline at end of file
diff --git a/log.py b/log.py
index 4f5930b..f16ad6a 100644
--- a/log.py
+++ b/log.py
@@ -1,12 +1,12 @@
 import logging
 
 from get_env import get_env
+from const import DEFAULT_LOG_LEVEL
 
-
-LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
+LOGGING_LEVEL = get_env("LOGGING_LEVEL", DEFAULT_LOG_LEVEL)
 
 log = logging.getLogger("uvicorn")
 try:
     log.setLevel(LOGGING_LEVEL)
 except ValueError:
-    log.setLevel("INFO")
+    log.setLevel(DEFAULT_LOG_LEVEL)
diff --git a/main.py b/main.py
index 6d74c12..4fc906a 100644
--- a/main.py
+++ b/main.py
@@ -10,7 +10,9 @@
     Union,
     Annotated,
 )
-from fastapi import FastAPI, Depends, HTTPException, Body, Request
+from fastapi import FastAPI, Depends, HTTPException, Body, Request, status
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse
 from fastapi.responses import StreamingResponse
 from ctransformers import LLM, AutoModelForCausalLM, Config
 from huggingface_hub import hf_hub_download, snapshot_download
@@ -64,6 +66,13 @@ def set_loading_model(boolean: bool):
 
 app = FastAPI()
 
+# https://github.com/tiangolo/fastapi/issues/3361
+@app.exception_handler(RequestValidationError)
+async def validation_exception_handler(request: Request, exc: RequestValidationError):
+    exc_str = f"{exc}".replace("\n", " ").replace("   ", " ")
+    log.error("%s: %s", request, exc_str)
+    content = {"status_code": 10422, "message": exc_str, "data": None}
+    return JSONResponse(content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY)
@app.on_event("startup") async def startup_event(): @@ -271,48 +280,80 @@ async def chat_completions( log.warning( "n, logit_bias, user, presence_penalty and frequency_penalty are not supporte." ) - default_assistant_start = "### Assistant: " - default_assistant_end = "" - default_user_start = "### Human: " - default_user_end = "" - default_system = "" - - if "llama" in body.model: - default_assistant_start = "ASSISTANT: \n" - default_user_start = "USER: " - default_user_end = "\n" - default_system = "SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n" + system_start = "" + system = "You are a helpful assistant." + system_end = "" + user_start = "" + user_end = "" + assistant_start = "" + assistant_end = "" + + # https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + # https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/discussions/3 + if "llama-2" in body.model.lower() and "chat" in body.model.lower(): + system_start = "[INST] <>\n" + system = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n" + system_end = "<>\n\n" + assistant_start = " " + assistant_end = " [INST] " + user_start = "" + user_end = " [/INST]" # For most instruct fine-tuned models using Alpaca prompt template # Although instruct fine-tuned models are not tuned for chat, they can be to generate response as if chatting, using Alpaca # prompt template likely gives better results than using the default prompt template # See https://github.com/tatsu-lab/stanford_alpaca#data-release - if "instruct" in body.model: - default_assistant_start = "### Response:" - default_user_start = "### Instruction: " - default_user_end = "\n\n" - default_system = "Below is an instruction that describes a task. Write a response that appropriately completes the request\n\n" - if "starchat" in body.model: + if "instruct" in body.model.lower(): + system_start = "" + system = "Below is an instruction that describes a task, paired with an input that provides further context. 
+        system_end = ""
+        assistant_start = "### Response:"
+        assistant_end = ""
+        user_start = "### Instruction:\n"
+        user_end = "\n\n"
+    if "starchat" in body.model.lower():
         # See https://huggingface.co/blog/starchat-alpha and https://huggingface.co/TheBloke/starchat-beta-GGML#prompt-template
-        default_assistant_start = "<|assistant|>\n"
-        default_assistant_end = " <|end|>\n"
-        default_user_start = "<|user|>\n"
-        default_user_end = " <|end|>\n"
-        default_system = "<|system|>\nBelow is a dialogue between a human and an AI assistant called StarChat.<|end|>\n"
-    if "airoboros" in body.model:
+        system_start = "<|system|>"
+        system = (
+            "Below is a dialogue between a human and an AI assistant called StarChat."
+        )
+        system_end = " <|end|>\n"
+        user_start = "<|user|>"
+        user_end = " <|end|>\n"
+        assistant_start = "<|assistant|>\n"
+        assistant_end = " <|end|>\n"
+    if "airoboros" in body.model.lower():
        # e.g. A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. USER: [prompt] ASSISTANT:
        # see https://huggingface.co/jondurbin/airoboros-mpt-30b-gpt4-1p4-five-epochs
-        default_assistant_start = "ASSISTANT: "
-        default_user_start = "USER: "
-        default_system = "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input."
+        system_start = ""
+        system = "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input."
+        system_end = ""
+        user_start = "USER: "
+        user_end = ""
+        assistant_start = "ASSISTANT: "
+        assistant_end = ""
     # If it's a mpt-chat model, we need to add the default prompt
     # from https://huggingface.co/TheBloke/mpt-30B-chat-GGML#prompt-template
     # and https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py#L17
-    if "mpt" in body.model and "chat" in body.model:
-        default_assistant_start = "<|im_start|>assistant\n"
-        default_assistant_end = "<|im_end|>\n"
-        default_user_start = "<|im_start|>user\n"
-        default_user_end = "<|im_end|>\n"
-        default_system = "<|im_start|>system\nA conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.<|im_end|>\n"
+    if "mpt" in body.model.lower() and "chat" in body.model.lower():
+        system_start = "<|im_start|>system\n"
+        system = "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers."
+        system_end = "<|im_end|>\n"
+        assistant_start = "<|im_start|>assistant\n"
+        assistant_end = "<|im_end|>\n"
+        user_start = "<|im_start|>user\n"
+        user_end = "<|im_end|>\n"
+    # orca mini https://huggingface.co/pankajmathur/orca_mini_3b
+    if "orca" in body.model.lower() and "mini" in body.model.lower():
+        system_start = "### System:\n"
+        system = "You are an AI assistant that follows instruction extremely well. Help as much as you can."
+        system_end = "\n\n"
+        assistant_start = "### Response:\n"
+        assistant_end = ""
+        # v3 e.g. https://huggingface.co/pankajmathur/orca_mini_v3_13b
+        if "v3" in body.model.lower():
+            assistant_start = "### Assistant:\n"
+            user_start = "### User:\n"
+            user_end = "\n\n"
 
     user_message = next(
         (message for message in body.messages if message.role == "user"), None
@@ -322,18 +363,24 @@ async def chat_completions(
         (message for message in body.messages if message.role == "assistant"), None
     )
     assistant_message_content = (
-        f"{default_assistant_start}{assistant_message.content}{default_assistant_end}"
+        f"{assistant_start}{assistant_message.content}{assistant_end}"
         if assistant_message
         else ""
     )
     system_message = next(
         (message for message in body.messages if message.role == "system"), None
    )
-    system_message_content = (
-        system_message.content if system_message else default_system
-    )
-
-    prompt = f"{system_message_content}{assistant_message_content} {default_user_start}{user_message_content}{default_user_end} {default_assistant_start}"
+    system_message_content = system_message.content if system_message else system
+    # avoid duplicate user start token in prompt if user message already includes it
+    if len(user_start) > 0 and user_start in user_message_content:
+        user_start = ""
+    # avoid duplicate user end token in prompt if user message already includes it
+    if len(user_end) > 0 and user_end in user_message_content:
+        user_end = ""
+    # avoid duplicate assistant start token in prompt if user message already includes it
+    if len(assistant_start) > 0 and assistant_start in user_message_content:
+        assistant_start = ""
+    prompt = f"{system_start}{system_message_content}{system_end}{assistant_message_content}{user_start}{user_message_content}{user_end}{assistant_start}"
     model_name = body.model
     llm = request.app.state.llm
     if body.stream is True:
diff --git a/request_body.py b/request_body.py
index 86c381d..d6e3859 100644
--- a/request_body.py
+++ b/request_body.py
@@ -19,7 +19,7 @@ class CompletionRequestBody(BaseModel):
     temperature: Optional[float]
     top_p: Optional[float]
     stop: Optional[List[str] | str]
-    stream: bool = Field()
+    stream: Optional[bool] = Field()
     model: str = Field()
     # llama.cpp specific parameters
     top_k: Optional[int]
@@ -68,7 +68,7 @@ class ChatCompletionRequestBody(BaseModel):
     temperature: Optional[float]
     top_p: Optional[float]
     stop: Optional[List[str] | str]
-    stream: bool = Field()
+    stream: Optional[bool] = Field()
     model: str = Field()
     # llama.cpp specific parameters
     top_k: Optional[int]
diff --git a/streamers.py b/streamers.py
index d5a05af..f4a4815 100644
--- a/streamers.py
+++ b/streamers.py
@@ -3,6 +3,8 @@
 from ctransformers import LLM, Config
 
 from log import log
+from get_env import get_env
+from const import DEFAULT_CONTEXT_LENGTH, DEFAULT_LOG_LEVEL
 
 
 def completions_streamer(
@@ -37,8 +39,11 @@ def completions_streamer(
     stop = config.stop
     log.debug("stop: %s", stop)
     log.debug("prompt: %s", prompt)
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
+    LOGGING_LEVEL = get_env("LOGGING_LEVEL", DEFAULT_LOG_LEVEL)
 
     log.debug("Streaming from ctransformer instance!")
+    total_tokens = 0
     for token in llm(
         prompt,
         stream=True,
@@ -54,6 +59,28 @@
         max_new_tokens=max_new_tokens,
         stop=stop,
     ):
+        if LOGGING_LEVEL == "DEBUG":
+            # Only track token length if we're in debug mode to avoid overhead
+            total_tokens = total_tokens + len(token)
+            # tokens are not necessarily characters, but this is a good enough approximation
+            if total_tokens > CONTEXT_LENGTH:
+                log.debug(
+                    "Total token length %s exceeded context length %s",
+                    total_tokens,
+                    CONTEXT_LENGTH,
+                )
+                log.debug(
+                    "Try to increase CONTEXT_LENGTH that is currently set to %s to your model's context length",
+                    CONTEXT_LENGTH,
+                )
+                log.debug(
+                    "Alternatively, increase REPETITION_PENALTY %s and LAST_N_TOKENS %s AND/OR adjust temperature %s top_k %s top_p %s",
+                    repetition_penalty,
+                    last_n_tokens,
+                    temperature,
+                    top_k,
+                    top_p,
+                )
         log.debug("Streaming token %s", token)
         data = json.dumps(
             {
@@ -123,8 +150,11 @@ def chat_completions_streamer(
     stop = config.stop
     log.debug("stop: %s", stop)
     log.debug("prompt: %s", prompt)
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
+    LOGGING_LEVEL = get_env("LOGGING_LEVEL", DEFAULT_LOG_LEVEL)
 
     log.debug("Streaming from ctransformer instance")
+    total_tokens = 0
     for token in llm(
         prompt,
         stream=True,
@@ -140,6 +170,28 @@
         max_new_tokens=max_new_tokens,
         stop=stop,
     ):
+        if LOGGING_LEVEL == "DEBUG":
+            # Only track token length if we're in debug mode to avoid overhead
+            total_tokens = total_tokens + len(token)
+            # tokens are not necessarily characters, but this is a good enough approximation
+            if total_tokens > CONTEXT_LENGTH:
+                log.debug(
+                    "Total token length %s exceeded context length %s",
+                    total_tokens,
+                    CONTEXT_LENGTH,
+                )
+                log.debug(
+                    "Try to increase CONTEXT_LENGTH that is currently set to %s to your model's context length",
+                    CONTEXT_LENGTH,
+                )
+                log.debug(
+                    "Alternatively, increase REPETITION_PENALTY %s and LAST_N_TOKENS %s AND/OR adjust temperature %s top_k %s top_p %s",
+                    repetition_penalty,
+                    last_n_tokens,
+                    temperature,
+                    top_k,
+                    top_p,
+                )
         log.debug("Streaming token %s", token)
         data = json.dumps(
             {
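
Note (not part of the patch): a minimal sketch of how the template variables introduced in main.py assemble the final prompt for a Llama-2 chat model. The message contents below are hypothetical; the start/end markers are the ones set by this change, and the assembly mirrors the f-string added in chat_completions above.

# Standalone sketch of the new prompt assembly (Llama-2 chat case); example messages are made up.
system_start = "[INST] <<SYS>>\n"
system_message_content = "You are a helpful, respectful and honest assistant."  # hypothetical system message
system_end = "<</SYS>>\n\n"
user_start = ""
user_end = " [/INST]"
assistant_start = " "
assistant_message_content = ""  # no prior assistant turn in this example
user_message_content = "What is the capital of France?"  # hypothetical user message

prompt = (
    f"{system_start}{system_message_content}{system_end}"
    f"{assistant_message_content}"
    f"{user_start}{user_message_content}{user_end}"
    f"{assistant_start}"
)
print(repr(prompt))
# '[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant.<</SYS>>\n\nWhat is the capital of France? [/INST] '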