diff --git a/__main__.py b/__main__.py index 2fbd575..6a6b407 100644 --- a/__main__.py +++ b/__main__.py @@ -42,13 +42,14 @@ def main(): description = "🦙 Llama.cpp python server. Host your own LLMs!🚀" parser = argparse.ArgumentParser(description=description) - + current_file_path = __file__ + current_directory = os.path.dirname(current_file_path) add_args_from_model(parser, Settings) parser.add_argument( "--config_file", type=str, help="Path to a config file to load.", - default="/home/test/api_server.cfg", + default= current_directory + "/server.cfg", ) server_settings: ServerSettings | None = None model_settings: list[ModelSettings] = [] diff --git a/model.py b/model.py index 191b615..028ae96 100644 --- a/model.py +++ b/model.py @@ -129,24 +129,12 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: kwargs = {} - if settings.hf_model_repo_id is not None: - create_fn = functools.partial( - llama_cpp.Llama.from_pretrained, - repo_id=settings.hf_model_repo_id, - filename=settings.model, - ) - elif settings.chat_format == "chatglm": - create_fn = chatglm_cpp.Pipeline - kwargs["model_path"] = settings.model - else: - create_fn = llama_cpp.Llama - kwargs["model_path"] = settings.model - if settings.chat_format == "chatglm3": + if settings.chat_format == "chatglm3" or settings.chat_format == "chatglm": _model = chatglm_cpp.Pipeline(settings.model) _model.create_chat_completion = chatglm.create_chat_completion - if settings.chat_format == "bge-onnx": + elif settings.chat_format == "bge-onnx": _model =extends.BgeOnnxModel(settings.model,settings.model_alias) elif settings.chat_format == "firefunction" : @@ -189,6 +177,16 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: n_threads_batch=settings.n_threads_batch, ) else: + if settings.hf_model_repo_id is not None: + create_fn = functools.partial( + llama_cpp.Llama.from_pretrained, + repo_id=settings.hf_model_repo_id, + filename=settings.model, + ) + else: + create_fn = llama_cpp.Llama + kwargs["model_path"] = settings.model + _model = create_fn( **kwargs, # Model Params diff --git a/server.cfg b/server.cfg index 5cebfa5..db399ef 100644 --- a/server.cfg +++ b/server.cfg @@ -2,12 +2,46 @@ "host": "0.0.0.0", "port": 8000, "models": [ + { + "model": "/home/test/llm-models/chatglm3-ggml.bin", + "model_alias": "chatglm3", + "chat_format": "chatglm3", + "n_gpu_layers": 0, + "offload_kqv": true, + "embedding": false, + "n_threads": 12, + "n_batch": 512 + }, + { + "model": "/home/test/llm-models/bge-large-zh-v1.5-q4_k_m.gguf", + "model_alias": "bge-large-zh-v1.5", + "chat_format": "bert", + "n_gpu_layers": 0, + "offload_kqv": true, + "n_threads": 12, + "n_ctx": 8192, + "embedding": true, + "n_batch": 512, + "verbose": false + }, + { + "model": "/home/test/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/babcf60cae0a1f438d7ade582983d4ba462303c2/onnx/", + "model_alias": "bge-m3", + "chat_format": "bge-onnx", + "embedding": true, + "n_gpu_layers": 0, + "n_ctx": 8192, + "offload_kqv": true, + "n_threads": 12, + "n_batch": 512 + }, { "model": "/home/test/llm-models/chatglm3-ggml-q8.bin", - "model_alias": "chatglm3", + "model_alias": "chatglm3-q8", "chat_format": "chatglm3", "n_gpu_layers": 0, "offload_kqv": true, + "embedding": false, "n_threads": 12, "n_batch": 512 }, @@ -18,6 +52,7 @@ "chat_format": "openfunctions", "n_gpu_layers": 0, "n_ctx":4096, + "embedding": false, "offload_kqv": true, "n_threads": 12, "n_batch": 512 @@ -31,20 +66,13 @@ "offload_kqv": true, "n_threads": 12, "n_batch": 512, + "embedding": false, "n_ctx": 8192, "use_mmap":true }, - { - "model": "/home/test/llm-models/bge-large-zh-v1.5-q4_k_m.gguf", - "model_alias": "bge-large-zh-v1.5", - "chat_format": "bert", - "n_gpu_layers": 0, - "offload_kqv": true, - "n_threads": 12, - "n_batch": 512 - }, { "model": "/home/test/llm-models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf", + "hf_pretrained_model_name_or_path":"meta-llama/Meta-Llama-3-8B-Instruct", "model_alias": "llama-3-8b", "chat_format": "llama-3", "n_gpu_layers": 0, @@ -60,6 +88,7 @@ "chat_format": "gemma", "n_gpu_layers": 0, "offload_kqv": true, + "embedding": false, "n_threads": 12, "n_ctx": 8192, "n_batch": 512 @@ -71,6 +100,7 @@ "clip_model_path": "/home/test/llm-models/mmproj-model-f16.gguf", "n_gpu_layers": 0, "offload_kqv": true, + "embedding": false, "n_threads": 12, "n_ctx": 4096, "n_batch": 512