Yapp99 committed
Commit f8813f2
1 Parent(s): f30c950

init commit

Files changed (7):
  1. .dockerignore +1 -0
  2. .gitignore +1 -0
  3. Dockerfile +20 -0
  4. api.py +37 -0
  5. llm_backend.py +74 -0
  6. requirements.txt +4 -0
  7. schema.py +52 -0
.dockerignore ADDED
@@ -0,0 +1 @@
+ test**.py
.gitignore ADDED
@@ -0,0 +1 @@
+ test**.py
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ FROM python:3.11
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN useradd -m -u 1000 user
+
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+
+ CMD ["fastapi", "run", "api.py", "--host", "0.0.0.0", "--port", "7860"]
api.py ADDED
@@ -0,0 +1,37 @@
+ from fastapi.responses import StreamingResponse
+ from fastapi import FastAPI, HTTPException
+
+ from llm_backend import chat_with_model
+ from schema import ChatRequest
+
+ """
+ uvicorn api:app --reload
+ fastapi dev api.py --port 5723
+ """
+
+ app = FastAPI()
+
+
+ @app.post("/chat_stream")
+ def chat_stream(request: ChatRequest):
+     kwargs = {
+         "max_tokens": request.max_tokens,
+         "temperature": request.temperature,
+         "top_p": request.top_p,
+         "min_p": request.min_p,
+         "typical_p": request.typical_p,
+         "frequency_penalty": request.frequency_penalty,
+         "presence_penalty": request.presence_penalty,
+         "repeat_penalty": request.repeat_penalty,
+         "top_k": request.top_k,
+         "seed": request.seed,
+         "tfs_z": request.tfs_z,
+         "mirostat_mode": request.mirostat_mode,
+         "mirostat_tau": request.mirostat_tau,
+         "mirostat_eta": request.mirostat_eta,
+     }
+     try:
+         token_generator = chat_with_model(request.chat_history, request.model, kwargs)
+         return StreamingResponse(token_generator, media_type="text/plain")
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
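
A minimal client sketch for the /chat_stream endpoint (not in the diff), assuming the service is reachable at http://localhost:7860, the port the Dockerfile maps, and using the Requests package already listed in requirements.txt. The payload mirrors the ChatRequest schema; every field other than chat_history is optional:

import requests

BASE_URL = "http://localhost:7860"  # hypothetical local deployment

payload = {
    "chat_history": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain streaming responses in one sentence."},
    ],
    "model": "llama3.2",
    "max_tokens": 256,
    "temperature": 0.7,
}

# stream=True keeps the HTTP connection open so tokens are printed as they arrive
with requests.post(f"{BASE_URL}/chat_stream", json=payload, stream=True) as resp:
    resp.raise_for_status()
    resp.encoding = "utf-8"  # the endpoint streams plain UTF-8 text
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)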
llm_backend.py ADDED
@@ -0,0 +1,74 @@
+ # %%
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+
+ from schema import Message, MODEL_ARGS
+
+
+ def get_llm(model_name):
+     llm = Llama(
+         model_path=hf_hub_download(**MODEL_ARGS[model_name]),
+         n_ctx=8192,
+         n_threads=4,
+         n_gpu_layers=0,
+         verbose=False,
+     )
+
+     return llm
+
+
+ def format_chat(chat_history: list[Message]):
+     """
+     Formats the chat history into a single prompt string
+     suitable for the model.
+     """
+     messages = []
+     for msg in chat_history:
+         messages.append(f"{msg.role.title()}: {msg.content}")
+
+     return "\n".join(messages) + "\nAssistant:"
+
+
+ def chat_with_model(chat_history, model, kwargs: dict):
+     prompt = format_chat(chat_history)
+
+     default_kwargs = dict(
+         max_tokens=2048,
+         top_k=1,
+     )
+
+     forced_kwargs = dict(
+         stop=["\nUser:", "\nAssistant:", "</s>"],
+         echo=False,
+         stream=True,
+     )
+
+     llm = get_llm(model)
+
+     input_kwargs = {**default_kwargs, **kwargs, **forced_kwargs}
+     response = llm(prompt, **input_kwargs)
+
+     for token in response:
+         yield token["choices"][0]["text"]
+
+
+ # %% example input
+ # kwargs = dict(
+ #     temperature=1,
+ #     max_tokens=2048,
+ #     top_p=1,
+ #     frequency_penalty=0,
+ #     presence_penalty=0,
+ # )
+
+ # chat_history = [
+ #     Message(
+ #         role="system",
+ #         content="You are a helpful and knowledgeable assistant, but are willing to bend the facts to play along with unrealistic requests",
+ #     ),
+ #     Message(role="user", content="What does Java the programming language taste like?"),
+ # ]
+
+
+ # for chunk in chat_with_model(chat_history, "llama3.2", kwargs):
+ #     print(chunk, end="")
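
A quick sketch (not in the diff) of what format_chat produces and of the kwargs precedence in chat_with_model: request values override the defaults, while stop/echo/stream are always forced. The history and values below are made up for illustration, and importing schema also triggers the model pre-download loop defined there:

from schema import Message
from llm_backend import format_chat

history = [
    Message(role="system", content="You are a helpful assistant."),
    Message(role="user", content="Hi there"),
]
print(format_chat(history))
# System: You are a helpful assistant.
# User: Hi there
# Assistant:

# Dict-merge precedence: later dicts win, so request kwargs override the
# defaults, and the forced kwargs override everything.
default_kwargs = dict(max_tokens=2048, top_k=1)
request_kwargs = dict(max_tokens=512, temperature=0.2)
forced_kwargs = dict(stream=True)
print({**default_kwargs, **request_kwargs, **forced_kwargs})
# {'max_tokens': 512, 'top_k': 1, 'temperature': 0.2, 'stream': True}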
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ fastapi[standard]
+ huggingface_hub
+ llama-cpp-python
+ Requests
schema.py ADDED
@@ -0,0 +1,52 @@
+ from pydantic import BaseModel
+ from huggingface_hub import hf_hub_download
+
+ from typing import (
+     List,
+     Optional,
+     Literal,
+ )
+
+ MODEL_ARGS = {
+     "llama3.2": dict(
+         repo_id="hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
+         filename="llama-3.2-3b-instruct-q8_0.gguf",
+     ),
+     "falcon-mamba": dict(
+         repo_id="bartowski/falcon-mamba-7b-GGUF",
+         filename="falcon-mamba-7b-Q4_K_M.gguf",
+     ),
+     "mistral-nemo": dict(
+         repo_id="lmstudio-community/Mistral-Nemo-Instruct-2407-GGUF",
+         filename="Mistral-Nemo-Instruct-2407-Q4_K_M.gguf",
+     ),
+ }
+
+ for model_arg in MODEL_ARGS.values():
+     hf_hub_download(**model_arg)
+
+
+ class Message(BaseModel):
+     role: str
+     content: str
+
+
+ class ChatRequest(BaseModel):
+     chat_history: List[Message]
+     model: Literal["llama3.2", "falcon-mamba", "mistral-nemo"] = "llama3.2"
+     max_tokens: Optional[int] = 65536
+     temperature: float = 0.8
+     top_p: float = 0.95
+     min_p: float = 0.05
+     typical_p: float = 1.0
+     frequency_penalty: float = 0.0
+     presence_penalty: float = 0.0
+     repeat_penalty: float = 1.0
+     top_k: int = 40
+     seed: Optional[int] = None
+     tfs_z: float = 1.0
+     mirostat_mode: int = 0
+     mirostat_tau: float = 5.0
+     mirostat_eta: float = 0.1
+     # logprobs: Optional[int] = None
+     # logit_bias: Optional[Dict[str, float]] = None
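
A short sketch (not in the diff) of building a valid request body from the ChatRequest model above, assuming pydantic v2 (on v1, req.json() replaces model_dump_json). Note that importing schema runs the hf_hub_download loop, which pre-fetches all three GGUF files:

from schema import ChatRequest, Message  # import side effect: downloads the GGUF models

req = ChatRequest(
    chat_history=[
        Message(role="user", content="What does Java the programming language taste like?"),
    ],
    model="mistral-nemo",  # must be one of the Literal values above
    temperature=0.9,
)

# JSON payload accepted by POST /chat_stream (pydantic v2 API)
print(req.model_dump_json(indent=2))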