Yapp99 committed
Commit f8813f2
1 Parent(s): f30c950

init commit

Files changed (7):
  1. .dockerignore +1 -0
  2. .gitignore +1 -0
  3. Dockerfile +20 -0
  4. api.py +37 -0
  5. llm_backend.py +74 -0
  6. requirements.txt +4 -0
  7. schema.py +52 -0
.dockerignore ADDED
@@ -0,0 +1 @@
+ test**.py
.gitignore ADDED
@@ -0,0 +1 @@
+ test**.py
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ FROM python:3.11
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN useradd -m -u 1000 user
+
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+
+ CMD ["fastapi", "run", "api.py", "--host", "0.0.0.0", "--port", "7860"]
api.py ADDED
@@ -0,0 +1,37 @@
+ from fastapi.responses import StreamingResponse
+ from fastapi import FastAPI, HTTPException
+
+ from llm_backend import chat_with_model
+ from schema import ChatRequest
+
+ """
+ uvicorn api:app --reload
+ fastapi dev api.py --port 5723
+ """
+
+ app = FastAPI()
+
+
+ @app.post("/chat_stream")
+ def chat_stream(request: ChatRequest):
+     kwargs = {
+         "max_tokens": request.max_tokens,
+         "temperature": request.temperature,
+         "top_p": request.top_p,
+         "min_p": request.min_p,
+         "typical_p": request.typical_p,
+         "frequency_penalty": request.frequency_penalty,
+         "presence_penalty": request.presence_penalty,
+         "repeat_penalty": request.repeat_penalty,
+         "top_k": request.top_k,
+         "seed": request.seed,
+         "tfs_z": request.tfs_z,
+         "mirostat_mode": request.mirostat_mode,
+         "mirostat_tau": request.mirostat_tau,
+         "mirostat_eta": request.mirostat_eta,
+     }
+     try:
+         token_generator = chat_with_model(request.chat_history, request.model, kwargs)
+         return StreamingResponse(token_generator, media_type="text/plain")
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
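
A minimal client sketch for the /chat_stream endpoint (not in the diff), assuming the service is reachable at http://localhost:7860, the port the Dockerfile maps, and using the Requests package already listed in requirements.txt. The payload mirrors the ChatRequest schema; every field other than chat_history is optional:

import requests

BASE_URL = "http://localhost:7860"  # hypothetical local deployment

payload = {
    "chat_history": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain streaming responses in one sentence."},
    ],
    "model": "llama3.2",
    "max_tokens": 256,
    "temperature": 0.7,
}

# stream=True keeps the HTTP connection open so tokens are printed as they arrive
with requests.post(f"{BASE_URL}/chat_stream", json=payload, stream=True) as resp:
    resp.raise_for_status()
    resp.encoding = "utf-8"  # the endpoint streams plain UTF-8 text
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)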
llm_backend.py ADDED
@@ -0,0 +1,74 @@
+ # %%
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+
+ from schema import Message, MODEL_ARGS
+
+
+ def get_llm(model_name):
+     llm = Llama(
+         model_path=hf_hub_download(**MODEL_ARGS[model_name]),
+         n_ctx=8192,
+         n_threads=4,
+         n_gpu_layers=0,
+         verbose=False,
+     )
+
+     return llm
+
+
+ def format_chat(chat_history: list[Message]):
+     """
+     Formats the chat history into a single prompt string
+     suitable for the model.
+     """
+     messages = []
+     for msg in chat_history:
+         messages.append(f"{msg.role.title()}: {msg.content}")
+
+     return "\n".join(messages) + "\nAssistant:"
+
+
+ def chat_with_model(chat_history, model, kwargs: dict):
+     prompt = format_chat(chat_history)
+
+     default_kwargs = dict(
+         max_tokens=2048,
+         top_k=1,
+     )
+
+     forced_kwargs = dict(
+         stop=["\nUser:", "\nAssistant:", "</s>"],
+         echo=False,
+         stream=True,
+     )
+
+     llm = get_llm(model)
+
+     input_kwargs = {**default_kwargs, **kwargs, **forced_kwargs}
+     response = llm(prompt, **input_kwargs)
+
+     for token in response:
+         yield token["choices"][0]["text"]
+
+
+ # %% example input
+ # kwargs = dict(
+ #     temperature=1,
+ #     max_tokens=2048,
+ #     top_p=1,
+ #     frequency_penalty=0,
+ #     presence_penalty=0,
+ # )
+
+ # chat_history = [
+ #     Message(
+ #         role="system",
+ #         content="You are a helpful and knowledgeable assistant, but are willing to bend the facts to play along with unrealistic requests",
+ #     ),
+ #     Message(role="user", content="What does Java the programming language taste like?"),
+ # ]
+
+
+ # for chunk in chat_with_model(chat_history, "llama3.2", kwargs):
+ #     print(chunk, end="")
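
A quick sketch (not in the diff) of what format_chat produces and of the kwargs precedence in chat_with_model: request values override the defaults, while stop/echo/stream are always forced. The history and values below are made up for illustration, and importing schema also triggers the model pre-download loop defined there:

from schema import Message
from llm_backend import format_chat

history = [
    Message(role="system", content="You are a helpful assistant."),
    Message(role="user", content="Hi there"),
]
print(format_chat(history))
# System: You are a helpful assistant.
# User: Hi there
# Assistant:

# Dict-merge precedence: later dicts win, so request kwargs override the
# defaults, and the forced kwargs override everything.
default_kwargs = dict(max_tokens=2048, top_k=1)
request_kwargs = dict(max_tokens=512, temperature=0.2)
forced_kwargs = dict(stream=True)
print({**default_kwargs, **request_kwargs, **forced_kwargs})
# {'max_tokens': 512, 'top_k': 1, 'temperature': 0.2, 'stream': True}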
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ fastapi[standard]
+ huggingface_hub
+ llama-cpp-python
+ Requests
schema.py ADDED
@@ -0,0 +1,52 @@
+ from pydantic import BaseModel
+ from huggingface_hub import hf_hub_download
+
+ from typing import (
+     List,
+     Optional,
+     Literal,
+ )
+
+ MODEL_ARGS = {
+     "llama3.2": dict(
+         repo_id="hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
+         filename="llama-3.2-3b-instruct-q8_0.gguf",
+     ),
+     "falcon-mamba": dict(
+         repo_id="bartowski/falcon-mamba-7b-GGUF",
+         filename="falcon-mamba-7b-Q4_K_M.gguf",
+     ),
+     "mistral-nemo": dict(
+         repo_id="lmstudio-community/Mistral-Nemo-Instruct-2407-GGUF",
+         filename="Mistral-Nemo-Instruct-2407-Q4_K_M.gguf",
+     ),
+ }
+
+ for model_arg in MODEL_ARGS.values():
+     hf_hub_download(**model_arg)
+
+
+ class Message(BaseModel):
+     role: str
+     content: str
+
+
+ class ChatRequest(BaseModel):
+     chat_history: List[Message]
+     model: Literal["llama3.2", "falcon-mamba", "mistral-nemo"] = "llama3.2"
+     max_tokens: Optional[int] = 65536
+     temperature: float = 0.8
+     top_p: float = 0.95
+     min_p: float = 0.05
+     typical_p: float = 1.0
+     frequency_penalty: float = 0.0
+     presence_penalty: float = 0.0
+     repeat_penalty: float = 1.0
+     top_k: int = 40
+     seed: Optional[int] = None
+     tfs_z: float = 1.0
+     mirostat_mode: int = 0
+     mirostat_tau: float = 5.0
+     mirostat_eta: float = 0.1
+     # logprobs: Optional[int] = None
+     # logit_bias: Optional[Dict[str, float]] = None
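
A short sketch (not in the diff) of building a valid request body from the ChatRequest model above, assuming pydantic v2 (on v1, req.json() replaces model_dump_json). Note that importing schema runs the hf_hub_download loop, which pre-fetches all three GGUF files:

from schema import ChatRequest, Message  # import side effect: downloads the GGUF models

req = ChatRequest(
    chat_history=[
        Message(role="user", content="What does Java the programming language taste like?"),
    ],
    model="mistral-nemo",  # must be one of the Literal values above
    temperature=0.9,
)

# JSON payload accepted by POST /chat_stream (pydantic v2 API)
print(req.model_dump_json(indent=2))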