John6666 committed
Commit c500d6d · verified · 1 Parent(s): b7ec932

Upload 4 files

Files changed (4)
  1. README.md +2 -9
  2. app.py +160 -0
  3. packages.txt +1 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,12 +1,5 @@
  ---
- title: Llama Cpp Python Test1
- emoji: 🏃
- colorFrom: gray
- colorTo: purple
+ title: GGUF Chatbot (llama-cpp-python)
  sdk: gradio
- sdk_version: 6.2.0
- app_file: app.py
- pinned: false
+ sdk_version: "6.2.0"
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,160 @@
+ import os
+ import threading
+ from typing import Any, Dict, Iterable, List, Union
+
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+
+ from llama_cpp import Llama
+
+ # -----------------------------
+ # Model (HF GGUF)
+ # -----------------------------
+ MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
+ MODEL_FILENAME = os.getenv("MODEL_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
+
+ SYSTEM_PROMPT = os.getenv(
+     "SYSTEM_PROMPT",
+     "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
+ )
+
+ # Keep modest on free CPU (KV cache grows with context).
+ N_CTX = int(os.getenv("N_CTX", "4096"))
+
+ # Generation defaults
+ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
+ TOP_P = float(os.getenv("TOP_P", "0.9"))
+ MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
+
+ # -----------------------------
+ # Lazy singleton model loader
+ # -----------------------------
+ _llm: Llama | None = None
+ _llm_lock = threading.Lock()
+
+
+ def _load_llm() -> Llama:
+     global _llm
+     if _llm is not None:
+         return _llm
+
+     with _llm_lock:
+         if _llm is not None:
+             return _llm
+
+         model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
+
+         # Qwen instruct GGUFs commonly use ChatML-style formatting.
+         _llm = Llama(
+             model_path=model_path,
+             n_ctx=N_CTX,
+             n_threads=os.cpu_count() or 4,
+             n_gpu_layers=0,
+             chat_format="chatml",
+             verbose=False,
+         )
+         return _llm
+
+
+ # -----------------------------
+ # Gradio message normalization
+ # -----------------------------
+ Content = Union[str, List[Any], Dict[str, Any]]
+
+
+ def _content_to_text(content: Content) -> str:
+     if isinstance(content, str):
+         return content
+     if isinstance(content, list):
+         parts: List[str] = []
+         for item in content:
+             if isinstance(item, str):
+                 parts.append(item)
+             elif isinstance(item, dict) and item.get("type") == "text":
+                 parts.append(str(item.get("text", "")))
+         return "".join(parts).strip()
+     if isinstance(content, dict):
+         for k in ("text", "content"):
+             v = content.get(k)
+             if isinstance(v, str):
+                 return v
+     return str(content)
+
+
+ def _history_to_messages(history: Any) -> List[Dict[str, str]]:
+     if not history:
+         return []
+
+     msgs: List[Dict[str, str]] = []
+
+     # Old format: list[(user, assistant), ...]
+     if isinstance(history, list) and history and isinstance(history[0], (tuple, list)) and len(history[0]) == 2:
+         for user, assistant in history:
+             if user:
+                 msgs.append({"role": "user", "content": str(user)})
+             if assistant:
+                 msgs.append({"role": "assistant", "content": str(assistant)})
+         return msgs
+
+     # Newer format: list[{"role": "...", "content": ...}, ...]
+     if isinstance(history, list) and history and isinstance(history[0], dict):
+         for m in history:
+             role = m.get("role")
+             if role not in ("user", "assistant", "system"):
+                 continue
+             text = _content_to_text(m.get("content", ""))
+             if text:
+                 msgs.append({"role": role, "content": text})
+         return msgs
+
+     return []
+
+
+ def _stream_chat(llm: Llama, messages: List[Dict[str, str]]) -> Iterable[str]:
+     # llama-cpp-python yields OpenAI-like streaming chunks.
+     stream = llm.create_chat_completion(
+         messages=messages,
+         temperature=TEMPERATURE,
+         top_p=TOP_P,
+         max_tokens=MAX_TOKENS,
+         stream=True,
+     )
+
+     partial = ""
+     for chunk in stream:
+         token = ""
+         try:
+             choice = chunk["choices"][0]
+             delta = choice.get("delta") or {}
+             token = delta.get("content") or ""
+         except Exception:
+             token = ""
+         if token:
+             partial += token
+             yield partial
+
+
+ def respond(message: str, history: Any):
+     llm = _load_llm()
+
+     msgs: List[Dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
+     prior = _history_to_messages(history)
+
+     # Simple history trim
+     if len(prior) > 20:
+         prior = prior[-20:]
+
+     msgs.extend(prior)
+     msgs.append({"role": "user", "content": message})
+
+     for partial in _stream_chat(llm, msgs):
+         yield partial
+
+
+ demo = gr.ChatInterface(
+     fn=respond,
+     title="GGUF Chatbot (llama-cpp-python)",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
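
Note: for a quick local check of the streaming path outside the Space, the respond generator can be driven directly. The sketch below is a hypothetical helper, not part of the commit; it assumes the app.py above is importable as app, that the GGUF download succeeds, and that llama-cpp-python is installed. MODEL_REPO_ID and MODEL_FILENAME are read via os.getenv, so they can be overridden before the import.

# smoke_test.py — minimal local sketch (hypothetical, not part of this commit).
import os

# Optional override: app.py reads these with os.getenv at import time.
os.environ.setdefault("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
os.environ.setdefault("MODEL_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")

from app import respond

final = ""
for partial in respond("Say hello in one short sentence.", history=[]):
    final = partial  # respond yields the growing partial answer, so keep the last value

print(final)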
packages.txt ADDED
@@ -0,0 +1 @@
+ libopenblas0-pthread
requirements.txt ADDED
@@ -0,0 +1 @@
+ llama-cpp-python @ https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
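
Note: the pinned wheel's filename indicates a build for CPython 3.10 on linux x86_64, presumably to avoid compiling llama-cpp-python from source on the free CPU tier, so the Space's Python version must match. A quick sanity check (a sketch, assuming the wheel installed) that the prebuilt package imports and matches the pinned 0.3.16:

# check_llama_cpp.py — hypothetical sanity check, not part of this commit.
import llama_cpp

# Should print 0.3.16 if the pinned wheel from requirements.txt was installed.
print(llama_cpp.__version__)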