John6666 committed
Commit c500d6d · verified · 1 Parent(s): b7ec932

Upload 4 files

Files changed (4)
  1. README.md +2 -9
  2. app.py +160 -0
  3. packages.txt +1 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -1,12 +1,5 @@
  ---
- title: Llama Cpp Python Test1
- emoji: 🏃
- colorFrom: gray
- colorTo: purple
+ title: GGUF Chatbot (llama-cpp-python)
  sdk: gradio
- sdk_version: 6.2.0
- app_file: app.py
- pinned: false
+ sdk_version: "6.2.0"
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,160 @@
+ import os
+ import threading
+ from typing import Any, Dict, Iterable, List, Union
+
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+
+ from llama_cpp import Llama
+
+ # -----------------------------
+ # Model (HF GGUF)
+ # -----------------------------
+ MODEL_REPO_ID = os.getenv("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
+ MODEL_FILENAME = os.getenv("MODEL_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
+
+ SYSTEM_PROMPT = os.getenv(
+     "SYSTEM_PROMPT",
+     "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
+ )
+
+ # Keep modest on free CPU (KV cache grows with context).
+ N_CTX = int(os.getenv("N_CTX", "4096"))
+
+ # Generation defaults
+ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.7"))
+ TOP_P = float(os.getenv("TOP_P", "0.9"))
+ MAX_TOKENS = int(os.getenv("MAX_TOKENS", "512"))
+
+ # -----------------------------
+ # Lazy singleton model loader
+ # -----------------------------
+ _llm: Llama | None = None
+ _llm_lock = threading.Lock()
+
+
+ def _load_llm() -> Llama:
+     global _llm
+     if _llm is not None:
+         return _llm
+
+     with _llm_lock:
+         if _llm is not None:
+             return _llm
+
+         model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
+
+         # Qwen instruct GGUFs commonly use ChatML-style formatting.
+         _llm = Llama(
+             model_path=model_path,
+             n_ctx=N_CTX,
+             n_threads=os.cpu_count() or 4,
+             n_gpu_layers=0,
+             chat_format="chatml",
+             verbose=False,
+         )
+         return _llm
+
+
+ # -----------------------------
+ # Gradio message normalization
+ # -----------------------------
+ Content = Union[str, List[Any], Dict[str, Any]]
+
+
+ def _content_to_text(content: Content) -> str:
+     if isinstance(content, str):
+         return content
+     if isinstance(content, list):
+         parts: List[str] = []
+         for item in content:
+             if isinstance(item, str):
+                 parts.append(item)
+             elif isinstance(item, dict) and item.get("type") == "text":
+                 parts.append(str(item.get("text", "")))
+         return "".join(parts).strip()
+     if isinstance(content, dict):
+         for k in ("text", "content"):
+             v = content.get(k)
+             if isinstance(v, str):
+                 return v
+     return str(content)
+
+
+ def _history_to_messages(history: Any) -> List[Dict[str, str]]:
+     if not history:
+         return []
+
+     msgs: List[Dict[str, str]] = []
+
+     # Old format: list[(user, assistant), ...]
+     if isinstance(history, list) and history and isinstance(history[0], (tuple, list)) and len(history[0]) == 2:
+         for user, assistant in history:
+             if user:
+                 msgs.append({"role": "user", "content": str(user)})
+             if assistant:
+                 msgs.append({"role": "assistant", "content": str(assistant)})
+         return msgs
+
+     # Newer format: list[{"role": "...", "content": ...}, ...]
+     if isinstance(history, list) and history and isinstance(history[0], dict):
+         for m in history:
+             role = m.get("role")
+             if role not in ("user", "assistant", "system"):
+                 continue
+             text = _content_to_text(m.get("content", ""))
+             if text:
+                 msgs.append({"role": role, "content": text})
+         return msgs
+
+     return []
+
+
+ def _stream_chat(llm: Llama, messages: List[Dict[str, str]]) -> Iterable[str]:
+     # llama-cpp-python yields OpenAI-like streaming chunks.
+     stream = llm.create_chat_completion(
+         messages=messages,
+         temperature=TEMPERATURE,
+         top_p=TOP_P,
+         max_tokens=MAX_TOKENS,
+         stream=True,
+     )
+
+     partial = ""
+     for chunk in stream:
+         token = ""
+         try:
+             choice = chunk["choices"][0]
+             delta = choice.get("delta") or {}
+             token = delta.get("content") or ""
+         except Exception:
+             token = ""
+         if token:
+             partial += token
+             yield partial
+
+
+ def respond(message: str, history: Any):
+     llm = _load_llm()
+
+     msgs: List[Dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
+     prior = _history_to_messages(history)
+
+     # Simple history trim
+     if len(prior) > 20:
+         prior = prior[-20:]
+
+     msgs.extend(prior)
+     msgs.append({"role": "user", "content": message})
+
+     for partial in _stream_chat(llm, msgs):
+         yield partial
+
+
+ demo = gr.ChatInterface(
+     fn=respond,
+     title="GGUF Chatbot (llama-cpp-python)",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
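
Note: for a quick local check of the streaming path outside the Space, the respond generator can be driven directly. The sketch below is a hypothetical helper, not part of the commit; it assumes the app.py above is importable as app, that the GGUF download succeeds, and that llama-cpp-python is installed. MODEL_REPO_ID and MODEL_FILENAME are read via os.getenv, so they can be overridden before the import.

# smoke_test.py — minimal local sketch (hypothetical, not part of this commit).
import os

# Optional override: app.py reads these with os.getenv at import time.
os.environ.setdefault("MODEL_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
os.environ.setdefault("MODEL_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")

from app import respond

final = ""
for partial in respond("Say hello in one short sentence.", history=[]):
    final = partial  # respond yields the growing partial answer, so keep the last value

print(final)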
packages.txt ADDED
@@ -0,0 +1 @@
+ libopenblas0-pthread
requirements.txt ADDED
@@ -0,0 +1 @@
+ llama-cpp-python @ https://huggingface.co/Luigi/llama-cpp-python-wheels-hf-spaces-free-cpu/resolve/main/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
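
Note: the pinned wheel's filename indicates a build for CPython 3.10 on linux x86_64, presumably to avoid compiling llama-cpp-python from source on the free CPU tier, so the Space's Python version must match. A quick sanity check (a sketch, assuming the wheel installed) that the prebuilt package imports and matches the pinned 0.3.16:

# check_llama_cpp.py — hypothetical sanity check, not part of this commit.
import llama_cpp

# Should print 0.3.16 if the pinned wheel from requirements.txt was installed.
print(llama_cpp.__version__)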