redhairedshanks1 committed
Commit 1612fe8 · verified · 1 Parent(s): 893d9ce

Update app.py

Files changed (1)
  1. app.py +339 -42
app.py CHANGED
@@ -1,36 +1,194 @@
  import os
  import re
+ import sys
+ import traceback
  import gradio as gr
+
+ from huggingface_hub import (
+     login,
+     HfApi,
+     hf_hub_download,
+     whoami,
+ )
  from llama_cpp import Llama
- from huggingface_hub import login
  from transformers import AutoTokenizer

- # If your model/quant repo is gated, add an HF token as a Space secret named HF_TOKEN
+ """
+ Environment variables you can set in your Space (Settings -> Variables & secrets):
+
+ Required (pick one of these approaches):
+ - GGUF_REPO: The Hugging Face repo that contains your .gguf files
+ - GGUF_FILE: The specific .gguf filename to load from that repo
+
+ Optional (recommended):
+ - MODEL_ID: Base model repo to pull the tokenizer/chat template from.
+   Use the matching family for your quant:
+   - Qwen family: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B or -Qwen-7B
+   - Llama family: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+
+ If MODEL_ID is not set, we will attempt to guess it from GGUF_REPO.
+
+ Other optional env vars:
+ - HF_TOKEN: If your repo is gated/private, add this as a Space secret (read scope).
+ - PREFER_FAMILY: "qwen" or "llama" (only used if we need to guess MODEL_ID). Default: qwen
+ - PREFER_SIZE: "1.5b", "7b", or "8b" (only used if we need to guess MODEL_ID). Default: 1.5b
+ - N_CTX: context window (default 4096)
+ - N_THREADS: CPU threads (default: half your CPU cores, at least 1)
+ - N_BATCH: batch size (default 128)
+ """
+
+ # --------------------
+ # Auth (optional)
+ # --------------------
  HF_TOKEN = os.getenv("HF_TOKEN")
  if HF_TOKEN:
-     login(HF_TOKEN)
-
- # Choose a base model ID (for tokenizer/chat template) and a GGUF file to load
- # Defaults target the Llama 8B distill. You can change these three vars.
- MODEL_ID = os.getenv("MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
- GGUF_REPO = os.getenv("GGUF_REPO", "TheBloke/DeepSeek-R1-Distill-Llama-8B-GGUF")
- GGUF_FILE = os.getenv("GGUF_FILE", "deepseek-r1-distill-llama-8b.Q4_K_M.gguf")
-
- # Load tokenizer (to apply chat template correctly)
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-
- # Load the quantized model via llama.cpp (downloads just the selected file)
- llm = Llama.from_pretrained(
-     repo_id=GGUF_REPO,
-     filename=GGUF_FILE,
-     n_ctx=4096,
-     n_threads=max(1, (os.cpu_count() or 2) // 2),
-     n_batch=128,
-     verbose=False,
- )
+     try:
+         login(HF_TOKEN)
+         try:
+             user = whoami().get("name", "ok")
+             print(f"[auth] Logged into Hugging Face as: {user}")
+         except Exception:
+             print("[auth] Logged in (could not fetch user name).")
+     except Exception as e:
+         print(f"[auth] Failed to login with HF_TOKEN: {e}")
+
+ # --------------------
+ # Config / Defaults
+ # --------------------
+ GGUF_REPO = os.getenv("GGUF_REPO", "").strip()
+ GGUF_FILE = os.getenv("GGUF_FILE", "").strip()
+
+ PREFER_FAMILY = os.getenv("PREFER_FAMILY", "qwen").lower()
+ PREFER_SIZE = os.getenv("PREFER_SIZE", "1.5b").lower()
+
+ # Runtime knobs
+ def _default_threads():
+     try:
+         cores = os.cpu_count() or 2
+         return max(1, cores // 2)  # be gentle on free CPU
+     except Exception:
+         return 1
+
+ N_CTX = int(os.getenv("N_CTX", "4096"))
+ N_THREADS = int(os.getenv("N_THREADS", str(_default_threads())))
+ N_BATCH = int(os.getenv("N_BATCH", "128"))
+
+ # --------------------
+ # Helpers
+ # --------------------
+ api = HfApi()
+
+
+ def repo_exists(repo_id: str) -> bool:
+     try:
+         api.model_info(repo_id)
+         return True
+     except Exception:
+         return False
+
+
+ def pick_q4_file(repo_id: str) -> str:
+     """Choose a reasonable 4-bit GGUF from a repo (prefer Q4_K_M, then Q4_0)."""
+     info = api.model_info(repo_id)
+     ggufs = [s.rfilename for s in info.siblings if s.rfilename.lower().endswith(".gguf")]
+
+     # Prefer Q4_K_M, then any Q4, then Q3 as last resort
+     priority = []
+     for f in ggufs:
+         fl = f.lower()
+         score = 0
+         if "q4_k_m" in fl:
+             score = 100
+         elif "q4_k_s" in fl or "q4_k_l" in fl or "q4_k" in fl:
+             score = 95
+         elif "q4_0" in fl or "q4" in fl:
+             score = 90
+         elif "q3_k_m" in fl or "q3" in fl:
+             score = 70
+         else:
+             score = 10
+         priority.append((score, f))
+
+     if not priority:
+         raise FileNotFoundError(f"No .gguf files found in {repo_id}")
+
+     priority.sort(reverse=True, key=lambda x: x[0])
+     chosen = priority[0][1]
+     return chosen
+
+
+ def guess_model_id_from_repo(repo_id: str) -> str:
+     """Guess a matching tokenizer/chat-template model based on the GGUF repo name."""
+     rid = repo_id.lower()
+     # Family
+     if "qwen" in rid or PREFER_FAMILY == "qwen":
+         # Size
+         if "1.5" in rid or "1_5" in rid or PREFER_SIZE == "1.5b":
+             return "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+         elif "7b" in rid or PREFER_SIZE == "7b":
+             return "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+         else:
+             return "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+     # Llama family
+     if "llama" in rid or PREFER_FAMILY == "llama":
+         return "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+     # Fallback
+     return "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

- def apply_template(history, message):
-     # history is list of [user, assistant]
+
+ def ensure_model_source(repo_id: str | None, filename: str | None) -> tuple[str, str]:
+     """
+     Ensure we have a valid GGUF repo + file.
+     - If both provided, verify they exist.
+     - If only repo provided, pick a reasonable Q4 file.
+     - If none provided, raise with a helpful message.
+     """
+     if repo_id and filename:
+         try:
+             api.model_info(repo_id)  # raises if missing or no access
+         except Exception as e:
+             raise FileNotFoundError(
+                 f"Repo not accessible: {repo_id}\n{e}\n"
+                 "Check the repo id spelling, your HF token, and license access."
+             )
+         # Now check the file exists in the repo
+         info = api.model_info(repo_id)
+         files = {s.rfilename for s in info.siblings}
+         if filename not in files:
+             # Try case-insensitive match
+             lower_map = {s.rfilename.lower(): s.rfilename for s in info.siblings}
+             if filename.lower() in lower_map:
+                 filename = lower_map[filename.lower()]
+             else:
+                 raise FileNotFoundError(
+                     f"File not found in repo: {filename}\n"
+                     f"Available gguf files: {[f for f in files if f.lower().endswith('.gguf')]}"
+                 )
+         return repo_id, filename
+
+     if repo_id and not filename:
+         return repo_id, pick_q4_file(repo_id)
+
+     raise ValueError(
+         "No GGUF_REPO/GGUF_FILE provided. Set them in your Space Variables.\n"
+         "Examples you can try (you must verify these exist and accept access if gated):\n"
+         "  - GGUF_REPO = TheBloke/DeepSeek-R1-Distill-Qwen-7B-GGUF\n"
+         "    GGUF_FILE = deepseek-r1-distill-qwen-7b.Q4_K_M.gguf\n"
+         "  - GGUF_REPO = bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF\n"
+         "    GGUF_FILE = deepseek-r1-distill-qwen-1.5b.Q4_K_M.gguf\n"
+         "  - GGUF_REPO = MaziyarPanahi/DeepSeek-R1-Distill-Llama-8B-GGUF\n"
+         "    GGUF_FILE = deepseek-r1-distill-llama-8b.Q4_K_M.gguf\n"
+     )
+
+
+ def build_tokenizer(model_id: str) -> AutoTokenizer:
+     print(f"[tokenizer] Loading tokenizer/chat template from {model_id}")
+     tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+     return tok
+
+
+ def apply_template(tokenizer: AutoTokenizer, history, message: str) -> str:
+     # history: list of [user, assistant]
      msgs = []
      for u, a in history:
          if u:
@@ -42,25 +200,164 @@ def apply_template(history, message):
          msgs, tokenize=False, add_generation_prompt=True
      )

- def strip_reasoning(text):
-     # Hide DeepSeek-style reasoning tokens if desired
-     return re.sub(r"<\|begin_of_thought\|>.*?<\|end_of_thought\|>", "", text, flags=re.DOTALL)

- def chat_fn(message, history, max_new_tokens, temperature, top_p, show_reasoning):
-     prompt = apply_template(history, message)
-     stream = llm(
-         prompt,
-         max_tokens=int(max_new_tokens),
-         temperature=float(temperature),
-         top_p=float(top_p),
-         stop=[tokenizer.eos_token or "", "<|eot_id|>"],
-         stream=True,
+ def strip_reasoning(text: str) -> str:
+     # Hide DeepSeek-style reasoning tags if present
+     return re.sub(
+         r"<\|begin_of_thought\|>.*?<\|end_of_thought\|>",
+         "",
+         text,
+         flags=re.DOTALL,
+     )
+
+
+ # --------------------
+ # Resolve model + file
+ # --------------------
+ try:
+     GGUF_REPO, GGUF_FILE = ensure_model_source(GGUF_REPO, GGUF_FILE)
+     print(f"[gguf] Using repo: {GGUF_REPO}")
+     print(f"[gguf] Using file: {GGUF_FILE}")
+ except Exception as e:
+     # Fail fast with a clear error; Gradio will show logs
+     print("[startup] Failed to resolve GGUF model source:")
+     print(e)
+     traceback.print_exc()
+     # Provide a minimal dummy UI to show the error instead of crashing Space build
+     def _error_ui():
+         return gr.Markdown(
+             f"Cannot start: {e}\n\n"
+             "Go to Settings → Variables and set GGUF_REPO and GGUF_FILE to a valid GGUF."
+         )
+     with gr.Blocks() as demo:
+         gr.Markdown("# DeepSeek R1 Distill (CPU, GGUF)")
+         _error_ui()
+     if __name__ == "__main__":
+         demo.launch()
+     sys.exit(0)
+
+ # Guess MODEL_ID if not provided
+ MODEL_ID = os.getenv("MODEL_ID", "").strip()
+ if not MODEL_ID:
+     MODEL_ID = guess_model_id_from_repo(GGUF_REPO)
+
+ # --------------------
+ # Download and load
+ # --------------------
+ try:
+     # Download exact file; raises if not found or no access
+     print(f"[download] Fetching {GGUF_FILE} from {GGUF_REPO} ...")
+     model_path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
+     print(f"[download] File ready at: {model_path}")
+ except Exception as e:
+     print("[download] Failed to download the GGUF file:")
+     print(e)
+     traceback.print_exc()
+     # Same graceful error UI
+     def _error_ui():
+         return gr.Markdown(
+             f"Download failed: {e}\n\n"
+             "Check that GGUF_REPO and GGUF_FILE are correct and your HF_TOKEN has access."
+         )
+     with gr.Blocks() as demo:
+         gr.Markdown("# DeepSeek R1 Distill (CPU, GGUF)")
+         _error_ui()
+     if __name__ == "__main__":
+         demo.launch()
+     sys.exit(0)
+
+ # Load tokenizer for chat template
+ try:
+     tokenizer = build_tokenizer(MODEL_ID)
+ except Exception as e:
+     print("[tokenizer] Failed to load tokenizer/chat template:")
+     print(e)
+     traceback.print_exc()
+     # Still try to continue with a naive prompt if tokenizer fails
+     tokenizer = None
+     def naive_template(history, message):
+         # Simple ChatML-like format
+         parts = []
+         for u, a in history:
+             if u:
+                 parts.append(f"<|im_start|>user\n{u}\n<|im_end|>")
+             if a:
+                 parts.append(f"<|im_start|>assistant\n{a}\n<|im_end|>")
+         parts.append(f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n")
+         return "\n".join(parts)
+
+ def make_prompt(history, message):
+     if tokenizer is not None:
+         return apply_template(tokenizer, history, message)
+     return naive_template(history, message)  # type: ignore[name-defined]
+
+ # Load llama.cpp
+ try:
+     llm = Llama(
+         model_path=model_path,
+         n_ctx=N_CTX,
+         n_threads=N_THREADS,
+         n_batch=N_BATCH,
+         n_gpu_layers=0,  # CPU Space
+         verbose=False,
      )
-     raw = ""
-     for part in stream:
-         delta = part["choices"][0]["text"]
-         raw += delta
-         yield raw if show_reasoning else strip_reasoning(raw)
+     print("[llama] Model loaded.")
+ except Exception as e:
+     print("[llama] Failed to load llama.cpp with the downloaded GGUF:")
+     print(e)
+     traceback.print_exc()
+     def _error_ui():
+         return gr.Markdown(f"Failed to load model: {e}")
+     with gr.Blocks() as demo:
+         gr.Markdown("# DeepSeek R1 Distill (CPU, GGUF)")
+         _error_ui()
+     if __name__ == "__main__":
+         demo.launch()
+     sys.exit(0)
+
+ # --------------------
+ # Gradio app
+ # --------------------
+ def chat_fn(message, history, max_new_tokens, temperature, top_p, show_reasoning):
+     try:
+         prompt = make_prompt(history, message)
+         # Common stop markers; eos from tokenizer if available
+         stops = ["<|eot_id|>", "<|im_end|>", "<|end_of_text|>"]
+         try:
+             if tokenizer is not None and getattr(tokenizer, "eos_token", None):
+                 eos = tokenizer.eos_token
+                 if eos and eos not in stops:
+                     stops.append(eos)
+         except Exception:
+             pass
+
+         stream = llm(
+             prompt,
+             max_tokens=int(max_new_tokens),
+             temperature=float(temperature),
+             top_p=float(top_p),
+             stop=stops,
+             stream=True,
+         )
+         raw = ""
+         for part in stream:
+             delta = part["choices"][0]["text"]
+             raw += delta
+             yield raw if show_reasoning else strip_reasoning(raw)
+     except Exception as e:
+         err = f"[error] {type(e).__name__}: {e}"
+         yield err
+
+ header_md = f"""
+ ### DeepSeek R1 Distill (CPU, GGUF)
+ Loaded:
+ - GGUF_REPO: `{GGUF_REPO}`
+ - GGUF_FILE: `{GGUF_FILE}`
+ - Chat template from: `{MODEL_ID}`
+ - n_ctx={N_CTX}, n_threads={N_THREADS}, n_batch={N_BATCH}
+
+ Tip: If you see a 404/403 at startup, set GGUF_REPO/GGUF_FILE correctly and ensure HF_TOKEN has access.
+ """

  demo = gr.ChatInterface(
      fn=chat_fn,
@@ -71,7 +368,7 @@ demo = gr.ChatInterface(
          gr.Checkbox(label="Show reasoning", value=False),
      ],
      title="DeepSeek R1 Distill (CPU, GGUF)",
-     description="Running a distilled DeepSeek R1 on free-tier CPU. Expect slow generation.",
+     description=header_md,
      examples=[
          "Prove that the sum of two even numbers is even.",
          "A train leaves at 3 PM at 60 km/h. Another at 4 PM at 80 km/h. When will the second catch up?",
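A quick way to sanity-check a candidate GGUF repo before wiring it into the Space is to replicate the Q4 preference that pick_q4_file encodes. A minimal standalone sketch (the repo id is one of the illustrative examples from ensure_model_source's error message and may be gated or renamed):

    from huggingface_hub import HfApi

    # List the .gguf files in a candidate repo and rank them with the same
    # rough preference pick_q4_file uses (Q4_K_M first, then other Q4, then Q3).
    api = HfApi()
    info = api.model_info("bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF")
    ggufs = [s.rfilename for s in info.siblings if s.rfilename.lower().endswith(".gguf")]
    ranked = sorted(
        ggufs,
        key=lambda f: ("q4_k_m" in f.lower(), "q4" in f.lower(), "q3" in f.lower()),
        reverse=True,
    )
    print(ranked[0] if ranked else "no .gguf files found")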
 
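The revision also routes every prompt through the tokenizer's chat template via apply_template, falling back to naive_template's ChatML-style format only when the tokenizer fails to load. The exact prompt string the model will see can be inspected directly; a sketch assuming the Qwen-1.5B distill (the default MODEL_ID guess) is reachable:

    from transformers import AutoTokenizer

    # Render a short conversation the same way apply_template does;
    # add_generation_prompt=True appends the assistant header so the
    # model continues as the assistant.
    tok = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
    msgs = [
        {"role": "user", "content": "Hi"},
        {"role": "assistant", "content": "Hello!"},
        {"role": "user", "content": "Prove that the sum of two even numbers is even."},
    ]
    print(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))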