drdudddd commited on
Commit
fbc54cd
·
verified ·
1 Parent(s): 66a650d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -16
app.py CHANGED
@@ -1,31 +1,49 @@
1
  import gradio as gr
2
- from ctransformers import AutoModelForCausalLM
 
 
3
 
4
- print("Lade Modell... bitte warten (CPU Modus)")
5
 
6
- # Wir nutzen 'llama' als Typ, da dies der stabilste CPU-Loader ist
7
- llm = AutoModelForCausalLM.from_pretrained(
8
- "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF",
9
- model_file="DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf",
10
- model_type="llama", # Das verhindert die Suche nach exllama/gpu Modulen
11
- threads=4, # Nutzt die volle Power der HF-CPU
12
- context_length=1024
 
 
 
 
 
13
  )
14
 
15
  def respond(message, history):
16
- # DeepSeek-R1 Prompt-Struktur
17
  prompt = f"User: {message}\nAssistant: <think>\n"
18
 
19
  response = ""
20
- # Generierung mit Streaming
21
- for token in llm(prompt, stream=True):
22
- response += token
23
- yield response
 
 
 
 
 
 
 
 
 
 
24
 
 
25
  demo = gr.ChatInterface(
26
  fn=respond,
27
- title="DeepSeek-R1 CPU (Stable)",
28
- description="Dieser Space nutzt ctransformers ohne GPU-Abhängigkeiten."
29
  )
30
 
31
  if __name__ == "__main__":
 
1
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os

print("Lade DeepSeek-R1 GGUF...")

# Download the GGUF weights from the Hub (cached locally after the first run,
# so restarts are fast).
model_path = hf_hub_download(
    repo_id="unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF",
    filename="DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
)

# Initialise llama.cpp (context window capped at 1024 for CPU stability;
# n_threads=2 matches the small free-tier CPU — raise on bigger hardware).
llm = Llama(
    model_path=model_path,
    n_ctx=1024,
    n_threads=2,
    verbose=False
)
21
 
22
def respond(message, history):
    """Stream a chat completion for *message*, conditioned on *history*.

    Yields the accumulated response text after every generated token so
    Gradio's ChatInterface can render a live stream.

    Fix: the original ignored ``history`` entirely, so the model had no
    memory of prior turns. The prompt is now rebuilt from the conversation.
    """
    prompt = ""
    for turn in history or []:
        # Gradio ChatInterface passes history either as (user, bot) pairs
        # or, in "messages" mode, as dicts with role/content — handle both.
        if isinstance(turn, dict):
            role = "User" if turn.get("role") == "user" else "Assistant"
            prompt += f"{role}: {turn.get('content', '')}\n"
        else:
            user_msg, bot_msg = turn
            prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    # DeepSeek-R1 format: open the reasoning tag so the model starts thinking.
    # NOTE(review): long histories may exceed n_ctx=1024 — consider truncating
    # old turns if that becomes an issue.
    prompt += f"User: {message}\nAssistant: <think>\n"

    response = ""
    # Streaming generation (llama_cpp returns chunk dicts when stream=True)
    stream = llm(
        prompt,
        max_tokens=512,
        stop=["User:", "<|endoftext|>"],
        stream=True,
        temperature=0.7
    )
    for chunk in stream:
        choice = chunk["choices"][0]
        if "text" in choice:
            response += choice["text"]
            yield response
41
 
42
# Gradio chat UI: streams the partial responses yielded by `respond`.
demo = gr.ChatInterface(
    fn=respond,
    title="DeepSeek-R1 CPU (GGUF Safe Mode)",
    description="Läuft stabil auf Hugging Face CPU Hardware."
)
48
 
49
  if __name__ == "__main__":