Alibrown committed on
Commit
0ce308c
Β·
verified Β·
1 Parent(s): 3e47970

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -19
app.py CHANGED
@@ -1,34 +1,110 @@
1
- import spaces # nur importieren wenn ZeroGPU verfΓΌgbar
2
  import gradio as gr
3
- from transformers import AutoModelForCausalLM, AutoTokenizer
4
  import torch
5
 
6
- MODEL = "HuggingFaceTB/SmolLM2-135M-Instruct"
 
 
 
 
 
 
 
7
 
8
- tokenizer = AutoTokenizer.from_pretrained(MODEL)
 
 
 
9
 
10
- # Fallback: versuche CUDA, sonst CPU
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
- model = AutoModelForCausalLM.from_pretrained(MODEL).to(device)
 
 
 
 
 
 
 
 
 
13
 
14
- @spaces.GPU(duration=30) # ZeroGPU decorator β€” wird ignoriert wenn kein GPU da
15
- def generate(prompt, max_new_tokens=200):
16
- messages = [{"role": "user", "content": prompt}]
17
- text = tokenizer.apply_chat_template(messages, tokenize=False)
 
 
18
  inputs = tokenizer.encode(text, return_tensors="pt").to(device)
19
-
 
 
20
  with torch.no_grad():
21
  outputs = model.generate(
22
  inputs,
23
  max_new_tokens=max_new_tokens,
24
- temperature=0.2,
25
- top_p=0.9,
26
- do_sample=True,
 
27
  )
28
-
29
- # nur neue tokens zurΓΌckgeben
30
- new_tokens = outputs[0][inputs.shape[-1]:]
31
- return tokenizer.decode(new_tokens, skip_special_tokens=True)
32
 
33
- demo = gr.Interface(fn=generate, inputs="text", outputs="text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  demo.launch()
 
1
+ import os
2
  import gradio as gr
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
4
  import torch
5
 
6
# ── Token Resolution (Open Source friendly) ──────────────────────────────────
# Check the common env-var spellings in priority order; first non-empty wins.
_TOKEN_ENV_VARS = ("HF_TOKEN", "TEST_TOKEN", "HUGGINGFACE_TOKEN", "HF_API_TOKEN")
token = next(
    (os.environ.get(name) for name in _TOKEN_ENV_VARS if os.environ.get(name)),
    None,
)

if token:
    print("βœ… HF token loaded")
else:
    print("⚠️ No HF token found β€” running unauthenticated (rate limits apply)")

# ── Model ─────────────────────────────────────────────────────────────────────
MODEL = "HuggingFaceTB/SmolLM2-135M-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"πŸ”§ Device: {device}")

# Load tokenizer and model once at startup; the token is optional and only
# raises rate limits / gated-repo access when present.
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=token)
model = AutoModelForCausalLM.from_pretrained(MODEL, token=token).to(device)
print(f"βœ… Model loaded: {MODEL}")
29
+ # ── Inference ─────────────────────────────────────────────────────────────────
30
def generate(prompt: str, max_new_tokens: int, temperature: float, system_prompt: str):
    """Run chat-style generation on the module-level SmolLM2 model.

    Args:
        prompt: User message; blank input short-circuits with a warning.
        max_new_tokens: Cap on the number of generated tokens.
        temperature: 0 selects greedy decoding; > 0 enables sampling.
        system_prompt: Optional system message prepended to the chat.

    Returns:
        Tuple of (generated_text, stats_line) feeding the two UI outputs.
    """
    if not prompt.strip():
        return "⚠️ Empty prompt", ""

    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)
    input_tokens = inputs.shape[-1]

    # Fix: only pass sampling parameters when sampling is enabled. The old
    # code passed temperature=None / top_p=None in greedy mode, which
    # overrides the model's generation config with None and triggers
    # transformers validation warnings.
    if temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature, "top_p": 0.9}
    else:
        sampling_kwargs = {"do_sample": False}

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,  # silence missing-pad warning
            **sampling_kwargs,
        )

    # Decode only the newly generated continuation, not the echoed prompt.
    new_tokens = outputs[0][input_tokens:]
    result = tokenizer.decode(new_tokens, skip_special_tokens=True)

    stats = f"Input tokens: {input_tokens} | Output tokens: {len(new_tokens)} | Device: {device}"
    return result, stats
59
+
60
+ # ── UI ────────────────────────────────────────────────────────────────────────
61
# Declarative Gradio UI: two-column layout, quick-test examples, and both a
# button click and textbox submit wired to the same generate() handler.
with gr.Blocks(title="SmolLM2 Pipeline Test", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("""
    # πŸ§ͺ SmolLM2-135M Pipeline Test
    `HuggingFaceTB/SmolLM2-135M-Instruct` β€” CPU/ZeroGPU fallback
    """)

    with gr.Row():
        # Left column: inputs and generation controls.
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt (optional)",
                placeholder="You are a helpful assistant.",
                lines=2,
            )
            prompt = gr.Textbox(
                label="User Prompt",
                placeholder="Was ist die Hauptstadt von Deutschland?",
                lines=4,
            )
            with gr.Row():
                max_tokens = gr.Slider(10, 300, value=150, step=10, label="Max New Tokens")
                temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature (0 = greedy)")
            btn = gr.Button("β–Ά Generate", variant="primary")

        # Right column: generated text plus a one-line token/device summary.
        with gr.Column(scale=2):
            output = gr.Textbox(label="Output", lines=10, interactive=False)
            stats = gr.Textbox(label="Stats", lines=1, interactive=False)

    # Quick test examples
    # NOTE(review): example row order is [system, user, max_tokens, temperature],
    # matching `inputs` below — keep the two lists in sync when editing.
    gr.Examples(
        examples=[
            ["You are a helpful assistant.", "What is 2+2? Answer in one sentence.", 50, 0.0],
            ["", "Summarize in one sentence: The Eiffel Tower is a wrought-iron lattice tower in Paris, built in 1889.", 80, 0.2],
            ["You are a JSON API. Respond only with valid JSON.", 'Extract name and age from: "I am Klaus, 34 years old."', 100, 0.0],
            ["", "Write a Python function that reverses a string.", 150, 0.3],
        ],
        inputs=[system_prompt, prompt, max_tokens, temperature],
        label="Quick Tests",
    )

    # Same handler for explicit click and Enter-in-textbox submit; argument
    # order here must match generate(prompt, max_new_tokens, temperature, system_prompt).
    btn.click(fn=generate, inputs=[prompt, max_tokens, temperature, system_prompt], outputs=[output, stats])
    prompt.submit(fn=generate, inputs=[prompt, max_tokens, temperature, system_prompt], outputs=[output, stats])

    # Footer: shows token/model/device state captured at startup.
    gr.Markdown(f"""
    ---
    **Token:** `{'βœ… loaded' if token else '⚠️ not set'}` |
    **Model:** `{MODEL}` |
    **Device:** `{device}`
    """)

demo.launch()