Alibrown committed on
Commit
0ce308c
Β·
verified Β·
1 Parent(s): 3e47970

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -19
app.py CHANGED
@@ -1,34 +1,110 @@
1
- import spaces # nur importieren wenn ZeroGPU verfΓΌgbar
2
  import gradio as gr
3
- from transformers import AutoModelForCausalLM, AutoTokenizer
4
  import torch
5
 
6
- MODEL = "HuggingFaceTB/SmolLM2-135M-Instruct"
 
 
 
 
 
 
 
7
 
8
- tokenizer = AutoTokenizer.from_pretrained(MODEL)
 
 
 
9
 
10
- # Fallback: versuche CUDA, sonst CPU
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
- model = AutoModelForCausalLM.from_pretrained(MODEL).to(device)
 
 
 
 
 
 
 
 
 
13
 
14
- @spaces.GPU(duration=30) # ZeroGPU decorator β€” wird ignoriert wenn kein GPU da
15
- def generate(prompt, max_new_tokens=200):
16
- messages = [{"role": "user", "content": prompt}]
17
- text = tokenizer.apply_chat_template(messages, tokenize=False)
 
 
18
  inputs = tokenizer.encode(text, return_tensors="pt").to(device)
19
-
 
 
20
  with torch.no_grad():
21
  outputs = model.generate(
22
  inputs,
23
  max_new_tokens=max_new_tokens,
24
- temperature=0.2,
25
- top_p=0.9,
26
- do_sample=True,
 
27
  )
28
-
29
- # nur neue tokens zurΓΌckgeben
30
- new_tokens = outputs[0][inputs.shape[-1]:]
31
- return tokenizer.decode(new_tokens, skip_special_tokens=True)
32
 
33
- demo = gr.Interface(fn=generate, inputs="text", outputs="text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  demo.launch()
 
1
+ import os
2
  import gradio as gr
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
4
  import torch
5
 
6
# ── Token Resolution (Open Source friendly) ──────────────────────────────────
# Check the common env-var spellings in priority order; first non-empty wins.
_TOKEN_ENV_VARS = ("HF_TOKEN", "TEST_TOKEN", "HUGGINGFACE_TOKEN", "HF_API_TOKEN")
token = next(
    (os.environ.get(name) for name in _TOKEN_ENV_VARS if os.environ.get(name)),
    None,
)

if token:
    print("βœ… HF token loaded")
else:
    print("⚠️ No HF token found β€” running unauthenticated (rate limits apply)")

# ── Model ─────────────────────────────────────────────────────────────────────
MODEL = "HuggingFaceTB/SmolLM2-135M-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"πŸ”§ Device: {device}")

# Load tokenizer and model once at startup; the token is optional and only
# raises rate limits / gated-repo access when present.
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=token)
model = AutoModelForCausalLM.from_pretrained(MODEL, token=token).to(device)
print(f"βœ… Model loaded: {MODEL}")
29
+ # ── Inference ─────────────────────────────────────────────────────────────────
30
def generate(prompt: str, max_new_tokens: int, temperature: float, system_prompt: str):
    """Run chat-style generation on the module-level SmolLM2 model.

    Args:
        prompt: User message; blank input short-circuits with a warning.
        max_new_tokens: Cap on the number of generated tokens.
        temperature: 0 selects greedy decoding; > 0 enables sampling.
        system_prompt: Optional system message prepended to the chat.

    Returns:
        Tuple of (generated_text, stats_line) feeding the two UI outputs.
    """
    if not prompt.strip():
        return "⚠️ Empty prompt", ""

    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)
    input_tokens = inputs.shape[-1]

    # Fix: only pass sampling parameters when sampling is enabled. The old
    # code passed temperature=None / top_p=None in greedy mode, which
    # overrides the model's generation config with None and triggers
    # transformers validation warnings.
    if temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature, "top_p": 0.9}
    else:
        sampling_kwargs = {"do_sample": False}

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,  # silence missing-pad warning
            **sampling_kwargs,
        )

    # Decode only the newly generated continuation, not the echoed prompt.
    new_tokens = outputs[0][input_tokens:]
    result = tokenizer.decode(new_tokens, skip_special_tokens=True)

    stats = f"Input tokens: {input_tokens} | Output tokens: {len(new_tokens)} | Device: {device}"
    return result, stats
59
+
60
+ # ── UI ────────────────────────────────────────────────────────────────────────
61
# Declarative Gradio UI: two-column layout, quick-test examples, and both a
# button click and textbox submit wired to the same generate() handler.
with gr.Blocks(title="SmolLM2 Pipeline Test", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("""
    # πŸ§ͺ SmolLM2-135M Pipeline Test
    `HuggingFaceTB/SmolLM2-135M-Instruct` β€” CPU/ZeroGPU fallback
    """)

    with gr.Row():
        # Left column: inputs and generation controls.
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt (optional)",
                placeholder="You are a helpful assistant.",
                lines=2,
            )
            prompt = gr.Textbox(
                label="User Prompt",
                placeholder="Was ist die Hauptstadt von Deutschland?",
                lines=4,
            )
            with gr.Row():
                max_tokens = gr.Slider(10, 300, value=150, step=10, label="Max New Tokens")
                temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature (0 = greedy)")
            btn = gr.Button("β–Ά Generate", variant="primary")

        # Right column: generated text plus a one-line token/device summary.
        with gr.Column(scale=2):
            output = gr.Textbox(label="Output", lines=10, interactive=False)
            stats = gr.Textbox(label="Stats", lines=1, interactive=False)

    # Quick test examples
    # NOTE(review): example row order is [system, user, max_tokens, temperature],
    # matching `inputs` below — keep the two lists in sync when editing.
    gr.Examples(
        examples=[
            ["You are a helpful assistant.", "What is 2+2? Answer in one sentence.", 50, 0.0],
            ["", "Summarize in one sentence: The Eiffel Tower is a wrought-iron lattice tower in Paris, built in 1889.", 80, 0.2],
            ["You are a JSON API. Respond only with valid JSON.", 'Extract name and age from: "I am Klaus, 34 years old."', 100, 0.0],
            ["", "Write a Python function that reverses a string.", 150, 0.3],
        ],
        inputs=[system_prompt, prompt, max_tokens, temperature],
        label="Quick Tests",
    )

    # Same handler for explicit click and Enter-in-textbox submit; argument
    # order here must match generate(prompt, max_new_tokens, temperature, system_prompt).
    btn.click(fn=generate, inputs=[prompt, max_tokens, temperature, system_prompt], outputs=[output, stats])
    prompt.submit(fn=generate, inputs=[prompt, max_tokens, temperature, system_prompt], outputs=[output, stats])

    # Footer: shows token/model/device state captured at startup.
    gr.Markdown(f"""
    ---
    **Token:** `{'βœ… loaded' if token else '⚠️ not set'}` |
    **Model:** `{MODEL}` |
    **Device:** `{device}`
    """)

demo.launch()