import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"

# Load tokenizer
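# (Gemma 3n is multimodal; an AutoProcessor would be needed for image or
# audio inputs, but the plain tokenizer is enough for this text-only demo.)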
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model on CPU
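# Note: this checkpoint is pre-quantized with bitsandbytes 4-bit, which
# primarily targets CUDA; on a CPU-only machine the load (or the .to("cpu")
# move) may fail. If so, a non-quantized Gemma 3n checkpoint (assumption:
# e.g. google/gemma-3n-E2B-it) can be substituted for model_id.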
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
)
model.to("cpu")
model.eval()

# Minimal generation: a single user prompt with a fixed system prompt
def generate_response(user_prompt):
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}]
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt.strip()}]
        }
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,  # return a dict so input_ids / attention_mask can be accessed below
        return_tensors="pt"
    ).to("cpu")

    input_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
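        # Greedy decoding (do_sample=False) keeps the output deterministic;
        # use_cache=False disables the KV cache, which lowers memory use but
        # makes generation noticeably slower, especially on CPU.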
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            do_sample=False,
            use_cache=False
        )

    generated_tokens = outputs[0][input_len:]
    decoded = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return decoded.strip()
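
# Quick sanity check without the UI (hypothetical example prompt):
#   print(generate_response("What is the capital of France?"))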

# Gradio UI
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=3, label="Enter your question"),
    outputs=gr.Textbox(label="Gemma 3n Response"),
    title="🧪 Simple Gemma 3n Demo (CPU)",
    description="Test the Gemma 3n model with minimal settings; responses are capped at 100 new tokens.",
)

if __name__ == "__main__":
    demo.launch()
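
# To run: save as app.py, install torch, transformers, and gradio, then
#   python app.py
# Gradio prints a local URL (http://127.0.0.1:7860 by default) to open in
# a browser.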