import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load model on CPU
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
)
model.to("cpu")
model.eval()
# Minimal generation: single user prompt, static system prompt
def generate_response(user_prompt):
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}],
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt.strip()}],
        },
    ]
    # return_dict=True is required so the result can be indexed like a dict below
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to("cpu")
    input_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            do_sample=False,
            use_cache=False,
        )
    # Drop the prompt tokens and decode only the newly generated text
    generated_tokens = outputs[0][input_len:]
    decoded = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return decoded.strip()
# Gradio UI
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=3, label="Enter your question"),
    outputs=gr.Textbox(label="Gemma 3n Response"),
    title="🧪 Simple Gemma 3n Demo (CPU)",
    description="Test the Gemma 3n model with minimal output. Max 100 tokens.",
)
if __name__ == "__main__":
    demo.launch()
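# Optional quick check (a sketch, not part of the UI flow): generate_response
# can also be called directly from a Python shell after loading this file, e.g.
#   print(generate_response("What is 2 + 2?"))
# which exercises the model without launching the Gradio server.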