# Page-header residue from the Hugging Face Spaces listing (Space status at capture time: Paused).
import os
import threading

# NOTE(review): Unsloth emits a warning unless it is imported before
# transformers, so it is kept first among the third-party imports -- confirm
# against the installed Unsloth version.
from unsloth import FastModel
import gradio as gr
import torch
from transformers import AutoTokenizer, TextIteratorStreamer
# CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous. It is a
# debugging aid (errors surface at the offending call) that costs throughput,
# so it is only applied when the deployment has not already chosen a value.
os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")

# Hugging Face Model Hub repository the weights are loaded from.
model_repo_id = 'adarsh3601/my_gemma3_pt'

# Load model and tokenizer using FastModel.
model, tokenizer = FastModel.from_pretrained(
    model_name=model_repo_id,
    max_seq_length=2048,
    load_in_4bit=True,  # Load model with 4-bit quantization
    load_in_8bit=False,
    full_finetuning=False,
)

# Optional: compile the model when this PyTorch build provides torch.compile.
# A feature check is used instead of matching the version-string prefix, which
# would silently skip compilation on any future major version.
# NOTE(review): model.generate() on the compiled wrapper may still dispatch to
# the uncompiled forward pass -- verify that this step actually speeds up
# generation before relying on it.
if hasattr(torch, "compile"):
    model = torch.compile(model)
# Function to generate text with streaming
def generate_text(user_input):
    """Yield the model's reply to ``user_input`` incrementally.

    The reply is produced by ``model.generate`` running in a background
    thread; each value yielded is the full text accumulated so far, so a
    consumer can simply display the latest value.

    Args:
        user_input: The user's message. ``None``, empty, or whitespace-only
            input yields a single empty string and never touches the model.

    Yields:
        str: The accumulated generated text after each streamed chunk.
    """
    # Nothing to answer: skip tokenization and generation entirely.
    if not user_input or not user_input.strip():
        yield ""
        return

    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}],
    }]
    # The template output is tokenized as text on the next line, so ask for
    # the rendered string explicitly: plain tokenizers default to
    # tokenize=True and would return token ids here instead.
    text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = tokenizer([text], return_tensors="pt").to("cuda")

    # Set up streaming
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=128,  # Adjust based on desired response length
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        streamer=streamer,
    )

    def _run_generation():
        # If generate() fails it never signals end-of-stream, which would
        # leave the consuming loop below blocked forever; close the stream
        # ourselves and let the error propagate in the worker thread.
        try:
            model.generate(**generation_kwargs)
        except Exception:
            streamer.end()
            raise

    thread = threading.Thread(target=_run_generation)
    thread.start()

    output = ""
    for new_text in streamer:
        output += new_text
        yield output

    # The stream is exhausted, so the worker is finishing; reap it.
    thread.join()
# Build the Gradio interface.
# Streaming comes from generate_text being a generator: each yielded value
# replaces the contents of the output box. live=True is deliberately not set:
# it re-runs the function on every input change, which would start a new
# generation for each keystroke instead of one per submitted prompt.
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
    outputs=gr.Textbox(lines=10, placeholder="Generated text will appear here..."),
    title="Gemma-3 Model (Streaming)",
    description="This is a simple interface to interact with the Gemma-3 model. Now streams output as it's generated.",
)
# Entry point: start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    launch_options = {"share": True}  # forwarded unchanged to iface.launch
    iface.launch(**launch_options)