import os
import threading

import torch
import gradio as gr
from transformers import TextIteratorStreamer
from unsloth import FastModel

# Force synchronous CUDA launches for easier debugging on Hugging Face Spaces
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Model repository on the Hugging Face Hub
model_repo_id = "adarsh3601/my_gemma3_pt"

# Load model and tokenizer using Unsloth's FastModel
model, tokenizer = FastModel.from_pretrained(
    model_name=model_repo_id,
    max_seq_length=2048,
    load_in_4bit=True,   # 4-bit quantization to fit the model in limited VRAM
    load_in_8bit=False,
    full_finetuning=False,
)

# Optional: compile the model for a speed boost on PyTorch 2.x
if torch.__version__.startswith("2"):
    model = torch.compile(model)


# Generate text with token-by-token streaming
def generate_text(user_input):
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}],
    }]
    # tokenize=False returns the formatted prompt as a string; it is tokenized below
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer([text], return_tensors="pt").to("cuda")

    # Stream tokens from generate() running in a background thread
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=128,  # Adjust based on desired response length
        do_sample=True,      # Enable sampling so temperature/top_p/top_k take effect
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        streamer=streamer,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the accumulated text so the output box updates as tokens arrive
    output = ""
    for new_text in streamer:
        output += new_text
        yield output
    thread.join()


# Build the Gradio interface; the generator function streams partial output
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
    outputs=gr.Textbox(lines=10, placeholder="Generated text will appear here..."),
    title="Gemma-3 Model (Streaming)",
    description="A simple interface to the Gemma-3 model. Output streams as it is generated.",
)

# Launch the app
if __name__ == "__main__":
    iface.launch(share=True)
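
# A minimal smoke-test sketch (assumes a CUDA GPU and the model loaded above);
# kept commented out so it does not interfere with the Gradio app, and the prompt
# string is only an illustrative placeholder:
#
#     last = ""
#     for partial in generate_text("Explain 4-bit quantization in one sentence."):
#         last = partial
#     print(last)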