import os

import torch
import gradio as gr
from unsloth import FastModel

# Set environment for Hugging Face Spaces
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Load the model from the Hugging Face Model Hub
model_repo_id = "adarsh3601/my_gemma3_pt"

# Load model and tokenizer using FastModel
model, tokenizer = FastModel.from_pretrained(
    model_name=model_repo_id,
    max_seq_length=2048,
    load_in_4bit=True,   # Load model with 4-bit quantization
    load_in_8bit=False,
    full_finetuning=False,
)


# Function to generate text based on user input
def generate_text(user_input):
    # Prepare the input in the chat format the model expects
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}],
    }]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,              # Return the prompt as a string, not token IDs
        add_generation_prompt=True,  # Must add for generation
    )

    # Generate output with the model
    with torch.no_grad():
        output = model.generate(
            **tokenizer([text], return_tensors="pt").to("cuda"),
            max_new_tokens=512,  # Adjust if you need more tokens
            temperature=1.0,
            top_p=0.95,
            top_k=64,
            streamer=None,  # You can set a streamer if needed
        )

    # Decode the model output and return the result
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded_output


# Build the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
    outputs=gr.Textbox(lines=2, placeholder="Generated text will appear here..."),
    title="Gemma-3 Model",
    description="This is a simple interface to interact with the Gemma-3 model. Enter a prompt and see the generated response.",
)

# Launch the app
if __name__ == "__main__":
    iface.launch(share=True)
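
# A minimal client-side sketch for exercising the app once it is running.
# This assumes the `gradio_client` package is installed and that gr.Interface
# exposes its default "/predict" endpoint; swap the URL for your Space URL or
# the share link printed by launch(share=True).
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("Why is the sky blue?", api_name="/predict"))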