import os
import torch
import threading
from transformers import AutoTokenizer, TextIteratorStreamer
from unsloth import FastModel
import gradio as gr
# Force synchronous CUDA kernel launches so errors surface at the failing call
# (useful when debugging on Hugging Face Spaces; adds a small latency cost)
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# Load the model from Hugging Face Model Hub
model_repo_id = 'adarsh3601/my_gemma3_pt'
# Load model and tokenizer using FastModel
model, tokenizer = FastModel.from_pretrained(
    model_name=model_repo_id,
    max_seq_length=2048,
    load_in_4bit=True,   # Load model with 4-bit quantization
    load_in_8bit=False,
    full_finetuning=False
)
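# Note: 4-bit loading relies on bitsandbytes and requires a CUDA GPU;
# on CPU-only hardware this call would need load_in_4bit=False instead.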
# Optional: Compile model for speed boost if using PyTorch 2.x
if torch.__version__.startswith("2"):
    model = torch.compile(model)
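# torch.compile optimizes lazily, so the first generate() call pays a
# one-time compilation cost before any speedup is visible.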
# Function to generate text with streaming
def generate_text(user_input):
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}]
    }]
    # tokenize=False returns the formatted prompt string; the default (tokenize=True)
    # would return token IDs, which the tokenizer call below could not re-encode
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to("cuda")
    # Set up streaming
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=128,  # Adjust based on desired response length
        do_sample=True,      # Needed so temperature/top_p/top_k actually take effect
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        streamer=streamer
    )
    # Run generate() on a background thread so this thread can consume the streamer
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    # Yield the accumulated text so Gradio re-renders the output box as tokens arrive
    output = ""
    for new_text in streamer:
        output += new_text
        yield output
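# Example (outside Gradio, hypothetical): consuming the generator directly
# prints the partial reply as it grows:
#   for partial in generate_text("Hello!"):
#       print(partial)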
# Build the Gradio interface with streaming enabled
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
    outputs=gr.Textbox(lines=10, placeholder="Generated text will appear here..."),
    title="Gemma-3 Model (Streaming)",
    description="A simple interface for interacting with the Gemma-3 model. Output streams as it is generated.",
    live=True  # Re-runs generation on every input change; the streaming itself comes from the generator fn
)
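# Note: on Gradio 3.x, generator-based streaming also requires enabling the
# request queue (iface.queue()) before launch; Gradio 4+ queues by default.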
# Launch the app
if __name__ == "__main__":
    iface.launch(share=True)