import os
import torch
import threading
from transformers import TextIteratorStreamer
from unsloth import FastModel
import gradio as gr

# CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous so CUDA errors
# surface at the failing call; useful for debugging, but it slows inference
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Load the model from Hugging Face Model Hub
model_repo_id = 'adarsh3601/my_gemma3_pt'
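
# Fail fast with a clear error if no GPU is present (assumption: a CUDA device
# is required here, since the model loads in 4-bit and inputs are moved to "cuda")
if not torch.cuda.is_available():
    raise RuntimeError("CUDA GPU not available; this app requires one.")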

# Load model and tokenizer using FastModel
model, tokenizer = FastModel.from_pretrained(
    model_name=model_repo_id,
    max_seq_length=2048,
    load_in_4bit=True,  # Load model with 4-bit quantization
    load_in_8bit=False,
    full_finetuning=False
)

# Optional: compile the model with PyTorch 2.x for a potential speed boost;
# gains may be small since Unsloth already applies its own optimizations
if torch.__version__.startswith("2"):
    model = torch.compile(model)

# Function to generate text with streaming
def generate_text(user_input):
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}]
    }]

    # tokenize=False returns the rendered prompt string (the default returns token IDs);
    # add_special_tokens=False avoids a duplicate BOS, which the template already adds
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt", add_special_tokens=False).to("cuda")

    # TextIteratorStreamer yields decoded text chunks as generate() produces them;
    # skip_prompt drops the echoed input, skip_special_tokens strips e.g. <eos>
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=128,  # Adjust based on desired response length
        do_sample=True,  # Required for temperature/top_p/top_k to take effect
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        streamer=streamer
    )

    # Run generation on a background thread so the streamer can be consumed here
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    for new_text in streamer:
        output += new_text
        yield output  # Yield the accumulated text so the UI updates incrementally
    thread.join()  # The streamer is exhausted once generation finishes; reap the thread
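
# Optional smoke test of the streaming generator (hypothetical prompt):
#   for partial in generate_text("Hello!"):
#       print(partial)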

# Build the Gradio interface; because generate_text is a generator, Gradio
# streams its output automatically (live=True is omitted, as it would re-run
# generation on every keystroke)
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
    outputs=gr.Textbox(lines=10, placeholder="Generated text will appear here..."),
    title="Gemma-3 Model (Streaming)",
    description="A simple interface for the Gemma-3 model that streams output as it is generated."
)
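
# To run locally (assumption: a CUDA GPU plus these dependencies installed):
#   pip install unsloth transformers gradio torch
#   python app.py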

# Launch the app (share=True creates a public link when run locally;
# Hugging Face Spaces ignores it)
if __name__ == "__main__":
    iface.launch(share=True)