Spaces:

made1570
/

TestingModelAPI

Paused

App Files Community

made1570 committed 25 days ago

Commit

dea3ce7

verified ·

1 Parent(s): c0dceab

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -35

app.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import os
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import gradio as gr
 from unsloth import FastModel
 # Set environment for Hugging Face Spaces
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
@@ -19,49 +20,50 @@ model, tokenizer = FastModel.from_pretrained(
     full_finetuning=False
 )
-# Function to generate text based on user input
 def generate_text(user_input):
-    # Prepare the input as per the model's expected format
     messages = [{
         "role": "user",
-        "content": [{"type" : "text", "text" : user_input}]
     }]
-    text = tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,  # Must add for generation
     )
-    # Generate output with model
-    with torch.no_grad():
-        output = model.generate(
-            **tokenizer([text], return_tensors="pt").to("cuda"),
-            max_new_tokens=512,  # Adjust if you need more tokens
-            temperature=1.0,
-            top_p=0.95,
-            top_k=64,
-            streamer=None  # You can set a streamer if needed
-        )
-    # Decode the model output and return the result
-    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
-    index = decoded_output.lower().find("model")
-    if index != -1:
-        return decoded_output[index + len("model"):].strip()
-    # Fallback: return full decoded output if structure is unexpected
-    return decoded_output
-# Build the Gradio interface
 iface = gr.Interface(
-    fn=generate_text,
-    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
-    outputs=gr.Textbox(lines=2, placeholder="Generated text will appear here..."),
-    title="Gemma-3 Model",
-    description="This is a simple interface to interact with the Gemma-3 model. Enter a prompt and see the generated response."
 )
 # Launch the app
 if __name__ == "__main__":
-    iface.launch(share=True)

 import os
 import torch
+import threading
+from transformers import AutoTokenizer, TextIteratorStreamer
 from unsloth import FastModel
+import gradio as gr
 # Set environment for Hugging Face Spaces
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
     full_finetuning=False
 )
+# Optional: Compile model for speed boost if using PyTorch 2.x
+if torch.__version__.startswith("2"):
+    model = torch.compile(model)
+# Function to generate text with streaming
 def generate_text(user_input):
     messages = [{
         "role": "user",
+        "content": [{"type": "text", "text": user_input}]
     }]
+    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = tokenizer([text], return_tensors="pt").to("cuda")
+    # Set up streaming
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = dict(
+        **inputs,
+        max_new_tokens=256,  # Adjust based on desired response length
+        temperature=1.0,
+        top_p=0.95,
+        top_k=64,
+        streamer=streamer
     )
+    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    output = ""
+    for new_text in streamer:
+        output += new_text
+        yield output
+# Build the Gradio interface with streaming enabled
 iface = gr.Interface(
+    fn=generate_text,
+    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
+    outputs=gr.Textbox(lines=10, placeholder="Generated text will appear here..."),
+    title="Gemma-3 Model (Streaming)",
+    description="This is a simple interface to interact with the Gemma-3 model. Now streams output as it's generated.",
+    live=True  # Enables real-time response updates
 )
 # Launch the app
 if __name__ == "__main__":
+    iface.launch(share=True)