made1570 committed on
Commit dea3ce7 · verified · 1 parent: c0dceab

Update app.py

Files changed (1): app.py +37 -35
app.py CHANGED
@@ -1,8 +1,9 @@
 import os
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import gradio as gr
+import threading
+from transformers import AutoTokenizer, TextIteratorStreamer
 from unsloth import FastModel
+import gradio as gr
 
 # Set environment for Hugging Face Spaces
 os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
@@ -19,49 +20,50 @@ model, tokenizer = FastModel.from_pretrained(
     full_finetuning=False
 )
 
-# Function to generate text based on user input
+# Optional: Compile model for speed boost if using PyTorch 2.x
+if torch.__version__.startswith("2"):
+    model = torch.compile(model)
+
+# Function to generate text with streaming
 def generate_text(user_input):
-    # Prepare the input as per the model's expected format
     messages = [{
         "role": "user",
-        "content": [{"type" : "text", "text" : user_input}]
+        "content": [{"type": "text", "text": user_input}]
     }]
-
-    text = tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True, # Must add for generation
+
+    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = tokenizer([text], return_tensors="pt").to("cuda")
+
+    # Set up streaming
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    generation_kwargs = dict(
+        **inputs,
+        max_new_tokens=256, # Adjust based on desired response length
+        temperature=1.0,
+        top_p=0.95,
+        top_k=64,
+        streamer=streamer
     )
-
-    # Generate output with model
-    with torch.no_grad():
-        output = model.generate(
-            **tokenizer([text], return_tensors="pt").to("cuda"),
-            max_new_tokens=512, # Adjust if you need more tokens
-            temperature=1.0,
-            top_p=0.95,
-            top_k=64,
-            streamer=None # You can set a streamer if needed
-        )
-
-    # Decode the model output and return the result
-    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
 
-    index = decoded_output.lower().find("model")
-    if index != -1:
-        return decoded_output[index + len("model"):].strip()
+    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
 
-    # Fallback: return full decoded output if structure is unexpected
-    return decoded_output
+    output = ""
+    for new_text in streamer:
+        output += new_text
+        yield output
 
-# Build the Gradio interface
+# Build the Gradio interface with streaming enabled
 iface = gr.Interface(
-    fn=generate_text,
-    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
-    outputs=gr.Textbox(lines=2, placeholder="Generated text will appear here..."),
-    title="Gemma-3 Model",
-    description="This is a simple interface to interact with the Gemma-3 model. Enter a prompt and see the generated response."
+    fn=generate_text,
+    inputs=gr.Textbox(lines=2, placeholder="Enter your text here..."),
+    outputs=gr.Textbox(lines=10, placeholder="Generated text will appear here..."),
+    title="Gemma-3 Model (Streaming)",
+    description="This is a simple interface to interact with the Gemma-3 model. Now streams output as it's generated.",
+    live=True # Enables real-time response updates
 )
 
 # Launch the app
 if __name__ == "__main__":
-    iface.launch(share=True)
+    iface.launch(share=True)
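A note on the new generation path, outside the commit itself: in recent transformers releases, tokenizer.apply_chat_template(...) returns token IDs unless tokenize=False is passed, so feeding its result back through tokenizer([text], ...) as the new code does is likely to fail, and temperature/top_p/top_k are typically ignored unless sampling is enabled (via do_sample=True or the model's generation config). Below is a minimal sketch of the streaming pattern this commit adopts, with those two assumptions added; it presumes model and tokenizer were loaded with FastModel.from_pretrained as above.

import threading

from transformers import TextIteratorStreamer

def generate_text(user_input):
    # Gemma-3-style chat message with a list-of-parts content field
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_input}],
    }]

    # Assumption: tokenize=False keeps the result a string so the tokenizer
    # call below works; by default apply_chat_template returns token IDs
    text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = tokenizer([text], return_tensors="pt").to("cuda")

    # The streamer yields decoded text as tokens arrive; skip_prompt
    # drops the echoed input so only the model's reply is streamed
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=256,
        do_sample=True,  # assumption: without it, temperature/top_p/top_k may be ignored
        temperature=1.0,
        top_p=0.95,
        top_k=64,
    )

    # model.generate blocks until finished, so it runs in a background
    # thread while this generator yields partial output to Gradio
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    for new_text in streamer:
        output += new_text
        yield output
    thread.join()  # make sure generation has finished before exiting

Two smaller points worth flagging: torch.compile returns a wrapper module that forwards attribute access, so the threaded model.generate call should still resolve (whether compiling actually speeds up generate here is untested); and Gradio streams from any generator function on submit, so live=True, which re-runs the function on every input change, is not needed for incremental output and may trigger redundant generations while the user is still typing.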