akhaliq HF Staff commited on
Commit
609cf38
ยท
verified ยท
1 Parent(s): 4913703

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +359 -0
app.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ I'll create a chat application using the Qwen3-VL-4B-Instruct model that can handle both text and image inputs. This will be a multimodal chatbot that can analyze images and respond to questions about them.
2
+
3
+ ```python
4
+ import gradio as gr
5
+ import torch
6
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
7
+ from PIL import Image
8
+ import numpy as np
9
+ from typing import List, Dict, Any, Optional, Tuple
10
+ import spaces
11
+ import io
12
+ import base64
13
+
14
+ # Initialize the model and processor
15
+ model_id = "Qwen/Qwen2-VL-2B-Instruct" # Using 2B version for better performance on Spaces
16
+
17
+ # Load model with optimizations for inference
18
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
19
+ model_id,
20
+ torch_dtype=torch.bfloat16,
21
+ device_map="auto"
22
+ )
23
+ processor = AutoProcessor.from_pretrained(model_id)
24
+
25
+ @spaces.GPU(duration=60)
26
+ def process_chat_message(
27
+ message: str,
28
+ image: Optional[Image.Image],
29
+ history: List[Dict[str, Any]]
30
+ ) -> str:
31
+ """
32
+ Process a chat message with optional image input using Qwen3-VL model.
33
+
34
+ Args:
35
+ message: The user's text message
36
+ image: Optional PIL Image
37
+ history: Chat history
38
+
39
+ Returns:
40
+ The model's response
41
+ """
42
+ # Prepare the message content
43
+ content = []
44
+
45
+ # Add image if provided
46
+ if image is not None:
47
+ # Convert PIL image to format expected by the model
48
+ content.append({"type": "image", "image": image})
49
+
50
+ # Add text message
51
+ if message:
52
+ content.append({"type": "text", "text": message})
53
+
54
+ # Create the messages format for the model
55
+ messages = []
56
+
57
+ # Add history if exists (text only for simplicity)
58
+ for hist_item in history:
59
+ if hist_item["role"] == "user":
60
+ messages.append({
61
+ "role": "user",
62
+ "content": hist_item.get("content", "")
63
+ })
64
+ elif hist_item["role"] == "assistant":
65
+ messages.append({
66
+ "role": "assistant",
67
+ "content": hist_item.get("content", "")
68
+ })
69
+
70
+ # Add current message
71
+ if content:
72
+ messages.append({
73
+ "role": "user",
74
+ "content": content
75
+ })
76
+
77
+ # Prepare inputs for the model
78
+ text = processor.apply_chat_template(
79
+ messages,
80
+ tokenize=False,
81
+ add_generation_prompt=True
82
+ )
83
+
84
+ if image is not None:
85
+ inputs = processor(
86
+ text=[text],
87
+ images=[image],
88
+ return_tensors="pt"
89
+ ).to(model.device)
90
+ else:
91
+ inputs = processor(
92
+ text=[text],
93
+ return_tensors="pt"
94
+ ).to(model.device)
95
+
96
+ # Generate response
97
+ with torch.no_grad():
98
+ generated_ids = model.generate(
99
+ **inputs,
100
+ max_new_tokens=512,
101
+ temperature=0.7,
102
+ do_sample=True,
103
+ top_p=0.95
104
+ )
105
+
106
+ # Decode the generated response
107
+ generated_ids_trimmed = [
108
+ out_ids[len(in_ids):]
109
+ for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
110
+ ]
111
+
112
+ response = processor.batch_decode(
113
+ generated_ids_trimmed,
114
+ skip_special_tokens=True,
115
+ clean_up_tokenization_spaces=False
116
+ )[0]
117
+
118
+ return response
119
+
120
+ def chat_fn(message: Dict[str, Any], history: List[List[Any]]) -> Tuple[str, List[List[Any]]]:
121
+ """
122
+ Main chat function that processes user input and returns response.
123
+
124
+ Args:
125
+ message: Dictionary containing text and optional files
126
+ history: Chat history as list of [user_msg, assistant_msg] pairs
127
+
128
+ Returns:
129
+ Empty string and updated history
130
+ """
131
+ text = message.get("text", "")
132
+ files = message.get("files", [])
133
+
134
+ # Process image if provided
135
+ image = None
136
+ if files and len(files) > 0:
137
+ try:
138
+ image = Image.open(files[0])
139
+ # Convert RGBA to RGB if necessary
140
+ if image.mode == "RGBA":
141
+ background = Image.new("RGB", image.size, (255, 255, 255))
142
+ background.paste(image, mask=image.split()[3])
143
+ image = background
144
+ except Exception as e:
145
+ print(f"Error loading image: {e}")
146
+ image = None
147
+
148
+ # Convert history to format expected by model
149
+ model_history = []
150
+ for user_msg, assistant_msg in history:
151
+ if isinstance(user_msg, dict):
152
+ model_history.append({"role": "user", "content": user_msg.get("text", "")})
153
+ elif isinstance(user_msg, str):
154
+ model_history.append({"role": "user", "content": user_msg})
155
+
156
+ if assistant_msg:
157
+ model_history.append({"role": "assistant", "content": assistant_msg})
158
+
159
+ # Get response from model
160
+ try:
161
+ response = process_chat_message(text, image, model_history)
162
+ except Exception as e:
163
+ response = f"Sorry, I encountered an error: {str(e)}"
164
+
165
+ # Update history
166
+ if image is not None:
167
+ # Store message with image indicator
168
+ user_message = {"text": text, "image": "[Image uploaded]"}
169
+ else:
170
+ user_message = text
171
+
172
+ history.append([user_message, response])
173
+
174
+ return "", history
175
+
176
+ def retry_fn(history: List[List[Any]]) -> Tuple[str, List[List[Any]]]:
177
+ """Retry the last message."""
178
+ if not history:
179
+ return "", history
180
+
181
+ # Remove last assistant response and regenerate
182
+ last_user_msg = history[-1][0]
183
+ history = history[:-1]
184
+
185
+ # Recreate the message dict
186
+ if isinstance(last_user_msg, dict):
187
+ message = {"text": last_user_msg.get("text", "")}
188
+ else:
189
+ message = {"text": last_user_msg}
190
+
191
+ return chat_fn(message, history)
192
+
193
+ def undo_fn(history: List[List[Any]]) -> List[List[Any]]:
194
+ """Undo the last message."""
195
+ if history:
196
+ return history[:-1]
197
+ return history
198
+
199
+ def clear_fn() -> Tuple[None, List]:
200
+ """Clear the chat."""
201
+ return None, []
202
+
203
+ # Create the Gradio interface
204
+ with gr.Blocks(theme=gr.themes.Soft(), fill_height=True) as demo:
205
+ gr.Markdown(
206
+ """
207
+ # ๐ŸŒŸ Qwen3-VL Multimodal Chat
208
+
209
+ Chat with Qwen3-VL - A powerful vision-language model that can understand and discuss images!
210
+
211
+ **Features:**
212
+ - ๐Ÿ“ Text conversations
213
+ - ๐Ÿ–ผ๏ธ Image understanding and analysis
214
+ - ๐ŸŽจ Visual question answering
215
+ - ๐Ÿ” Detailed image descriptions
216
+
217
+ [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
218
+ """
219
+ )
220
+
221
+ with gr.Row():
222
+ with gr.Column(scale=1):
223
+ gr.Markdown(
224
+ """
225
+ ### ๐Ÿ’ก Tips:
226
+ - Upload an image and ask questions about it
227
+ - Try asking for detailed descriptions
228
+ - Ask about objects, colors, text in images
229
+ - Compare elements within the image
230
+ """
231
+ )
232
+
233
+ gr.Markdown(
234
+ """
235
+ ### ๐Ÿ“ธ Example Prompts:
236
+ - "What's in this image?"
237
+ - "Describe this scene in detail"
238
+ - "What text can you see?"
239
+ - "Count the objects in the image"
240
+ - "What's the mood of this image?"
241
+ """
242
+ )
243
+
244
+ with gr.Column(scale=3):
245
+ chatbot = gr.Chatbot(
246
+ label="Chat",
247
+ type="messages",
248
+ height=500,
249
+ show_copy_button=True,
250
+ bubble_full_width=False,
251
+ avatar_images=[None, "๐Ÿค–"]
252
+ )
253
+
254
+ with gr.Row():
255
+ msg = gr.MultimodalTextbox(
256
+ label="Message",
257
+ placeholder="Type a message or upload an image...",
258
+ file_types=["image"],
259
+ submit_btn=True,
260
+ stop_btn=False
261
+ )
262
+
263
+ with gr.Row():
264
+ retry_btn = gr.Button("๐Ÿ”„ Retry", variant="secondary", size="sm")
265
+ undo_btn = gr.Button("โ†ฉ๏ธ Undo", variant="secondary", size="sm")
266
+ clear_btn = gr.Button("๐Ÿ—‘๏ธ Clear", variant="secondary", size="sm")
267
+
268
+ with gr.Accordion("โš™๏ธ Advanced Settings", open=False):
269
+ gr.Markdown(
270
+ """
271
+ **Model Information:**
272
+ - Model: Qwen3-VL-4B-Instruct
273
+ - Optimized for vision-language tasks
274
+ - Supports multiple languages
275
+ - Best performance with clear, well-lit images
276
+ """
277
+ )
278
+
279
+ # Set up event handlers
280
+ msg.submit(
281
+ chat_fn,
282
+ inputs=[msg, chatbot],
283
+ outputs=[msg, chatbot],
284
+ queue=True
285
+ )
286
+
287
+ retry_btn.click(
288
+ retry_fn,
289
+ inputs=[chatbot],
290
+ outputs=[msg, chatbot],
291
+ queue=True
292
+ )
293
+
294
+ undo_btn.click(
295
+ undo_fn,
296
+ inputs=[chatbot],
297
+ outputs=[chatbot],
298
+ queue=False
299
+ )
300
+
301
+ clear_btn.click(
302
+ clear_fn,
303
+ outputs=[msg, chatbot],
304
+ queue=False
305
+ )
306
+
307
+ # Add examples
308
+ gr.Examples(
309
+ examples=[
310
+ {"text": "Hello! What can you help me with today?"},
311
+ {"text": "Can you describe an image if I upload one?"},
312
+ {"text": "What are your capabilities?"},
313
+ ],
314
+ inputs=msg,
315
+ label="Example Messages"
316
+ )
317
+
318
+ if __name__ == "__main__":
319
+ demo.launch(
320
+ show_error=True,
321
+ share=False,
322
+ debug=True
323
+ )
324
+ ```
325
+
326
+ Now let's create the requirements.txt file:
327
+
328
+ ```
329
+ gradio
330
+ transformers
331
+ torch
332
+ torchvision
333
+ spaces
334
+ Pillow
335
+ numpy
336
+ accelerate
337
+ sentencepiece
338
+ einops
339
+ transformers_stream_generator
340
+ ```
341
+
342
+ This application creates a multimodal chat interface with the following features:
343
+
344
+ 1. **Multimodal Input**: Users can send text messages, images, or both
345
+ 2. **Vision-Language Understanding**: The Qwen3-VL model can analyze images and answer questions about them
346
+ 3. **Chat History**: Maintains conversation context
347
+ 4. **Interactive Controls**: Retry, undo, and clear buttons for better user experience
348
+ 5. **GPU Optimization**: Uses the @spaces.GPU decorator for efficient inference
349
+ 6. **Clean UI**: Professional interface with helpful tips and examples
350
+
351
+ The app can:
352
+ - Describe images in detail
353
+ - Answer questions about image content
354
+ - Count objects in images
355
+ - Read text from images
356
+ - Discuss colors, composition, and mood
357
+ - Maintain conversational context
358
+
359
+ The interface is user-friendly with a clean design and provides guidance on how to use the multimodal capabilities effectively.