nguyenbh committed on
Commit 5325553 · 1 Parent(s): 2a7243d
Files changed (2)
  1. app.py +487 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,487 @@
1
+ import gradio as gr
2
+ import json
3
+ import requests
4
+ import urllib.request
5
+ import os
6
+ import ssl
7
+ import base64
8
+ from PIL import Image
9
+ import soundfile as sf
10
+ import mimetypes
11
+ import logging
12
+ from io import BytesIO
13
+ import tempfile
14
+
15
+ # Set up logging
16
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Azure ML endpoint configuration
20
+ url = os.getenv("AZURE_ENDPOINT")
21
+ api_key = os.getenv("AZURE_API_KEY")
22
+
23
+ # Initialize MIME types
24
+ mimetypes.init()
25
+
26
+ def call_aml_endpoint(payload, url, api_key):
27
+ """Call Azure ML endpoint with the given payload."""
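+ # A sketch of the JSON body this function sends, inferred from the code in
+ # this file (only input_data, input_string, and parameters are assumed):
+ #
+ #   {
+ #     "input_data": {
+ #       "input_string": [
+ #         {"role": "user", "content": [{"type": "text", "text": "Hello"}]}
+ #       ],
+ #       "parameters": {"temperature": 0.7}
+ #     }
+ #   }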
28
+ # Allow self-signed HTTPS certificates
29
+ def allow_self_signed_https(allowed):
30
+ if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
31
+ ssl._create_default_https_context = ssl._create_unverified_context
32
+
33
+ allow_self_signed_https(True)
34
+
35
+ # Set parameters (can be adjusted based on your needs)
36
+ parameters = {"temperature": 0.7}
37
+ if "parameters" not in payload["input_data"]:
38
+ payload["input_data"]["parameters"] = parameters
39
+
40
+ # Encode the request body
41
+ body = str.encode(json.dumps(payload))
42
+
43
+ if not api_key:
44
+ raise Exception("A key should be provided to invoke the endpoint")
45
+
46
+ # Set up headers
47
+ headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + api_key)}
48
+
49
+ # Create and send the request
50
+ req = urllib.request.Request(url, body, headers)
51
+
52
+ try:
53
+ logger.info(f"Sending request to {url}")
54
+ response = urllib.request.urlopen(req)
55
+ result = response.read().decode('utf-8')
56
+ logger.info("Received response successfully")
57
+ return json.loads(result)
58
+ except urllib.error.HTTPError as error:
59
+ logger.error(f"Request failed with status code: {error.code}")
60
+ logger.error(f"Headers: {error.info()}")
61
+ error_message = error.read().decode("utf8", 'ignore')
62
+ logger.error(f"Error message: {error_message}")
63
+ return {"error": error_message}
64
+
65
+ def load_audio_from_url(url):
66
+ """Load audio from a URL using soundfile
67
+ Args:
68
+ url (str): URL of the audio file
69
+ Returns:
70
+ tuple: (sample_rate, audio_data) if successful, None otherwise
71
+ str: path to the temporarily saved audio file, or None on failure
72
+ """
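+ # (Note: this helper is not referenced elsewhere in app.py; the audio example
+ # tab below downloads files with requests.get directly.)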
73
+ try:
74
+ # Get the audio file from the URL
75
+ response = requests.get(url)
76
+ response.raise_for_status() # Raise exception for bad status codes
77
+
78
+ # Read formats that soundfile supports directly (WAV, FLAC, etc.)
79
+ audio_data, sample_rate = sf.read(BytesIO(response.content))
80
+
81
+ # Save to a temporary file to be used by the chatbot
82
+ file_extension = os.path.splitext(url)[1].lower()
83
+ if not file_extension:
84
+ file_extension = '.wav' # Default to .wav if no extension
85
+
86
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
87
+ sf.write(temp_file.name, audio_data, sample_rate)
88
+
89
+ return (sample_rate, audio_data), temp_file.name
90
+ except Exception as e:
91
+ logger.error(f"Error loading audio from URL: {e}")
92
+ return None, None
93
+
94
+ def encode_base64_from_file(file_path):
95
+ """Encode file content to base64 string and determine MIME type."""
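+ # Illustrative example (hypothetical file name): encode_base64_from_file("dog.jpg")
+ # would return a pair like ("/9j/4AAQSkZJRg...", "image/jpeg"), which callers
+ # below embed in a data: URL, e.g. "data:image/jpeg;base64,/9j/4AAQSkZJRg...".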
96
+ file_extension = os.path.splitext(file_path)[1].lower()
97
+
98
+ # Map file extensions to MIME types
99
+ if file_extension in ['.jpg', '.jpeg']:
100
+ mime_type = "image/jpeg"
101
+ elif file_extension == '.png':
102
+ mime_type = "image/png"
103
+ elif file_extension == '.gif':
104
+ mime_type = "image/gif"
105
+ elif file_extension in ['.bmp', '.tiff', '.webp']:
106
+ mime_type = f"image/{file_extension[1:]}"
107
+ elif file_extension == '.flac':
108
+ mime_type = "audio/flac"
109
+ elif file_extension == '.wav':
110
+ mime_type = "audio/wav"
111
+ elif file_extension == '.mp3':
112
+ mime_type = "audio/mpeg"
113
+ elif file_extension in ['.m4a', '.aac']:
114
+ mime_type = "audio/aac"
115
+ elif file_extension == '.ogg':
116
+ mime_type = "audio/ogg"
117
+ else:
118
+ mime_type = "application/octet-stream"
119
+
120
+ # Read and encode file content
121
+ with open(file_path, "rb") as file:
122
+ encoded_string = base64.b64encode(file.read()).decode('utf-8')
123
+
124
+ return encoded_string, mime_type
125
+
126
+ def process_message(history, message, conversation_state):
127
+ """Process user message and update both history and internal state."""
128
+ # Extract text and files
129
+ text_content = message["text"] if message["text"] else ""
130
+
131
+ image_files = []
132
+ audio_files = []
133
+
134
+ # Create content array for internal state
135
+ content_items = []
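+ # content_items collects the structured content parts sent to the endpoint.
+ # For a text + image message it ends up looking like (illustrative values):
+ #   [{"type": "text", "text": "What's in this image?"},
+ #    {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}]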
136
+
137
+ # Add text if available
138
+ if text_content:
139
+ content_items.append({"type": "text", "text": text_content})
140
+
141
+ # Process and immediately convert files to base64
142
+ if message["files"] and len(message["files"]) > 0:
143
+ for file_path in message["files"]:
144
+ file_extension = os.path.splitext(file_path)[1].lower()
145
+ file_name = os.path.basename(file_path)
146
+
147
+ # Convert the file to base64 immediately
148
+ base64_content, mime_type = encode_base64_from_file(file_path)
149
+
150
+ # Add to content items for the API
151
+ if mime_type.startswith("image/"):
152
+ content_items.append({
153
+ "type": "image_url",
154
+ "image_url": {
155
+ "url": f"data:{mime_type};base64,{base64_content}"
156
+ }
157
+ })
158
+ image_files.append(file_path)
159
+ elif mime_type.startswith("audio/"):
160
+ content_items.append({
161
+ "type": "audio_url",
162
+ "audio_url": {
163
+ "url": f"data:{mime_type};base64,{base64_content}"
164
+ }
165
+ })
166
+ audio_files.append(file_path)
167
+
168
+ # Only proceed if we have content
169
+ if content_items:
170
+ # Add to Gradio chatbot history (for display)
171
+ history.append({"role": "user", "content": text_content})
172
+
173
+ # Add file messages if present
174
+ for file_path in image_files + audio_files:
175
+ history.append({"role": "user", "content": {"path": file_path}})
176
+
177
+ print(f"DEBUG: history = {history}")
178
+
179
+
180
+ # Add to internal conversation state (with base64 data)
181
+ conversation_state.append({
182
+ "role": "user",
183
+ "content": content_items
184
+ })
185
+
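+ # The textbox is returned disabled here; it is re-enabled by enable_input once
+ # bot_response finishes (see the event wiring near the bottom of this file).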
186
+ return history, gr.MultimodalTextbox(value=None, interactive=False), conversation_state
187
+
188
+ def bot_response(history, conversation_state):
189
+ """Generate bot response based on conversation state."""
190
+ if not conversation_state:
191
+ return history, conversation_state
192
+
193
+ # Create the payload
194
+ payload = {
195
+ "input_data": {
196
+ "input_string": conversation_state
197
+ }
198
+ }
199
+
200
+ # Log the payload for debugging (without base64 data)
201
+ debug_payload = json.loads(json.dumps(payload))
202
+ for item in debug_payload["input_data"]["input_string"]:
203
+ if "content" in item and isinstance(item["content"], list):
204
+ for content_item in item["content"]:
205
+ if "image_url" in content_item:
206
+ parts = content_item["image_url"]["url"].split(",")
207
+ if len(parts) > 1:
208
+ content_item["image_url"]["url"] = parts[0] + ",[BASE64_DATA_REMOVED]"
209
+ if "audio_url" in content_item:
210
+ parts = content_item["audio_url"]["url"].split(",")
211
+ if len(parts) > 1:
212
+ content_item["audio_url"]["url"] = parts[0] + ",[BASE64_DATA_REMOVED]"
213
+
214
+ logger.info(f"Sending payload: {json.dumps(debug_payload, indent=2)}")
215
+
216
+ # Call Azure ML endpoint
217
+ response = call_aml_endpoint(payload, url, api_key)
218
+
219
+ # Extract text response from the Azure ML endpoint response
220
+ try:
221
+ if isinstance(response, dict):
222
+ if "result" in response:
223
+ result = response["result"]
224
+ elif "output" in response:
225
+ # Depending on your API's response format
226
+ if isinstance(response["output"], list) and len(response["output"]) > 0:
227
+ result = response["output"][0]
228
+ else:
229
+ result = str(response["output"])
230
+ elif "error" in response:
231
+ result = f"Error: {response['error']}"
232
+ else:
233
+ # Just return the whole response as string if we can't parse it
234
+ result = f"Received response: {json.dumps(response)}"
235
+ else:
236
+ result = str(response)
237
+ except Exception as e:
238
+ result = f"Error processing response: {str(e)}"
239
+
240
+ # Add bot response to history
241
+ if result == "None":
242
+ result = "The current implementation does not support combining text, audio, and image inputs in the same conversation. Please press the Clear conversation button."
243
+ history.append({"role": "assistant", "content": result})
244
+
245
+ # Add to conversation state
246
+ conversation_state.append({
247
+ "role": "assistant",
248
+ "content": [{"type": "text", "text": result}]
249
+ })
250
+
251
+ print(f"DEBUG: history after response: {history}")
252
+
253
+ return history, conversation_state
254
+
255
+ # Create Gradio demo
256
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
257
+ title = gr.Markdown("# Azure ML Multimodal Chatbot Demo")
258
+ description = gr.Markdown("""
259
+ This demo allows you to interact with a multimodal AI model through Azure ML.
260
+ You can type messages, upload images, or record audio to communicate with the AI.
261
+ """)
262
+
263
+ # Store the conversation state with base64 data
264
+ conversation_state = gr.State([])
265
+
266
+ with gr.Row():
267
+ with gr.Column(scale=4):
268
+ chatbot = gr.Chatbot(
269
+ type="messages",
270
+ avatar_images=(None, "https://upload.wikimedia.org/wikipedia/commons/d/d3/Phi-integrated-information-symbol.png",),
271
+ height=600
272
+ )
273
+
274
+ with gr.Row():
275
+ chat_input = gr.MultimodalTextbox(
276
+ interactive=True,
277
+ file_count="multiple",
278
+ placeholder="Enter a message or upload files (images, audio)...",
279
+ show_label=False,
280
+ sources=["microphone", "upload"],
281
+ )
282
+ with gr.Row():
283
+ clear_btn = gr.ClearButton([chatbot, chat_input], value="Clear conversation")
284
+ clear_btn.click(lambda: [], None, conversation_state) # Also clear the conversation state
285
+ gr.HTML("<div style='text-align: right; margin-top: 5px;'><small>Powered by Azure ML</small></div>")
286
+
287
+ # Define function to handle example submission directly
288
+ def handle_example_submission(text, files, history, conv_state):
289
+ """
290
+ Process an example submission directly including bot response
291
+ This bypasses the regular chat_input.submit flow
292
+ """
293
+ # Create a message object similar to what would be submitted by the user
294
+ message = {"text": text, "files": files if files else []}
295
+
296
+ # Use the same processing function as normal submissions
297
+ new_history, _, new_conv_state = process_message(history, message, conv_state)
298
+
299
+ # Then immediately trigger the bot response
300
+ final_history, final_conv_state = bot_response(new_history, new_conv_state)
301
+
302
+ # No need to re-enable the input box here: the disabling update returned by
+ # process_message is discarded above, and a bare chat_input.update() call has
+ # no effect because its return value is not wired to any output.
304
+
305
+ # Return everything needed
306
+ return final_history, final_conv_state
307
+
308
+ with gr.Column(scale=1):
309
+ gr.Markdown("### Examples")
310
+
311
+ with gr.Tab("Text Only"):
312
+ # For text examples, just submit them directly
313
+ def run_text_example(example_text, history, conv_state):
314
+ # Process the example directly
315
+ return handle_example_submission(example_text, [], history, conv_state)
316
+
317
+ text_examples = gr.Examples(
318
+ examples=[
319
+ ["Tell me about Microsoft Azure cloud services."],
320
+ ["What can you help me with today?"],
321
+ ["Explain the difference between AI and machine learning."],
322
+ ],
323
+ inputs=[gr.Textbox(visible=False)],
324
+ outputs=[chatbot, conversation_state],
325
+ fn=lambda text, h=chatbot, c=conversation_state: run_text_example(text, h, c),
326
+ label="Text Examples (Click to run the example)"
327
+ )
328
+
329
+ with gr.Tab("Text & Audio"):
330
+ # Function to handle loading both text and audio from URL and sending directly
331
+ def run_audio_example(example_text, example_audio_url, history, conv_state):
332
+ try:
333
+ # Download and process the audio from URL
334
+ print(f"Downloading audio from: {example_audio_url}")
335
+ response = requests.get(example_audio_url)
336
+ response.raise_for_status()
337
+
338
+ # Save to a temporary file
339
+ file_extension = os.path.splitext(example_audio_url)[1].lower()
340
+ if not file_extension:
341
+ file_extension = '.wav' # Default to .wav if no extension
342
+
343
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
344
+ temp_file.write(response.content)
345
+ temp_file.close()
346
+
347
+ print(f"Saved audio to temporary file: {temp_file.name}")
348
+
349
+ # Process the example directly
350
+ return handle_example_submission(example_text, [temp_file.name], history, conv_state)
351
+ except Exception as e:
352
+ print(f"Error processing audio example: {e}")
353
+ # If an error occurs, just add the text to history
354
+ history.append({"role": "user", "content": f"{example_text} (Error loading audio: {e})"})
355
+ return history, conv_state
356
+
357
+ audio_examples = gr.Examples(
358
+ examples=[
359
+ ["Transcribe this audio clip", "https://diamondfan.github.io/audio_files/english.weekend.plan.wav"],
360
+ ["What language is being spoken in this recording?", "https://www2.cs.uic.edu/~i101/SoundFiles/BabyElephantWalk60.wav"],
361
+ ],
362
+ inputs=[
363
+ gr.Textbox(visible=False),
364
+ gr.Textbox(visible=False)
365
+ ],
366
+ outputs=[chatbot, conversation_state],
367
+ fn=lambda text, url, h=chatbot, c=conversation_state: run_audio_example(text, url, h, c),
368
+ label="Audio Examples (Click to run the example)"
369
+ )
370
+
371
+ with gr.Tab("Text & Image"):
372
+ # Function to handle loading both text and image from URL and sending directly
373
+ def run_image_example(example_text, example_image_url, history, conv_state):
374
+ try:
375
+ # Download the image from URL
376
+ print(f"Downloading image from: {example_image_url}")
377
+ response = requests.get(example_image_url)
378
+ response.raise_for_status()
379
+
380
+ # Save to a temporary file
381
+ file_extension = os.path.splitext(example_image_url)[1].lower()
382
+ if not file_extension:
383
+ file_extension = '.jpg' # Default to .jpg if no extension
384
+
385
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
386
+ temp_file.write(response.content)
387
+ temp_file.close()
388
+
389
+ print(f"Saved image to temporary file: {temp_file.name}")
390
+
391
+ # Process the example directly
392
+ return handle_example_submission(example_text, [temp_file.name], history, conv_state)
393
+ except Exception as e:
394
+ print(f"Error processing image example: {e}")
395
+ # If an error occurs, just add the text to history
396
+ history.append({"role": "user", "content": f"{example_text} (Error loading image: {e})"})
397
+ return history, conv_state
398
+
399
+ image_examples = gr.Examples(
400
+ examples=[
401
+ ["What's in this image?", "https://storage.googleapis.com/demo-image/dog.jpg"],
402
+ ["Describe this chart", "https://matplotlib.org/stable/_images/sphx_glr_bar_stacked_001.png"],
403
+ ],
404
+ inputs=[
405
+ gr.Textbox(visible=False),
406
+ gr.Textbox(visible=False)
407
+ ],
408
+ outputs=[chatbot, conversation_state],
409
+ fn=lambda text, url, h=chatbot, c=conversation_state: run_image_example(text, url, h, c),
410
+ label="Image Examples (Click to run the example)"
411
+ )
412
+
413
+ gr.Markdown("### Instructions")
414
+ gr.Markdown("""
415
+ - Type a question or statement
416
+ - Upload images or audio files
417
+ - You can combine text with media files
418
+ - The model can analyze images and transcribe audio
419
+ - For best results with images, use JPG or PNG files
420
+ - For audio, use WAV, MP3, or FLAC files
421
+ """)
422
+
423
+ gr.Markdown("### Capabilities")
424
+ gr.Markdown("""
425
+ This chatbot can:
426
+ - Answer questions and provide explanations
427
+ - Describe and analyze images
428
+ - Transcribe and analyze audio content
429
+ - Process multiple inputs in the same message
430
+ - Maintain context throughout the conversation
431
+ """)
432
+
433
+ with gr.Accordion("Debug Info", open=False):
434
+ debug_output = gr.JSON(
435
+ label="Last API Request",
436
+ value={}
437
+ )
438
+
439
+ def update_debug(conversation_state):
440
+ """Update debug output with the last payload that would be sent."""
441
+ if not conversation_state:
442
+ return {}
443
+
444
+ # Create a payload from the conversation
445
+ payload = {
446
+ "input_data": {
447
+ "input_string": conversation_state
448
+ }
449
+ }
450
+
451
+ # Remove base64 data to avoid cluttering the UI
452
+ sanitized_payload = json.loads(json.dumps(payload))
453
+ for item in sanitized_payload["input_data"]["input_string"]:
454
+ if "content" in item and isinstance(item["content"], list):
455
+ for content_item in item["content"]:
456
+ if "image_url" in content_item:
457
+ parts = content_item["image_url"]["url"].split(",")
458
+ if len(parts) > 1:
459
+ content_item["image_url"]["url"] = parts[0] + ",[BASE64_DATA_REMOVED]"
460
+ if "audio_url" in content_item:
461
+ parts = content_item["audio_url"]["url"].split(",")
462
+ if len(parts) > 1:
463
+ content_item["audio_url"]["url"] = parts[0] + ",[BASE64_DATA_REMOVED]"
464
+
465
+ return sanitized_payload
466
+
467
+ def enable_input():
468
+ """Re-enable the input box after bot responds."""
469
+ return gr.MultimodalTextbox(interactive=True)
470
+
471
+ # Set up event handlers
472
+ msg_submit = chat_input.submit(
473
+ process_message, [chatbot, chat_input, conversation_state], [chatbot, chat_input, conversation_state], queue=False
474
+ )
475
+
476
+ msg_response = msg_submit.then(
477
+ bot_response, [chatbot, conversation_state], [chatbot, conversation_state], api_name="bot_response"
478
+ )
479
+
480
+ msg_response.then(enable_input, None, chat_input)
481
+ # btn_response.then(enable_input, None, chat_input)
482
+
483
+ # Update debug info
484
+ # msg_response.then(update_debug, conversation_state, debug_output)
485
+ # btn_response.then(update_debug, conversation_state, debug_output)
486
+
487
+ demo.launch(share=True, debug=True)
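+ # Note: share=True only matters for local runs; a Hugging Face Space already
+ # serves the app publicly, so no share link is created there.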
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ azure-ai-inference==1.0.0b9
2
+ azureml-inference-server-http==1.0.0
3
+ pillow==11.1.0
4
+ soundfile
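+ # Note: app.py also imports gradio and requests, which are not pinned here;
+ # a Gradio Space provides gradio via its SDK, but requests may need to be
+ # added explicitly if it is not already present in the environment.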