Lasha committed on
Commit
9ee41ab
·
1 Parent(s): 1ee329d

Deploy Music Flamingo Gradio app

Browse files
Files changed (4) hide show
  1. README.md +3 -2
  2. app.py +541 -0
  3. packages.txt +4 -0
  4. requirements.txt +10 -0
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
  title: Music Flamingo
3
- emoji: 🌍
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.49.1
 
8
  app_file: app.py
9
- pinned: false
10
  license: apache-2.0
11
  ---
12
 
 
1
  ---
2
  title: Music Flamingo
3
+ emoji: 🎵
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
+ python_version: 3.12
9
  app_file: app.py
10
+ pinned: true
11
  license: apache-2.0
12
  ---
13
 
app.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ import gradio as gr
3
+ import yt_dlp
4
+ import os
5
+ import tempfile
6
+ import re
7
+ import subprocess
8
+ import socket
9
+ import time
10
+ import atexit
11
+
12
+ from transformers import AutoModel, AutoProcessor
13
+
14
+ PROXY_URL = None
15
+ _tunnel_proc = None
16
+
17
+
18
def _write_temp_key_and_kh(key_str, kh_line):
    """Write an SSH private key and a known_hosts line to two temporary files.

    Args:
        key_str: Private key text. CRLF / CR line endings are normalised to LF
            and a trailing newline is appended when missing.
        kh_line: A single known_hosts entry; surrounding whitespace is stripped.

    Returns:
        ``(key_path, known_hosts_path)``. Both files are created with
        ``delete=False``, so the caller is responsible for removing them.
    """
    key_clean = key_str.replace("\r\n", "\n").replace("\r", "\n")
    if not key_clean.endswith("\n"):
        key_clean += "\n"

    # Context managers guarantee the handles are closed even if a write fails.
    with tempfile.NamedTemporaryFile("w", delete=False) as keyf:
        key_path = keyf.name
        keyf.write(key_clean)
    os.chmod(key_path, 0o600)

    try:
        with tempfile.NamedTemporaryFile("w", delete=False) as khf:
            kh_path = khf.name
            khf.write(kh_line.strip() + "\n")
    except Exception:
        # Do not leave key material behind when the second file cannot be written.
        os.remove(key_path)
        raise

    return key_path, kh_path
32
+
33
+
34
def _validate_private_key(path):
    """Return whether ``ssh-keygen -y -f path`` exits successfully.

    When ``ssh-keygen`` is not on PATH the check cannot run, and the key is
    treated as valid (``True``).
    """
    if shutil.which("ssh-keygen") is None:
        return True

    probe = ["ssh-keygen", "-y", "-f", path]
    try:
        # Output is captured (stderr folded into stdout) and discarded; only
        # the exit status matters.
        subprocess.run(probe, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
    except subprocess.CalledProcessError:
        return False
    return True
42
+
43
+
44
def _ensure_local_socks_tunnel():
    """Start a background ``ssh -D`` tunnel and publish it through ``PROXY_URL``.

    Configuration is read from the environment: ``SSH_SERVER``, ``SSH_PORT``
    (default ``"22"``), ``SSH_PRIVATE_KEY`` and ``SSH_HOSTKEY``. The function is
    a no-op when ``PROXY_URL`` is already set, when a required variable is
    missing, when ``ssh`` is not on PATH, or when the key fails validation.

    On success ``PROXY_URL`` becomes ``"socks5h://127.0.0.1:1080"`` and the
    ssh process is kept in ``_tunnel_proc``. The temporary key and known_hosts
    files are removed on every failure path and at interpreter exit.
    """
    global PROXY_URL, _tunnel_proc
    if PROXY_URL:
        return

    srv = os.getenv("SSH_SERVER")
    port = os.getenv("SSH_PORT", "22")
    key = os.getenv("SSH_PRIVATE_KEY")
    hk = os.getenv("SSH_HOSTKEY")
    if not (srv and key and hk and shutil.which("ssh")):
        return

    key_path, kh_path = _write_temp_key_and_kh(key, hk)

    def _remove_secrets():
        # Best effort: the files may already be gone.
        for leftover in (key_path, kh_path):
            try:
                os.remove(leftover)
            except OSError:
                pass

    if not _validate_private_key(key_path):
        _remove_secrets()
        return

    cmd = [
        "ssh", "-NT", "-p", port, "-i", key_path,
        "-D", "127.0.0.1:1080",
        "-o", "IdentitiesOnly=yes",
        "-o", "ExitOnForwardFailure=yes",
        "-o", "BatchMode=yes",
        "-o", "StrictHostKeyChecking=yes",
        "-o", f"UserKnownHostsFile={kh_path}",
        "-o", "GlobalKnownHostsFile=/dev/null",
        "-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=3",
        srv,
    ]
    with open("/tmp/ssh_tunnel.log", "w") as lf:
        proc = subprocess.Popen(cmd, stdout=lf, stderr=lf)
    _tunnel_proc = proc

    def _shutdown():
        if proc.poll() is None:
            proc.terminate()
        _remove_secrets()

    # Registered before polling so cleanup runs whichever way we leave below.
    atexit.register(_shutdown)

    # Poll the local port until ssh accepts connections (40 attempts).
    for _ in range(40):
        if proc.poll() is not None:
            # ssh exited early (auth failure, port in use, ...).
            _remove_secrets()
            return
        try:
            socket.create_connection(("127.0.0.1", 1080), 0.5).close()
        except OSError:
            time.sleep(0.25)
        else:
            PROXY_URL = "socks5h://127.0.0.1:1080"
            return

    # The port never opened: nothing will use this tunnel, so stop it instead
    # of leaving an orphaned ssh process holding the key.
    _shutdown()
81
+
82
+
83
+ _ensure_local_socks_tunnel()
84
+
85
+
86
+ MODEL_ID = "nvidia/music-flamingo-hf"
87
+ HERO_IMAGE_URL = "https://musicflamingo.github.io/logo-no-bg.png"
88
+ HERO_TITLE = "Music Flamingo: Scaling Music Understanding in Audio Language Models"
89
+ HERO_SUBTITLE = "Upload a song and ask anything — including captions, lyrics, genre, key, chords, or complex questions. Music Flamingo gives detailed answers."
90
+ HERO_AUTHORS = """
91
+ <div style="margin-top: 8px; margin-bottom: 4px; padding: 8px 20px; text-align: center; max-width: 900px; margin-inline: auto;">
92
+ <p style="font-size: 0.95rem; line-height: 1.6; margin-bottom: 10px;">
93
+ <strong>Authors:</strong> Sreyan Ghosh<sup>1,2*</sup>, Arushi Goel<sup>1*</sup>, Lasha Koroshinadze<sup>2**</sup>, Sang-gil Lee<sup>1</sup>, Zhifeng Kong<sup>1</sup>, Joao Felipe Santos<sup>1</sup>,<br>Ramani Duraiswami<sup>2</sup>, Dinesh Manocha<sup>2</sup>, Wei Ping<sup>1</sup>, Mohammad Shoeybi<sup>1</sup>, Bryan Catanzaro<sup>1</sup>
94
+ </p>
95
+ <p style="font-size: 0.88rem; opacity: 0.75; margin-bottom: 8px;">
96
+ <sup>1</sup>NVIDIA, CA, USA | <sup>2</sup>University of Maryland, College Park, USA
97
+ </p>
98
+ <p style="font-size: 0.82rem; opacity: 0.65; font-style: italic; margin-bottom: 6px;">
99
+ *Equally contributed and led the project. Names randomly ordered. **Significant technical contribution.
100
+ </p>
101
+ <p style="font-size: 0.85rem; opacity: 0.7; margin-bottom: 0;">
102
+ <strong>Correspondence:</strong> <a href="mailto:sreyang@umd.edu" style="color: inherit; text-decoration: underline;">sreyang@umd.edu</a>, <a href="mailto:arushig@nvidia.com" style="color: inherit; text-decoration: underline;">arushig@nvidia.com</a>
103
+ </p>
104
+ </div>
105
+ """
106
+ HERO_BADGES = """
107
+ <div style="display: flex; justify-content: center; margin-top: 6px; align-items: center;">
108
+ <div style="display: flex; justify-content: center; flex-wrap: wrap; gap: 8px;">
109
+ <a href="https://arxiv.org/abs/2511.10289"><img src="https://img.shields.io/badge/arXiv-2511.10289-AD1C18" alt="arXiv"></a>
110
+ <a href="https://research.nvidia.com/labs/adlr/MF/"><img src="https://img.shields.io/badge/Demo page-228B22" alt="Demo page"></a>
111
+ <a href="https://github.com/NVIDIA/audio-flamingo"><img src='https://img.shields.io/badge/Github-Audio Flamingo 3-9C276A' alt="Github"></a>
112
+ <a href="https://github.com/NVIDIA/audio-flamingo/stargazers"><img src="https://img.shields.io/github/stars/NVIDIA/audio-flamingo.svg?style=social" alt="Stars"></a>
113
+ <a href="https://huggingface.co/nvidia/music-flamingo">
114
+ <img src="https://img.shields.io/badge/🤗-Checkpoints-ED5A22.svg" alt="Checkpoints">
115
+ </a>
116
+ <a href="https://huggingface.co/datasets/nvidia/MF-Skills">
117
+ <img src="https://img.shields.io/badge/🤗-Dataset: MF--Skills-ED5A22.svg" alt="Dataset">
118
+ </a>
119
+ </div>
120
+ </div>
121
+ """
122
+ APP_CSS = """
123
+ :root {
124
+ --font-sans: ui-sans-serif, system-ui, sans-serif,
125
+ "Apple Color Emoji", "Segoe UI Emoji",
126
+ "Segoe UI Symbol", "Noto Color Emoji";
127
+ --font-mono: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas,
128
+ "Liberation Mono", "Courier New", monospace;
129
+
130
+ --app-font: var(--font-sans);
131
+ }
132
+
133
+ body {
134
+ font-family: var(--app-font);
135
+ }
136
+
137
+ .gradio-container {
138
+ font-family: var(--app-font);
139
+ max-width: 80rem !important; /* Tailwind max-w-7xl (1280px) */
140
+ width: 100%;
141
+ margin-inline: auto; /* mx-auto */
142
+ padding-inline: 1rem; /* px-4 */
143
+ padding-bottom: 64px;
144
+ }
145
+
146
+ .hero {
147
+ display: flex;
148
+ flex-direction: column;
149
+ align-items: center;
150
+ gap: 12px;
151
+ padding: 24px 24px 32px;
152
+ text-align: center;
153
+ }
154
+
155
+ .hero__logo {
156
+ width: 112px;
157
+ height: 112px;
158
+ border-radius: 50%;
159
+ box-shadow: 0 12px 40px rgba(0, 0, 0, 0.15);
160
+ }
161
+
162
+ .hero__title {
163
+ font-size: clamp(2.4rem, 5.4vw, 3.2rem);
164
+ font-weight: 700;
165
+ line-height: 1.5;
166
+ letter-spacing: -0.01em;
167
+ background: linear-gradient(120deg, #ff6bd6 0%, #af66ff 35%, #4e9cff 100%);
168
+ -webkit-background-clip: text;
169
+ background-clip: text;
170
+ color: transparent;
171
+ }
172
+
173
+ .hero__subtitle {
174
+ max-width: none;
175
+ font-size: 1.08rem;
176
+ opacity: 0.8;
177
+ }
178
+
179
+ .tab-nav {
180
+ border-radius: 18px;
181
+ border: 1px solid var(--border-color-primary);
182
+ padding: 6px;
183
+ margin: 0 18px 12px;
184
+ }
185
+
186
+ .tab-nav button {
187
+ border-radius: 12px !important;
188
+ }
189
+
190
+ .tab-nav button[aria-selected="true"] {
191
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
192
+ }
193
+
194
+ .panel-row {
195
+ gap: 24px !important;
196
+ align-items: stretch;
197
+ flex-wrap: wrap;
198
+ }
199
+
200
+ .glass-card {
201
+ border: 1px solid var(--border-color-primary);
202
+ border-radius: 26px;
203
+ padding: 28px;
204
+ box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1);
205
+ display: flex;
206
+ flex-direction: column;
207
+ gap: 18px;
208
+ }
209
+
210
+ /* Glass card content styling */
211
+ .glass-card .gradio-input,
212
+ .glass-card .gradio-output {
213
+ /* Let Gradio handle default styling */
214
+ }
215
+
216
+ .glass-card label {
217
+ font-weight: 600;
218
+ letter-spacing: 0.01em;
219
+ }
220
+
221
+ /* Text input styling */
222
+ .glass-card textarea {
223
+ border-radius: 18px !important;
224
+ }
225
+
226
+ .glass-card textarea:focus {
227
+ box-shadow: 0 0 0 3px rgba(0, 123, 255, 0.25) !important;
228
+ }
229
+
230
+ /* Audio component fix */
231
+ .glass-card [data-testid="Audio"] .wrap {
232
+ /* Let Gradio handle default styling */
233
+ }
234
+
235
+ /* YouTube embed styling */
236
+ .glass-card [data-testid="HTML"] {
237
+ margin: 12px 0;
238
+ }
239
+
240
+ /* Load button styling */
241
+ .glass-card button[variant="secondary"] {
242
+ border-radius: 12px !important;
243
+ font-weight: 500 !important;
244
+ }
245
+
246
+ /* Action button styling */
247
+ .accent-button {
248
+ background: linear-gradient(120deg, #ff6bd6 0%, #8f5bff 45%, #4e9cff 100%) !important;
249
+ border-radius: 14px !important;
250
+ box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15);
251
+ color: #ffffff !important;
252
+ font-weight: 600 !important;
253
+ letter-spacing: 0.01em;
254
+ padding: 0.85rem 1.5rem !important;
255
+ transition: transform 0.18s ease, box-shadow 0.18s ease;
256
+ }
257
+
258
+ .accent-button:hover {
259
+ transform: translateY(-2px);
260
+ box-shadow: 0 8px 25px rgba(0, 0, 0, 0.2);
261
+ }
262
+
263
+ .accent-button:active {
264
+ transform: translateY(0px);
265
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.15);
266
+ }
267
+
268
+ .footer-note {
269
+ text-align: center;
270
+ opacity: 0.6;
271
+ margin-top: 28px;
272
+ font-size: 0.95rem;
273
+ }
274
+ """
275
+ EXAMPLE_YOUTUBE_PROMPTS = [
276
+ [
277
+ "https://youtu.be/ko70cExuzZM",
278
+ "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
279
+ ],
280
+ [
281
+ "https://youtu.be/iywaBOMvYLI",
282
+ "Generate a structured lyric sheet from the input music.",
283
+ ],
284
+ [
285
+ "https://youtu.be/_mTRvJ9fugM",
286
+ "Which line directly precedes the chorus?",
287
+ ],
288
+ ]
289
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
290
+ model = AutoModel.from_pretrained(MODEL_ID, device_map="auto").eval()
291
+
292
+ _youtube_cache = {}
293
+
294
+
295
def clear_youtube_cache():
    """Clear the YouTube audio cache and delete the cached files.

    Each cache entry is ``(file_path, title)``; the directory containing
    ``file_path`` is removed recursively. Removal is best effort: a directory
    that is already gone or cannot be deleted does not stop the remaining
    entries from being processed.
    """
    for file_path, _title in list(_youtube_cache.values()):
        cache_dir = os.path.dirname(file_path)
        if cache_dir:
            # Remove the directory even when the audio file itself has
            # already disappeared, so empty temp dirs do not accumulate.
            shutil.rmtree(cache_dir, ignore_errors=True)
    _youtube_cache.clear()
307
+
308
+
309
def truncate_title(title, max_length=50):
    """Shorten *title* for display, marking the cut with a trailing "...".

    Titles of ``max_length`` characters or fewer are returned unchanged;
    longer ones keep their first ``max_length - 3`` characters followed by
    three dots.
    """
    needs_cut = len(title) > max_length
    return f"{title[: max_length - 3]}..." if needs_cut else title
314
+
315
+
316
# One pattern for every supported URL shape. The ID class is restricted to the
# characters accepted below (letters, digits, "_" and "-") because the result is
# interpolated into HTML by callers; the previous "anything except & = % ?" class
# let quotes and angle brackets through.
_YOUTUBE_ID_RE = re.compile(
    r"(?<![\w-])"  # host must not be the tail of another name (e.g. "notyoutube.com")
    r"(?:youtube\.com/(?:watch\?(?:[^#\s]*&)?v=|embed/|v/|shorts/|live/)"
    r"|youtube-nocookie\.com/embed/"
    r"|youtu\.be/)"
    r"([A-Za-z0-9_-]{11})"
)


def extract_youtube_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Recognised forms: ``youtube.com/watch?v=`` (``v`` may follow other query
    parameters), ``youtu.be/``, ``youtube.com/embed/``,
    ``youtube-nocookie.com/embed/``, ``youtube.com/v/``, ``youtube.com/shorts/``
    and ``youtube.com/live/``, with or without scheme and subdomain.

    Returns:
        The video ID, or ``None`` when *url* is empty/``None`` or no supported
        pattern with a well-formed ID is found.
    """
    if not url:
        return None
    match = _YOUTUBE_ID_RE.search(url)
    return match.group(1) if match else None
331
+
332
+
333
def generate_youtube_embed(url, title="YouTube Video"):
    """Generate responsive YouTube ``<iframe>`` embed HTML from a URL.

    Args:
        url: Any URL understood by ``extract_youtube_id``.
        title: Text for the iframe's ``title`` attribute.

    Returns:
        The embed markup, or ``""`` when no video ID can be extracted.
    """
    from html import escape

    video_id = extract_youtube_id(url)
    if not video_id:
        return ""

    # Both values end up inside HTML attributes, so escape them; the ID comes
    # from user-supplied text.
    safe_id = escape(video_id, quote=True)
    safe_title = escape(str(title), quote=True)

    embed_html = f"""
    <div style="position: relative; width: 100%; height: 0; padding-bottom: 56.25%; border-radius: 12px; overflow: hidden; box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);">
        <iframe
            style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;"
            src="https://www.youtube.com/embed/{safe_id}"
            title="{safe_title}"
            frameborder="0"
            allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
            referrerpolicy="strict-origin-when-cross-origin"
            allowfullscreen>
        </iframe>
    </div>
    """
    return embed_html
353
+
354
+
355
def download_youtube_audio(url, force_reload=False):
    """Download audio from a YouTube URL and return the file path.

    Args:
        url: YouTube URL to fetch.
        force_reload: When true, discard any cached download for *url* and
            fetch it again.

    Returns:
        ``(file_path, status_message)``. ``file_path`` is the path of the
        extracted MP3, or ``None`` on failure; ``status_message`` is a
        user-facing string in both cases. Successful downloads are recorded in
        ``_youtube_cache`` keyed by *url*.
    """
    youtube_regex = re.compile(r"(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/" r"(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})")
    if not url or not youtube_regex.match(url):
        return None, "❌ Invalid YouTube URL format"

    temp_dir = None
    try:
        cached = _youtube_cache.get(url)
        if cached is not None:
            cached_path, cached_title = cached
            if not force_reload and os.path.exists(cached_path):
                return cached_path, f"✅ Using cached: {truncate_title(cached_title)}"
            # Forced reload or stale entry: drop the old download directory.
            shutil.rmtree(os.path.dirname(cached_path), ignore_errors=True)
            del _youtube_cache[url]

        temp_dir = tempfile.mkdtemp()

        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": os.path.join(temp_dir, "%(title)s.%(ext)s"),
            "postprocessors": [
                {
                    "key": "FFmpegExtractAudio",
                    "preferredcodec": "mp3",
                    "preferredquality": "128",
                }
            ],
            "noplaylist": True,
        }
        if PROXY_URL:
            ydl_opts["proxy"] = PROXY_URL

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # One call both fetches the metadata and downloads, instead of a
            # metadata-only request followed by a second full extraction.
            info = ydl.extract_info(url, download=True)
        title = (info or {}).get("title") or "Unknown"

        for file in os.listdir(temp_dir):
            if file.endswith(".mp3"):
                file_path = os.path.join(temp_dir, file)
                message = f"✅ Downloaded: {truncate_title(title)}"
                _youtube_cache[url] = (file_path, title)
                temp_dir = None  # now owned by the cache; do not clean up
                return file_path, message

        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, "❌ Failed to download audio file"

    except Exception as e:
        # Do not leak the partially filled download directory.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None, f"❌ Download error: {str(e)}"
412
+
413
+
414
def infer(audio_path, youtube_url, prompt_text):
    """Answer *prompt_text* about an audio clip and return the response text.

    The audio source is *audio_path* when given; otherwise the audio is
    downloaded from *youtube_url*. If neither is provided, a message asking
    for input is returned.

    Args:
        audio_path: Path of an audio file, or a falsy value.
        youtube_url: YouTube URL used when no file is supplied; may be
            ``None`` or blank.
        prompt_text: The question for the model; ``None`` is treated as "".

    Returns:
        A status line followed by the model's answer, or a "❌ ..." message
        when input is missing, the download fails, or any exception occurs.
    """
    try:
        final_audio_path = None
        status_message = ""
        # The URL field can arrive as None; treat that the same as empty.
        url = (youtube_url or "").strip()

        if audio_path:
            final_audio_path = audio_path
            status_message = "✅ Using audio file"
        elif url:
            final_audio_path, status_message = download_youtube_audio(url)
            if not final_audio_path:
                return status_message
        else:
            return "❌ Please either upload an audio file or provide a YouTube URL."

        conversations = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text or ""},
                        {"type": "audio", "path": final_audio_path},
                    ],
                }
            ]
        ]

        # NOTE: If `conversations` includes audio, apply_chat_template() decodes via load_audio()
        # to MONO float32 at 16 kHz by default. We omit `sampling_rate`, so the 16k default is used.
        # Processor assumes mono 1-D audio; stereo would require code changes. No audio ⇒ no effect here.
        batch = processor.apply_chat_template(
            conversations,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        ).to(model.device)

        gen_ids = model.generate(**batch, max_new_tokens=1024)
        # Keep only the newly generated tokens (drop the echoed prompt).
        inp_len = batch["input_ids"].shape[1]
        new_tokens = gen_ids[:, inp_len:]
        texts = processor.batch_decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)

        result = texts[0] if texts else ""
        return f"{status_message}\n\n{result}"
    except Exception as e:
        return f"❌ Error: {str(e)}"
460
+
461
+
462
def load_youtube_audio(youtube_url):
    """Load YouTube audio into the Audio component and generate the video embed.

    Args:
        youtube_url: URL typed by the user; may be ``None`` or blank.

    Returns:
        ``(audio_path, status_message, embed_html)``. ``audio_path`` is
        ``None`` when the URL is missing or the download fails. The download
        is always forced (``force_reload=True``), bypassing the cache.
    """
    # A cleared textbox may arrive as None; treat that the same as empty.
    url = (youtube_url or "").strip()
    if not url:
        return None, "❌ Please enter a YouTube URL", ""

    embed_html = generate_youtube_embed(url)
    audio_path, message = download_youtube_audio(url, force_reload=True)
    return audio_path or None, message, embed_html
475
+
476
+
477
# Gradio UI: a hero header, an input card (audio upload / YouTube URL / prompt),
# an output card for the model response, and a footer.
with gr.Blocks(css=APP_CSS, theme=gr.themes.Soft(primary_hue="purple", secondary_hue="fuchsia")) as demo:
    gr.HTML(
        f"""
        <div class="hero">
            <img src="{HERO_IMAGE_URL}" alt="Music Flamingo logo" class="hero__logo" />
            <h1 class="hero__title">{HERO_TITLE}</h1>
            <p class="hero__subtitle">{HERO_SUBTITLE}</p>
            {HERO_AUTHORS}
            {HERO_BADGES}
        </div>
        """
    )

    with gr.Tabs(elem_classes="tab-nav"):
        with gr.Row(elem_classes="panel-row"):
            with gr.Column(elem_classes=["glass-card"]):
                gr.Markdown("### 🎵 Audio Input")
                audio_in = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Upload Audio File",
                    show_label=True,
                )
                gr.Markdown("**OR**")
                youtube_url = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...", info="Paste any YouTube URL - we'll extract high-quality audio automatically")
                load_btn = gr.Button("🔄 Load Audio", variant="secondary", size="sm")
                # Hidden until the first "Load Audio" click.
                status_text = gr.Textbox(label="Status", interactive=False, visible=False)
                youtube_embed = gr.HTML(label="Video Preview", visible=False)
                prompt_in = gr.Textbox(
                    label="Prompt",
                    value="Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
                    placeholder="Ask a question about the audio…",
                    lines=6,
                )

                gr.Examples(
                    examples=EXAMPLE_YOUTUBE_PROMPTS,
                    inputs=[youtube_url, prompt_in],
                    label="🎵 Example Prompts",
                )

                btn = gr.Button("Generate Answer", elem_classes="accent-button")
            with gr.Column(elem_classes=["glass-card"]):
                out = gr.Textbox(
                    label="Model Response",
                    lines=25,
                    placeholder="Model answers will appear here with audio-informed insights…",
                )

    # Step 1: reset the player and reveal the status box with a progress
    # message (one update per component; status_text was previously listed
    # twice in `outputs`). Step 2: download. Step 3: reveal the video preview.
    load_btn.click(
        lambda: [None, gr.update(value="🔄 Loading audio...", visible=True)],
        outputs=[audio_in, status_text],
    ).then(
        fn=load_youtube_audio, inputs=[youtube_url], outputs=[audio_in, status_text, youtube_embed]
    ).then(lambda: gr.update(visible=True), outputs=[youtube_embed])

    btn.click(fn=infer, inputs=[audio_in, youtube_url, prompt_in], outputs=out)
    gr.HTML(
        """
        <div class="footer-note">
            © 2025 NVIDIA | Powered by 🤗 Transformers + Gradio
        </div>
        """
    )


if __name__ == "__main__":
    demo.launch(share=True)
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ffmpeg
2
+ libsndfile1
3
+ git
4
+ openssh-client
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/transformers
2
+
3
+ accelerate
4
+ torch
5
+ torchaudio
6
+ librosa
7
+ soundfile
8
+ yt-dlp
9
+ gradio
10
+ pysocks