|
|
import gradio as gr |
|
|
from pathlib import Path |
|
|
import yt_dlp |
|
|
import logging |
|
|
import librosa |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
import ffmpeg |
|
|
import shutil |
|
|
import tempfile |
|
|
import time |
|
|
|
|
|
|
|
|
# Configure the root logger once at import time so the debug traces emitted
# by analyze_audio (temp-dir creation, download path, cleanup) are visible.
logging.basicConfig(
    level=logging.DEBUG,
)
|
|
|
|
|
# URL prefixes accepted as YouTube links; anything else is rejected before
# any network access happens.
_YOUTUBE_URL_PREFIXES = (
    "https://www.youtube.com/",
    "https://youtube.com/",
    "https://m.youtube.com/",
    "https://music.youtube.com/",
    "https://youtu.be/",
)


def _download_youtube_audio(youtube_url, download_dir):
    """Download the audio track of ``youtube_url`` as an MP3 into ``download_dir``.

    Args:
        youtube_url (str): Already-validated YouTube URL.
        download_dir (pathlib.Path): Existing, empty directory to download into.

    Returns:
        pathlib.Path: Path of the MP3 file produced by the post-processor.

    Raises:
        FileNotFoundError: If no MP3 file exists in ``download_dir`` afterwards.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': str(download_dir / '%(title)s.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'restrictfilenames': True,
        'noplaylist': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.extract_info(youtube_url, download=True)

    # 'restrictfilenames' rewrites the title when building the file name, so
    # the path cannot be rebuilt from info['title']. The directory is fresh
    # and only one video is fetched, so look for the MP3 that was written.
    audio_file = next(download_dir.glob("*.mp3"), None)
    if audio_file is None:
        raise FileNotFoundError(f"No MP3 file was produced in {download_dir}")
    return audio_file


def _summarize_audio_features(audio_file):
    """Return a one-line summary of MFCC, spectral centroid and tempo for ``audio_file``."""
    y, sr = librosa.load(audio_file)
    hop_length = 512
    logging.debug("Using hop_length: %s", hop_length)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop_length)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)

    # Depending on the librosa version, tempo is a scalar or a one-element
    # array; an array cannot be formatted with ':.2f', so normalize to float.
    tempo_bpm = float(np.atleast_1d(tempo)[0])
    mfcc_mean = np.mean(mfcc, axis=1).tolist()[:3]
    spectral_centroid_mean = np.mean(spectral_centroid)

    return (
        f"Audio Features: MFCC (mean of first 3 coeffs): {mfcc_mean}, "
        f"Spectral Centroid: {spectral_centroid_mean:.2f} Hz, "
        f"Tempo: {tempo_bpm:.2f} BPM"
    )


def analyze_audio(youtube_url, input_text, input_image=None, slider_value=50, checkbox_value=False):
    """
    Download YouTube audio, analyze it with librosa, and process the other inputs.

    Intermediate download files live in a temporary directory that is removed
    before returning. The final MP3 is moved to a separate temporary directory
    that is deliberately NOT removed, because the caller (Gradio) reads the
    returned path only after this function has returned.

    Args:
        youtube_url (str | None): YouTube video URL (optional). Surrounding
            whitespace is ignored; empty/None skips the audio step.
        input_text (str): Text input echoed into the result.
        input_image (PIL.Image, optional): Image to brighten by a factor of 1.5.
        slider_value (float): Numerical parameter in the range 0-100.
        checkbox_value (bool): Toggle for enhanced analysis.

    Returns:
        tuple: (processed_text, output_image_display, output_audio, extra_info).
        ``output_audio`` is a file path string or None. On an invalid URL or an
        unexpected failure the tuple is
        (error message, None, None, "Processing failed.").
    """
    temp_dir = Path(tempfile.mkdtemp(prefix="audio_analysis_"))
    output_dir = temp_dir / "downloaded_media"
    output_dir.mkdir(parents=True, exist_ok=True)
    logging.debug("Created temporary directory: %s, output directory: %s", temp_dir, output_dir)

    try:
        youtube_url = (youtube_url or "").strip()

        processed_text = f"Processed: '{input_text}'."
        output_image_display = input_image
        output_audio = None
        extra_info = f"Threshold: {slider_value/100:.2f}"

        if youtube_url:
            if not youtube_url.startswith(_YOUTUBE_URL_PREFIXES):
                return "Error: Invalid YouTube URL", None, None, "Processing failed."

            try:
                downloaded = _download_youtube_audio(youtube_url, output_dir)

                # Move the result out of temp_dir so the cleanup in `finally`
                # does not delete the file whose path is being returned.
                result_dir = Path(tempfile.mkdtemp(prefix="audio_analysis_out_"))
                audio_file = Path(shutil.move(str(downloaded), str(result_dir / downloaded.name)))
                logging.debug("Downloaded audio: %s", audio_file)
                output_audio = str(audio_file)

                features_summary = _summarize_audio_features(audio_file)
                processed_text += f" {features_summary}."
                extra_info += f", Audio: {audio_file.name}"
            except Exception as e:
                # Download/analysis problems are reported in the text output
                # instead of aborting the image and option processing below.
                logging.exception("YouTube download or audio processing error")
                processed_text += f" Error processing YouTube audio: {str(e)}."

        if input_image is not None:
            from PIL import ImageEnhance

            enhancer = ImageEnhance.Brightness(input_image)
            output_image_display = enhancer.enhance(1.5)
            processed_text += " Image processed (brightened)."
        else:
            processed_text += " No image provided."

        processed_text += f" Slider: {slider_value}, Enhanced Analysis: {checkbox_value}."
        if checkbox_value:
            processed_text += " Enhanced analysis enabled."
            if youtube_url and slider_value > 50:
                processed_text += f" High threshold ({slider_value}) applied for deeper analysis."

        return processed_text, output_image_display, output_audio, extra_info

    except Exception as e:
        # UI boundary: surface the failure to the user rather than crashing.
        logging.exception("Error in analyze_audio")
        return f"Error: {str(e)}", None, None, "Processing failed."

    finally:
        # Best-effort cleanup of intermediates; a failure here must never
        # replace the result (or error) being returned above.
        try:
            if temp_dir.exists():
                shutil.rmtree(temp_dir)
                logging.debug("Cleaned up temporary directory: %s", temp_dir)
        except OSError as e:
            logging.error("Error cleaning up temporary directory: %s", e)
|
|
|
|
|
|
|
|
# --- Input widgets -----------------------------------------------------------
# One widget per analyze_audio parameter.

# Optional URL of the video whose audio should be downloaded.
input_youtube_url = gr.Textbox(
    info="Optional: Enter a YouTube URL to download and analyze audio.",
    placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    label="YouTube Video URL",
)

# Free-form text that is echoed into the analysis result.
input_text_component = gr.Textbox(
    info="Type a description or query for processing.",
    placeholder="e.g., Analyze this audio track",
    label="Input Text",
)

# Optional image, delivered to the callback as a PIL image.
input_image_component = gr.Image(
    sources=["upload", "webcam", "clipboard"],
    label="Upload Image (Optional)",
    type="pil",
)

# Integer-stepped value between 0 and 100, starting at 50.
input_slider_component = gr.Slider(
    label="Analysis Threshold",
    info="Adjusts sensitivity of audio feature analysis.",
    minimum=0,
    maximum=100,
    step=1,
    value=50,
)

# Boolean toggle passed through as checkbox_value.
input_checkbox_component = gr.Checkbox(
    info="Toggle for deeper audio feature extraction.",
    label="Enable Enhanced Analysis",
)
|
|
|
|
|
|
|
|
# --- Output widgets ----------------------------------------------------------
# One widget per element of the tuple returned by analyze_audio.
#
# NOTE: `info` is only passed to the Textbox. In the Gradio API generation this
# file targets (it uses `sources=[...]` on gr.Image), the Image, Audio and
# Label constructors do not accept an `info` keyword, so passing it raises
# TypeError while the module is being imported. The former help texts are
# kept as comments next to each widget.

output_text_component = gr.Textbox(
    label="Analysis Results",
    info="Text results including audio feature analysis.",
)

# Processed image output (if provided).
output_image_component = gr.Image(
    label="Processed Image (if any)",
)

# Audio downloaded from YouTube; the callback returns a file path.
output_audio_component = gr.Audio(
    label="Downloaded Audio",
    type="filepath",
)

# Feature analysis details and processing info.
output_label_component = gr.Label(
    label="Analysis Summary",
)
|
|
|
|
|
|
|
|
# Bind the callback to its widgets. The order of `inputs` follows the
# positional parameters of analyze_audio, and the order of `outputs` follows
# the tuple it returns. Each row of `examples` supplies one value per input,
# in the same order.
iface = gr.Interface(
    fn=analyze_audio,
    inputs=[
        input_youtube_url,
        input_text_component,
        input_image_component,
        input_slider_component,
        input_checkbox_component,
    ],
    outputs=[
        output_text_component,
        output_image_component,
        output_audio_component,
        output_label_component,
    ],
    title="YouTube Audio Feature Analysis",
    description=(
        "Download YouTube audio, analyze features with librosa, and process "
        "text/image inputs. Customize with slider and checkbox."
    ),
    examples=[
        ["https://www.youtube.com/watch?v=dQw4w9WgXcQ", "Analyze this track", None, 75, True],
        [None, "Describe a music track", None, 30, False],
        ["https://www.youtube.com/watch?v=9bZkp7q19f0", "Extract audio features", None, 60, True],
    ],
    allow_flagging="never",
    theme=gr.themes.Soft(),
)
|
|
|
|
|
# Start the web UI only when this file is executed as a script, so importing
# the module (e.g. to reuse analyze_audio) does not launch a server.
if __name__ == "__main__":
    iface.launch()