Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| from typing import Dict, Tuple, Any, Optional | |
| import gradio as gr | |
| from tools.audio_cleaning import remove_noise | |
| from tools.audio_cutting import ( | |
| cut_audio, | |
| mute_time_windows, | |
| extract_segments, | |
| trim_audio, | |
| ) | |
| from tools.audio_info import get_audio_info | |
| from tools.audio_insertion import ( | |
| insert_section, | |
| replace_section, | |
| ) | |
| from tools.combine_tracks import create_medley | |
| from tools.music_understanding import ( | |
| understand_music, | |
| analyze_music_structure, | |
| suggest_cutting_points, | |
| analyze_genre_and_style, | |
| ) | |
| from tools.pitch_alignment import estimate_key, align_songs_by_key, shift_to_key | |
| from tools.stems_separation import ( | |
| separate_audio, | |
| extract_selected_stems, | |
| extract_vocal_non_vocal, | |
| ) | |
| from tools.time_strech import align_songs_by_bpm, stretch_to_bpm | |
| from tools.voice_replacement import replace_voice_wrapper | |
def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
    """
    Transpose an audio file up or down by a whole number of semitones.

    The audio is loaded with ``librosa.load`` at its native sample rate with
    channels preserved, shifted with ``librosa.effects.pitch_shift`` and
    written to a fresh temporary 16-bit PCM WAV file.

    Args:
        audio_path: Location of the input audio; handed directly to ``librosa.load``.
        semitones: Signed shift amount (positive raises the pitch, negative lowers it).
            No range check is performed in this function.

    Returns:
        ``audio_path`` itself when ``semitones`` is 0 (nothing is read or written);
        otherwise the path of the newly created temporary ``.wav`` file.

    Note:
        The temporary file is created with ``delete=False``, so removing it is
        the caller's responsibility.
    """
    # A zero shift is a no-op: skip the heavy imports and return the input as-is.
    if semitones == 0:
        return audio_path

    import librosa

    samples, sample_rate = librosa.load(audio_path, sr=None, mono=False)
    shifted = librosa.effects.pitch_shift(samples, n_steps=semitones, sr=sample_rate)

    import tempfile
    import soundfile as sf

    # 2-D (multi-channel) arrays are transposed before writing — presumably
    # because librosa is channels-first while soundfile is frames-first.
    frames = shifted.T if shifted.ndim == 2 else shifted

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as handle:
        out_path = handle.name
        sf.write(out_path, frames, sample_rate, format="wav", subtype="PCM_16")
        return out_path
def stretch_audio_to_bpm_wrapper(audio_path: str, target_bpm: float) -> str:
    """
    Change the tempo of an audio file so that it matches ``target_bpm``.

    Thin wrapper around ``stretch_to_bpm`` that logs any failure to stdout
    before re-raising it.

    Args:
        audio_path: Path (or URL) of the input audio file.
        target_bpm: Desired tempo in beats per minute.

    Returns:
        The value returned by ``stretch_to_bpm(audio_path, target_bpm)``
        (expected to be the path of the stretched audio file).

    Raises:
        Exception: Any error raised by ``stretch_to_bpm`` is printed and re-raised.
    """
    try:
        return stretch_to_bpm(audio_path, target_bpm)
    except Exception as exc:
        print(f"Error stretching audio to BPM: {exc!s}")
        raise exc
| def extract_selected_stems_wrapper( | |
| audio_path: str, vocals: bool, drums: bool, bass: bool, other: bool | |
| ) -> Tuple[str | None, str | None, str | None, str | None]: | |
| """ | |
| Extract selected stems from an audio file based on user choices. | |
| This function allows selective extraction of specific stems rather than all four stems, | |
| which can save processing time and storage space when only certain elements are needed. | |
| Args: | |
| audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A) | |
| vocals: Whether to extract the vocals stem | |
| drums: Whether to extract the drums stem | |
| bass: Whether to extract the bass stem | |
| other: Whether to extract the other stem | |
| Returns: | |
| tuple[str|None, str|None, str|None, str|None]: Paths to (vocals_file, drums_file, bass_file, other_file) | |
| Examples: | |
| >>> extract_selected_stems_wrapper("song.wav", True, True, False, False) | |
| # Returns {'vocals': 'path/to/vocals.wav', 'drums': 'path/to/drums.wav'} | |
| >>> extract_selected_stems_wrapper("track.mp3", True, False, False, False) | |
| # Returns {'vocals': 'path/to/vocals.wav'} (karaoke preparation) | |
| >>> extract_selected_stems_wrapper("audio.wav", False, True, True, False) | |
| # Returns {'drums': 'path/to/drums.wav', 'bass': 'path/to/bass.wav'} | |
| Note: | |
| At least one stem must be selected for extraction | |
| Uses the same high-quality Demucs model as separate_audio | |
| Processing time is the same as full separation since Demucs extracts all stems internally | |
| """ | |
| stems_to_extract = [] | |
| if vocals: | |
| stems_to_extract.append("vocals") | |
| if drums: | |
| stems_to_extract.append("drums") | |
| if bass: | |
| stems_to_extract.append("bass") | |
| if other: | |
| stems_to_extract.append("other") | |
| if not stems_to_extract: | |
| raise ValueError("At least one stem must be selected for extraction") | |
| try: | |
| results = extract_selected_stems(audio_path, stems_to_extract) | |
| vocals_path = results.get("vocals") | |
| drums_path = results.get("drums") | |
| bass_path = results.get("bass") | |
| other_path = results.get("other") | |
| return vocals_path, drums_path, bass_path, other_path | |
| except Exception as e: | |
| print(f"Error extracting selected stems: {str(e)}") | |
| raise e | |
def extract_vocal_non_vocal_wrapper(audio_path: str) -> Tuple[str, str]:
    """
    Split an audio file into a vocal track and a non-vocal (instrumental) track.

    Delegates entirely to ``extract_vocal_non_vocal`` and adds stdout logging
    of failures.

    Args:
        audio_path: Path (or URL) of the input audio file.

    Returns:
        The value returned by ``extract_vocal_non_vocal`` — expected to be
        ``(vocals_file, instrumental_file)``.

    Raises:
        Exception: Any error from ``extract_vocal_non_vocal`` is printed and re-raised.
    """
    try:
        stems = extract_vocal_non_vocal(audio_path)
    except Exception as exc:
        print(f"Error extracting vocal and non-vocal stems: {exc!s}")
        raise exc
    return stems
| def mute_time_windows_wrapper( | |
| audio_path: str, windows_str: str, format_val: str | |
| ) -> str | None: | |
| """ | |
| Mute specific time windows in an audio file with smooth fade transitions. | |
| This wrapper function parses JSON-formatted time windows and applies muting | |
| with smooth fade in/out transitions to avoid audio artifacts. | |
| Args: | |
| audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A) | |
| windows_str: JSON-formatted string of time windows to mute | |
| Format: "[[start1, end1], [start2, end2], ...]" | |
| Example: "[[1.0, 2.0], [3.5, 4.2]]" mutes 1-2s and 3.5-4.2s | |
| format_val: Output audio format ("wav" or "mp3") | |
| Returns: | |
| Path to the processed audio file with muted sections | |
| Returns None if parsing fails or an error occurs | |
| Examples: | |
| - windows_str="[[10.0, 15.0]]": Mute audio from 10 to 15 seconds | |
| - windows_str="[[0.0, 1.0], [30.0, 31.0]]": Mute intro and outro | |
| - windows_str="[]": No muting applied | |
| Note: | |
| Uses 100ms fade in/out at each mute boundary to prevent audio clicks | |
| Time values are in seconds from the start of the audio file | |
| Windows can overlap or be in any order | |
| """ | |
| try: | |
| windows = eval(windows_str) if windows_str else [] | |
| return mute_time_windows( | |
| audio_path=audio_path, mute_windows=windows, output_format=format_val | |
| ) | |
| except Exception as e: | |
| print(f"Error muting time windows: {str(e)}") | |
| raise e | |
| def extract_segments_wrapper( | |
| audio_path: str, segments_str: str, format_val: str, join: bool | |
| ) -> Tuple[str, str | None, str | None, str | None]: | |
| """ | |
| Extract multiple segments (up to 4 segments) from an audio file and optionally join them. | |
| This wrapper function parses JSON-formatted time segments and extracts | |
| the specified portions from the audio file, with an option to join | |
| them into a single file. | |
| Args: | |
| audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A) | |
| segments_str: JSON-formatted string of time segments to extract | |
| Format: "[[start1, end1], [start2, end2], ...]" | |
| Example: "[[0.0, 10.0], [30.0, 40.0]]" extracts 0-10s and 30-40s | |
| format_val: Output audio format ("wav" or "mp3") | |
| join: Whether to join all segments into a single audio file | |
| True: Creates one file with all segments concatenated | |
| False: Creates separate files for each segment (returns first for Gradio) | |
| Returns: | |
| Path to the processed audio file(s) | |
| If join=True: Path to single joined file | |
| If join=False: Path to first extracted segment (for Gradio compatibility) | |
| Returns None if parsing fails or an error occurs | |
| Examples: | |
| - segments_str="[[10.0, 20.0]]": Extract 10-20 second segment | |
| - segments_str="[[0.0, 5.0], [10.0, 15.0]]": Extract intro and middle section | |
| - join=True: Combine all segments into one continuous file | |
| - join=False: Create separate files (returns first for Gradio) | |
| Note: | |
| Time values are in seconds from the start of the audio file | |
| Segments can overlap or be in any order | |
| When join=False, only the first segment path is returned for Gradio compatibility | |
| All segments are extracted with crossfades to avoid audio artifacts | |
| """ | |
| segments = eval(segments_str) if segments_str else [] | |
| result = extract_segments( | |
| audio_path=audio_path, | |
| segments=segments, | |
| output_format=format_val, | |
| join_segments=join, | |
| ) | |
| # Handle different return types | |
| if isinstance(result, list): | |
| # Return list as tuple (pad with None if needed) | |
| padded_result = result + [None] * (4 - len(result)) | |
| # Ensure first element is a string | |
| first_element = padded_result[0] if padded_result[0] is not None else "" | |
| return first_element, padded_result[1], padded_result[2], padded_result[3] | |
| else: | |
| # Return single result as tuple with None values | |
| return result, None, None, None | |
def analyze_music_structure_wrapper(audio_path: str) -> str:
    """
    Describe the structural sections of a song as text.

    Calls ``analyze_music_structure`` and converts its status dictionary into
    a single string.

    Args:
        audio_path: Path (or URL) of the input audio file.

    Returns:
        ``result["analysis"]`` when ``result["status"]`` equals "success";
        otherwise ``"Error: <message>"`` where the message is ``result["error"]``
        or "Unknown error" when that key is absent.

    Note:
        Exceptions raised by ``analyze_music_structure`` are not caught here.
    """
    outcome = analyze_music_structure(audio_path=audio_path)

    # Report failures as text rather than raising.
    if outcome["status"] != "success":
        return f"Error: {outcome.get('error', 'Unknown error')}"
    return outcome["analysis"]
def understand_music_wrapper(audio_path: str, prompt: str) -> str:
    """
    Answer a free-form question about a piece of music.

    Forwards the audio and the prompt to ``understand_music`` and converts its
    status dictionary into a single string.

    Args:
        audio_path: Path (or URL) of the input audio file.
        prompt: Text describing the analysis to perform, e.g.
            "What instruments are used in this song?". Passed as ``prompt_text``.

    Returns:
        ``result["analysis"]`` when ``result["status"]`` equals "success";
        otherwise ``"Error: <message>"`` where the message is ``result["error"]``
        or "Unknown error" when that key is absent.

    Raises:
        Exception: Any exception raised during the call is printed and re-raised.
    """
    try:
        outcome = understand_music(audio_path=audio_path, prompt_text=prompt)
        if outcome["status"] != "success":
            return f"Error: {outcome.get('error', 'Unknown error')}"
        return outcome["analysis"]
    except Exception as exc:
        print(f"Error: {exc!s}")
        raise exc
def suggest_cutting_points_wrapper(audio_path: str, purpose: str) -> str:
    """
    Suggest places to cut an audio track for a given use case.

    Forwards the request to ``suggest_cutting_points`` and converts its status
    dictionary into a single string.

    Args:
        audio_path: Path (or URL) of the input audio file.
        purpose: Intended use of the cut, passed through unchanged. The options
            advertised by this tool are "general", "dj_mix", "social_media"
            and "ringtone"; the value is not validated here.

    Returns:
        ``result["analysis"]`` when ``result["status"]`` equals "success";
        otherwise ``"Error: <message>"`` where the message is ``result["error"]``
        or "Unknown error" when that key is absent.

    Raises:
        Exception: Any exception raised during the call is printed and re-raised.
    """
    try:
        outcome = suggest_cutting_points(audio_path=audio_path, purpose=purpose)
        if outcome["status"] != "success":
            return f"Error: {outcome.get('error', 'Unknown error')}"
        return outcome["analysis"]
    except Exception as exc:
        print(f"Error: {exc!s}")
        raise exc
def estimate_key_wrapper(audio_path: str) -> str:
    """
    Estimate the musical key of an audio file and format it for display.

    Args:
        audio_path: Path (or URL) of the input audio file.

    Returns:
        The string ``"Estimated Key: <key>"`` where ``<key>`` is whatever
        ``estimate_key`` returned (e.g. ``"Estimated Key: F#"``).

    Raises:
        Exception: Any error from ``estimate_key`` is printed and re-raised.
    """
    try:
        detected = estimate_key(audio_path)
        label = f"Estimated Key: {detected}"
        return label
    except Exception as exc:
        print(f"Error estimating key: {exc!s}")
        raise exc
def align_songs_by_key_wrapper(
    audio1_path: str,
    audio2_path: str,
    target_key: str = "C",
    output_format: str = "wav",
) -> Tuple[str, str]:
    """
    Bring two songs into the same musical key.

    Delegates to ``align_songs_by_key`` with ``output_path`` fixed to "output".

    Args:
        audio1_path: Path (or URL) of the first audio file.
        audio2_path: Path (or URL) of the second audio file.
        target_key: Key both tracks should end up in (default: "C").
            Advertised options: 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G',
            'G#', 'A', 'A#', 'B'. Not validated here.
        output_format: Output audio format, passed through (default: "wav").

    Returns:
        A 2-tuple with the two values produced by ``align_songs_by_key``, in
        the same order as the inputs.

    Raises:
        Exception: Any error from ``align_songs_by_key`` (including a result that
            does not unpack into two values) is printed and re-raised.
    """
    try:
        first_aligned, second_aligned = align_songs_by_key(
            audio1_path=audio1_path,
            audio2_path=audio2_path,
            target_key=target_key,
            output_path="output",
            output_format=output_format,
        )
    except Exception as exc:
        print(f"Error aligning songs by key: {exc!s}", f"Error: {exc!s}")
        raise exc
    return first_aligned, second_aligned
def shift_to_key_wrapper(
    audio_path: str, target_key: str, output_format: str = "wav"
) -> str:
    """
    Pitch-shift an audio file into a target musical key.

    Delegates to ``shift_to_key`` with ``output_path`` fixed to "output".

    Args:
        audio_path: Path (or URL) of the input audio file.
        target_key: Key to shift to. Advertised options: 'C', 'C#', 'D', 'D#',
            'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'. Not validated here.
        output_format: Output audio format, passed through (default: "wav").

    Returns:
        The value returned by ``shift_to_key`` (expected to be the output path).

    Raises:
        Exception: Any error from ``shift_to_key`` is printed and re-raised.
    """
    try:
        return shift_to_key(
            audio_path=audio_path,
            target_key=target_key,
            output_path="output",
            output_format=output_format,
        )
    except Exception as exc:
        print(f"Error shifting to key: {exc!s}")
        raise exc
def separate_audio_mcp(
    audio_path: str,
    output_format: str = "wav",
) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Separate a mix into vocals, drums, bass and other stems.

    Picks runtime settings from the host machine and then calls
    ``separate_audio`` with the "hdemucs_mmi" model:

    - device: "cuda" when ``torch`` imports and reports CUDA available, else "cpu".
    - segment: ``None`` with more than 16 GiB of available RAM, 15 with more
      than 8 GiB, otherwise 10 (also 10 when ``psutil`` is not installed).
    - jobs: CPU count capped at 4 (1 if the count cannot be determined).

    Args:
        audio_path: Path (or URL) of the input audio file.
        output_format: Output format for the stems, passed through (default: "wav").

    Returns:
        ``(vocals, drums, bass, other)`` as returned by ``separate_audio``.

    Raises:
        Exception: Any error raised while probing the host or separating is
            printed and re-raised.
    """
    try:
        # Compute device: use the GPU only when torch is present and sees one.
        try:
            import torch

            device: Optional[str] = "cuda" if torch.cuda.is_available() else "cpu"
        except ImportError:
            device = "cpu"

        # Segment length: smaller chunks on machines with less free memory.
        segment: Optional[int]
        try:
            import psutil

            free_gib = psutil.virtual_memory().available / (1024**3)
        except ImportError:
            segment = 10  # conservative fallback when memory cannot be measured
        else:
            if free_gib > 16:
                segment = None  # leave the choice to the separator
            elif free_gib > 8:
                segment = 15
            else:
                segment = 10

        # Worker count: capped at 4 to keep memory use in check.
        jobs: Optional[int]
        try:
            import os

            jobs = min(os.cpu_count() or 1, 4)
        except Exception:
            jobs = 1

        stems = separate_audio(
            audio_path=audio_path,
            output_path=None,
            output_format=output_format,
            model="hdemucs_mmi",
            device=device,
            segment=segment,
            jobs=jobs,
        )
        vocals, drums, bass, other = stems
        return vocals, drums, bass, other
    except Exception as exc:
        print(f"Error separating audio: {exc!s}")
        raise exc
def pitch_shift_with_semitones_mcp(
    audio_path: str, semitones: int, output_format: str = "wav"
) -> str:
    """
    Shift the pitch of an audio file by a number of semitones (MCP entry point).

    Delegates to ``pitch_shift_with_semitones`` and logs failures to stdout.

    Args:
        audio_path: Path (or URL) of the input audio file.
        semitones: Signed shift amount (positive = higher, negative = lower).
        output_format: Accepted for interface symmetry with the other MCP tools.
            NOTE(review): this value is not forwarded anywhere, so the output
            format is whatever ``pitch_shift_with_semitones`` produces.

    Returns:
        The value returned by ``pitch_shift_with_semitones(audio_path, semitones)``.

    Raises:
        Exception: Any error from the underlying call is printed and re-raised.
    """
    try:
        return pitch_shift_with_semitones(audio_path, semitones)
    except Exception as exc:
        print(f"Error shifting pitch: {exc!s}")
        raise exc
def align_songs_by_bpm_mcp(
    audio1_path: str, audio2_path: str, target_bpm: float, output_format: str = "wav"
) -> Tuple[str, str]:
    """
    Bring two songs to the same tempo (MCP entry point).

    Works in two passes: ``align_songs_by_bpm`` is first called on the pair
    (with ``output_path="output"``), then each of its two results is run
    through ``stretch_to_bpm`` with ``target_bpm``.

    Args:
        audio1_path: Path (or URL) of the first audio file.
        audio2_path: Path (or URL) of the second audio file.
        target_bpm: Tempo, in beats per minute, both tracks are stretched to.
        output_format: Output audio format, passed to both passes (default: "wav").

    Returns:
        ``(aligned_track1, aligned_track2)`` — the ``stretch_to_bpm`` results for
        the first and second track respectively.

    Raises:
        Exception: Any error from either pass is printed and re-raised.
    """
    try:
        paired = align_songs_by_bpm(
            audio1_path=audio1_path,
            audio2_path=audio2_path,
            output_path="output",
            output_format=output_format,
        )
        first_pass, second_pass = paired

        # Second pass: stretch each intermediate result to the requested tempo.
        from tools.time_strech import stretch_to_bpm

        first_final = stretch_to_bpm(first_pass, target_bpm, None, output_format)
        second_final = stretch_to_bpm(second_pass, target_bpm, None, output_format)
        return first_final, second_final
    except Exception as exc:
        print(f"Error aligning songs by BPM: {exc!s}", f"Error: {exc!s}")
        raise exc
def create_medley_mcp(
    vocals_path: str,
    instrumental_path: str,
    output_format: str = "wav",
) -> str:
    """
    Mix a vocal track and an instrumental track into one file (MCP entry point).

    Calls ``create_medley`` with fixed mix settings: vocals gain 0.6,
    instrumental gain 1.2 and the compressor string
    "threshold=-12dB:ratio=3:attack=50:release=200".

    Args:
        vocals_path: Path (or URL) of the vocals audio file.
        instrumental_path: Path (or URL) of the instrumental audio file.
        output_format: "mp3" selects the "libmp3lame" codec at "192k"; any other
            value selects "pcm_s16le" with an empty bitrate (default: "wav").

    Returns:
        The value returned by ``create_medley`` (expected to be the output path).

    Raises:
        Exception: Any error from ``create_medley`` is printed and re-raised.

    Examples:
        >>> create_medley_mcp("vocals.wav", "instrumental.wav", "wav")
        >>> create_medley_mcp("lead.mp3", "backing.mp3", "mp3")
    """
    # Fixed mix levels; they are not exposed as parameters of this tool.
    vocals_gain: float = 0.6
    instrumental_gain: float = 1.2

    try:
        if output_format == "mp3":
            codec, bitrate = "libmp3lame", "192k"
        else:
            codec, bitrate = "pcm_s16le", ""

        return create_medley(
            vocals_path=vocals_path,
            instrumental_path=instrumental_path,
            vocals_gain=vocals_gain,
            instrumental_gain=instrumental_gain,
            compressor="threshold=-12dB:ratio=3:attack=50:release=200",
            audio_codec=codec,
            audio_bitrate=bitrate,
            output_path=None,
        )
    except Exception as exc:
        print(f"Error creating medley: {exc!s}")
        raise exc
def get_audio_info_mcp(audio_path: str) -> Dict[str, Any]:
    """
    Return metadata about an audio file (MCP entry point).

    Delegates to ``get_audio_info`` and logs failures to stdout.

    Args:
        audio_path: Path (or URL) of the input audio file.

    Returns:
        The dictionary returned by ``get_audio_info``. Its exact keys are
        defined by that helper (the tool advertises fields such as duration,
        sample rate and channel count).

    Raises:
        Exception: Any error from ``get_audio_info`` is printed and re-raised.
    """
    try:
        return get_audio_info(audio_path)
    except Exception as exc:
        print(f"Error getting audio info: {exc!s}")
        raise exc
def cut_audio_mcp(
    audio_path: str, start_time: float, end_time: float, output_format: str = "wav"
) -> str:
    """
    Extract the portion of an audio file between two timestamps (MCP entry point).

    Delegates to ``cut_audio`` with ``output_path=None``.

    Args:
        audio_path: Path (or URL) of the input audio file.
        start_time: Start of the segment, in seconds.
        end_time: End of the segment, in seconds.
        output_format: Output audio format, passed through (default: "wav").

    Returns:
        The value returned by ``cut_audio`` (expected to be the segment's path).

    Raises:
        Exception: Any error from ``cut_audio`` is printed and re-raised.

    Note:
        The time range is not validated here; any checks are left to ``cut_audio``.
    """
    try:
        segment_path = cut_audio(
            audio_path=audio_path,
            start_time=start_time,
            end_time=end_time,
            output_path=None,
            output_format=output_format,
        )
    except Exception as exc:
        print(f"Error cutting audio: {exc!s}")
        raise exc
    return segment_path
def trim_audio_mcp(
    audio_path: str,
    trim_start: Optional[float] = None,
    trim_end: Optional[float] = None,
    output_format: str = "wav",
) -> str:
    """
    Trim audio from the beginning and/or end of a file.

    This MCP tool forwards its arguments to `trim_audio` and returns the path that
    helper produces. No output path is supplied, so the helper falls back to its
    default (temporary) location.

    Args:
        audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
        trim_start: Amount to trim from start in seconds (None = no trim from start)
        trim_end: Amount to trim from end in seconds (None = no trim from end)
        output_format: Output format for trimmed audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the trimmed audio file

    Raises:
        Exception: Whatever `trim_audio` raises is logged to stdout and re-raised unchanged.

    Examples:
        >>> trim_audio_mcp("song.wav", 5.0, None, "wav")
        # Returns 'path/to/trimmed.wav' with first 5 seconds removed
        >>> trim_audio_mcp("track.mp3", 2.5, 3.0, "mp3")
        # Returns 'path/to/trimmed.mp3' with 2.5s from start and 3s from end removed

    Note:
        - Can trim from start, end, or both simultaneously
        - The wrapper performs no validation itself; duration checks and any
          fades are whatever `trim_audio` implements
    """
    try:
        return trim_audio(
            audio_path=audio_path,
            trim_start=trim_start,
            trim_end=trim_end,
            output_path=None,  # let the helper choose its default temp location
            output_format=output_format,
        )
    except Exception as exc:
        print(f"Error trimming audio: {exc}")
        # Bare raise re-raises the active exception without adding a second
        # traceback entry for this frame.
        raise
def analyze_genre_and_style_mcp(audio_path: str) -> str:
    """
    Produce a textual genre and production-style analysis of an audio file.

    This MCP tool calls `analyze_genre_and_style` with the given path (no in-memory
    audio, no YouTube URL, and the placeholder filename "audio") and unwraps the
    result dictionary into a plain string. The app's UI text attributes this
    analysis to NVIDIA's Music-Flamingo model.

    Args:
        audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)

    Returns:
        The helper's "analysis" text when it reports status "success". Per the
        existing tool description this covers genre and sub-genres,
        instrumentation, production style, era/cultural influences and
        comparisons with similar styles. When the status is anything else, the
        string "Error: <message>" is returned instead, with "Unknown error" used
        if the helper supplied no message.

    Examples:
        >>> analyze_genre_and_style_mcp("song.wav")
        # Returns detailed analysis: "This track exhibits characteristics of
        # synth-pop with elements of 1980s new wave..."

    Note:
        - A non-success status is reported through the returned string, not by raising
        - Exceptions raised while calling the helper or reading its result are
          printed and then re-raised
        - Analysis time and network requirements depend on the underlying model backend
    """
    try:
        outcome = analyze_genre_and_style(
            audio_path=audio_path,
            audio_file=None,
            filename="audio",
            youtube_url=None,
        )
        # Guard clause: anything other than an explicit success becomes an error string.
        if outcome["status"] != "success":
            return f"Error: {outcome.get('error', 'Unknown error')}"
        return outcome["analysis"]
    except Exception as exc:
        print(f"Error analyzing genre and style: {str(exc)}")
        raise exc
def remove_noise_mcp(
    audio_path: str,
    noise_type: str = "general",
    sensitivity: float = 0.5,
    output_format: str = "wav",
) -> str:
    """
    Clean an audio file by reducing a chosen kind of noise.

    This MCP wrapper forwards its arguments to `remove_noise` and returns the path
    that helper produces. No output path is supplied, so the helper decides where
    the cleaned file is written.

    Args:
        audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
        noise_type: Type of noise to remove ('general', 'hiss', 'hum', 'rumble', 'background')
        sensitivity: Noise reduction sensitivity (0.0 to 1.0, default: 0.5)
        output_format: Output format for the cleaned audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the cleaned audio file

    Examples:
        >>> remove_noise_mcp("noisy_recording.wav", "hiss", 0.7, "wav")
        # Returns path to cleaned audio with reduced hiss
        >>> remove_noise_mcp("podcast.mp3", "background", 0.3, "mp3")
        # Returns path to cleaned audio with reduced background noise

    Note:
        - Higher sensitivity values remove more noise but may affect audio quality
        - The filtering itself is implemented by `remove_noise`; this wrapper
          neither validates nor clamps its arguments
        - Any exception from the helper is printed and then re-raised
    """
    try:
        return remove_noise(
            audio_path=audio_path,
            noise_type=noise_type,
            sensitivity=sensitivity,
            output_format=output_format,
            output_path=None,
        )
    except Exception as exc:
        print(f"Error removing noise: {str(exc)}")
        raise exc
def insert_section_mcp(
    audio_path: str,
    section_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Splice one audio clip into another at a given time position.

    This MCP wrapper forwards its arguments to `insert_section`, which inserts the
    clip (an intro, advertisement, sound effect, ...) into the main track using a
    crossfade of the requested length. No output path is supplied, so the helper
    decides where the result is written.

    Args:
        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
        section_path: Path to the audio section to insert (supports common formats: WAV, MP3, FLAC, M4A)
        insert_time: Position to insert the section (in seconds from start of main audio)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the audio file with the section inserted

    Examples:
        >>> insert_section_mcp("main_track.wav", "intro.wav", 5.0, 0.2, "wav")
        # Returns path to audio with intro inserted at 5 seconds
        >>> insert_section_mcp("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "mp3")
        # Returns path to audio with ad inserted at 3 minutes

    Note:
        - Insert position is measured from the start of the main audio
        - Behaviour at the boundaries (for example an insert time beyond the end
          of the main audio) is determined by `insert_section`, not by this wrapper
        - Any exception from the helper is printed and then re-raised
    """
    try:
        return insert_section(
            audio_path=audio_path,
            section_path=section_path,
            insert_time=insert_time,
            crossfade_duration=crossfade_duration,
            output_format=output_format,
            output_path=None,
        )
    except Exception as exc:
        print(f"Error inserting audio section: {str(exc)}")
        raise exc
def replace_section_mcp(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Swap a time range of an audio track for another audio segment.

    This MCP wrapper forwards its arguments to `replace_section`, which replaces
    the span between the two timestamps with the replacement audio using a
    crossfade of the requested length. No output path is supplied, so the helper
    decides where the result is written.

    Args:
        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment (supports common formats: WAV, MP3, FLAC, M4A)
        crossfade_duration: Length of crossfade in seconds (default: 0.1)
        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the audio file with the section replaced

    Examples:
        >>> replace_section_mcp("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "wav")
        # Returns path to audio with 60-90s section replaced
        >>> replace_section_mcp("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "mp3")
        # Returns path to audio with the 120-150s section replaced

    Note:
        - Start time is expected to be less than end time; the wrapper itself does not check this
        - How a replacement longer or shorter than the replaced span is handled
          is determined by `replace_section`
        - Any exception from the helper is printed and then re-raised
    """
    try:
        return replace_section(
            audio_path=audio_path,
            replacement_path=replacement_path,
            start_time=start_time,
            end_time=end_time,
            crossfade_duration=crossfade_duration,
            output_format=output_format,
            output_path=None,
        )
    except Exception as exc:
        print(f"Error replacing audio section: {str(exc)}")
        raise exc
def replace_voice_mcp(
    source_audio_path: str,
    target_audio_path: str,
) -> str:
    """
    Replace voice in source audio with voice from target audio using Seed-VC.

    This MCP wrapper calls `replace_voice_wrapper` with the two audio paths and a
    fixed set of conversion settings. Only the source and target audio are
    configurable through this tool; the settings listed under Note are not
    parameters of this function.

    Args:
        source_audio_path: Path to the source audio file or URL (voice to be replaced)
        target_audio_path: Path to the target audio file or URL (voice to use)

    Returns:
        Path to the generated voice-replaced audio file

    Examples:
        >>> replace_voice_mcp("source.wav", "target_voice.wav")
        # Returns path to voice-replaced audio file
        >>> replace_voice_mcp("https://example.com/source.wav", "target.wav")
        # Source given as a URL, target as a local file
        >>> replace_voice_mcp("source.wav", "https://example.com/voice.mp3")
        # Source given as a local file, target as a URL

    Note:
        - Conversion always runs with diffusion_steps=35, length_adjust=1.0,
          inference_cfg_rate=0.5, f0_condition=True, auto_f0_adjust=True and pitch_shift=0
        - Errors raised by `replace_voice_wrapper` propagate to the caller unchanged
    """
    # Fixed conversion settings; they are not exposed as tool parameters.
    return replace_voice_wrapper(
        source_audio_path=source_audio_path,
        target_audio_path=target_audio_path,
        diffusion_steps=35,
        length_adjust=1.0,
        inference_cfg_rate=0.5,
        f0_condition=True,
        auto_f0_adjust=True,
        pitch_shift=0,
    )
| def create_interface() -> gr.Blocks: | |
| """ | |
| Create and configure the complete Gradio interface with all audio processing tools. | |
| This function sets up a fun web interface with 25+ different tabs, | |
| each providing access to specific audio processing capabilities. The interface | |
| is organized into logical categories for easy exploration and experimentation. | |
| Returns: | |
| gr.Blocks: A fully configured Gradio interface containing: | |
| **Stem Processing Tabs:** | |
| - Stem Separation: Full 4-stem separation (vocals, drums, bass, other) | |
| - Selective Stems: Extract only selected stems | |
| - Vocal/Instrumental: Separate vocals from instrumental | |
| - Karaoke Creation: One-click instrumental track generation | |
| **Audio Manipulation Tabs:** | |
| - Track Combination: Mix two audio tracks with weights | |
| - Pitch Alignment: Shift audio pitch by semitones | |
| - Key Estimation: Estimate musical key using harmonic analysis | |
| - Shift to Key: Shift audio to specific musical key | |
| - Align Songs by Key: Harmonically align multiple tracks | |
| - Stereo Mix: Create stereo mix with left/right channels | |
| - Time Stretching: Change tempo without affecting pitch | |
| - BPM Alignment: Align two tracks to same BPM | |
| - Medley Creation: Fun vocal/instrumental mixing | |
| **Audio Editing Tabs:** | |
| - Audio Cutting: Extract segments between time points | |
| - Mute Windows: Mute specific time ranges with fades | |
| - Extract Segments: Extract multiple segments | |
| - Trim Audio: Trim from beginning/end | |
| **Analysis & Information Tabs:** | |
| - Audio Information: Get detailed file information | |
| - Music Understanding: AI-powered music analysis | |
| - Song Structure: Identify song sections | |
| - Cutting Points: AI-suggested edit points | |
| - Genre Analysis: Detailed genre and style analysis | |
| **External Source Tabs:** | |
| - YouTube Extraction: Extract audio from YouTube videos | |
| - YouTube Video Info: Get video metadata without downloading | |
| Note: | |
| - All interfaces use consistent styling and error handling | |
| - Audio inputs support multiple formats (WAV, MP3, FLAC, M4A) | |
| - Each tab includes appropriate input validation | |
| - Server runs on 0.0.0.0:7860 with MCP server enabled | |
| - All examples disabled for security (cache_examples=False) | |
| - Flagging disabled to prevent data collection | |
| - This is a demo project for exploring audio processing capabilities | |
| """ | |
| # Tab 1: Stem Separation | |
| stem_interface = gr.Interface( | |
| fn=separate_audio_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| ], | |
| outputs=[ | |
| gr.Audio(label="Vocals", type="filepath"), | |
| gr.Audio(label="Drums", type="filepath"), | |
| gr.Audio(label="Bass", type="filepath"), | |
| gr.Audio(label="Other", type="filepath"), | |
| ], | |
| title="Audio Stem Separation", | |
| description="Upload an audio file to separate it into vocals, drums, bass, and other stems.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 3: Pitch Alignment | |
| pitch_interface = gr.Interface( | |
| fn=pitch_shift_with_semitones_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Number(value=0, label="Semitones to Shift"), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| ], | |
| outputs=gr.Audio(label="Pitch Shifted Audio", type="filepath"), | |
| title="Pitch Shift Audio", | |
| description="Shift the pitch of an audio file by specified semitones.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 4: Key Estimation | |
| key_estimation_interface = gr.Interface( | |
| fn=estimate_key_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| ], | |
| outputs=gr.Textbox(label="Estimated Key", lines=2), | |
| title="Estimate Musical Key", | |
| description="Estimate the musical key of an audio file using harmonic analysis.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 5: Shift to Key | |
| shift_to_key_interface = gr.Interface( | |
| fn=shift_to_key_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Dropdown( | |
| choices=[ | |
| "C", | |
| "C#", | |
| "D", | |
| "D#", | |
| "E", | |
| "F", | |
| "F#", | |
| "G", | |
| "G#", | |
| "A", | |
| "A#", | |
| "B", | |
| ], | |
| value="C", | |
| label="Target Key", | |
| ), | |
| gr.Dropdown( | |
| choices=["wav", "mp3"], | |
| value="wav", | |
| label="Output Format", | |
| ), | |
| ], | |
| outputs=gr.Audio(label="Key-Shifted Audio", type="filepath"), | |
| title="Shift Audio to Key", | |
| description="Shift an audio file to a specific musical key while preserving tempo.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 6: Align Songs by Key | |
| align_songs_interface = gr.Interface( | |
| fn=align_songs_by_key_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="First Audio File", sources=["upload"]), | |
| gr.Audio(type="filepath", label="Second Audio File", sources=["upload"]), | |
| gr.Dropdown( | |
| choices=[ | |
| "C", | |
| "C#", | |
| "D", | |
| "D#", | |
| "E", | |
| "F", | |
| "F#", | |
| "G", | |
| "G#", | |
| "A", | |
| "A#", | |
| "B", | |
| ], | |
| value="C", | |
| label="Target Key", | |
| ), | |
| gr.Dropdown( | |
| choices=["wav", "mp3"], | |
| value="wav", | |
| label="Output Format", | |
| ), | |
| ], | |
| outputs=[ | |
| gr.Audio(label="First Song (Aligned)", type="filepath"), | |
| gr.Audio(label="Second Song (Aligned)", type="filepath"), | |
| ], | |
| title="Align Songs by Key", | |
| description="Align two songs to the same musical key for harmonic mixing.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 8: Time Stretching | |
| stretch_interface = gr.Interface( | |
| fn=stretch_audio_to_bpm_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Number(value=120, label="Target BPM"), | |
| ], | |
| outputs=gr.Audio(label="Stretched Audio", type="filepath"), | |
| title="Stretch Audio to BPM", | |
| description="Stretch audio to match a specific BPM.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 5: BPM Alignment | |
| bpm_interface = gr.Interface( | |
| fn=align_songs_by_bpm_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="First Audio Track", sources=["upload"]), | |
| gr.Audio(type="filepath", label="Second Audio Track", sources=["upload"]), | |
| gr.Number(value=120.0, label="Target BPM"), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| ], | |
| outputs=[ | |
| gr.Audio(label="Aligned First Track", type="filepath"), | |
| gr.Audio(label="Aligned Second Track", type="filepath"), | |
| ], | |
| title="Align Songs by BPM", | |
| description="Align two songs to the same BPM by stretching the faster one to match the slower one.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 6: Selective Stem Extraction | |
| selective_interface = gr.Interface( | |
| fn=extract_selected_stems_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Checkbox(value=True, label="Extract Vocals"), | |
| gr.Checkbox(value=True, label="Extract Drums"), | |
| gr.Checkbox(value=True, label="Extract Bass"), | |
| gr.Checkbox(value=True, label="Extract Other"), | |
| ], | |
| outputs=[ | |
| gr.Audio(label="Vocals Stem", type="filepath"), | |
| gr.Audio(label="Drums Stem", type="filepath"), | |
| gr.Audio(label="Bass Stem", type="filepath"), | |
| gr.Audio(label="Other Stem", type="filepath"), | |
| ], | |
| title="Selective Stem Extraction", | |
| description="Extract only specific stems from an audio file to save processing time and storage.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 7: Vocal/Non-Vocal Separation | |
| vocal_nonvocal_interface = gr.Interface( | |
| fn=extract_vocal_non_vocal_wrapper, | |
| inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| outputs=[ | |
| gr.Audio(label="Vocals Track", type="filepath"), | |
| gr.Audio(label="Instrumental Track", type="filepath"), | |
| ], | |
| title="Vocal/Instrumental Separation", | |
| description="Separate audio into vocal and instrumental components for karaoke or vocal isolation.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 9: Medley Creation | |
| medley_interface = gr.Interface( | |
| fn=create_medley_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Vocals Stem", sources=["upload"]), | |
| gr.Audio(type="filepath", label="Instrumental Stem", sources=["upload"]), | |
| gr.Dropdown( | |
| choices=["wav", "mp3"], | |
| value="wav", | |
| label="Output Format", | |
| ), | |
| ], | |
| outputs=gr.Audio(label="Medley Audio", type="filepath"), | |
| title="Create Vocal/Instrumental Medley", | |
| description="Mix vocals and instrumental stems into a polished medley with compression and gain control.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 10: Audio Information | |
| audio_info_interface = gr.Interface( | |
| fn=get_audio_info_mcp, | |
| inputs=gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| outputs=gr.JSON(label="Audio Information"), | |
| title="Get Audio Information", | |
| description="Get detailed information about an audio file including duration, sample rate, channels, and file size.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 13: Audio Cutting | |
| cut_interface = gr.Interface( | |
| fn=cut_audio_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Number(value=0.0, label="Start Time (seconds)"), | |
| gr.Number(value=10.0, label="End Time (seconds)"), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| ], | |
| outputs=gr.Audio(label="Cut Audio", type="filepath"), | |
| title="Cut Audio Segment", | |
| description="Extract a segment from an audio file between specified start and end times.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 13: Mute Time Windows | |
| mute_interface = gr.Interface( | |
| fn=mute_time_windows_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Textbox( | |
| value="[[1.0, 2.0], [3.0, 4.0]]", | |
| label="Mute Windows (JSON format)", | |
| placeholder="[[start1, end1], [start2, end2]]", | |
| ), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| ], | |
| outputs=gr.Audio(label="Muted Audio", type="filepath"), | |
| title="Mute Time Windows", | |
| description="Mute specific time windows in an audio file with smooth fade transitions.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 14: Extract Segments | |
| extract_interface = gr.Interface( | |
| fn=extract_segments_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Textbox( | |
| value="[[0.0, 1.0], [2.0, 3.0]]", | |
| label="Segments (JSON format)", | |
| placeholder="[[start1, end1], [start2, end2]]", | |
| ), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| gr.Checkbox(value=False, label="Join Segments"), | |
| ], | |
| outputs=[ | |
| gr.Audio(label="Extracted Segment 1", type="filepath"), | |
| gr.Audio(label="Extracted Segment 2", type="filepath"), | |
| gr.Audio(label="Extracted Segment 3", type="filepath"), | |
| gr.Audio(label="Extracted Segment 4", type="filepath"), | |
| ], | |
| title="Extract Segments", | |
| description="Extract multiple segments from an audio file. Shows up to 4 segments (first segment when not joined).", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 15: Trim Audio | |
| trim_interface = gr.Interface( | |
| fn=trim_audio_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Number(value=None, label="Trim Start (seconds, leave empty to skip)"), | |
| gr.Number(value=None, label="Trim End (seconds, leave empty to skip)"), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| ], | |
| outputs=gr.Audio(label="Trimmed Audio", type="filepath"), | |
| title="Trim Audio", | |
| description="Trim audio from the beginning and/or end.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 16: Music Understanding | |
| understand_interface = gr.Interface( | |
| fn=understand_music_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Textbox( | |
| value="Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.", | |
| label="Analysis Prompt", | |
| lines=3, | |
| ), | |
| ], | |
| outputs=gr.Textbox(label="Music Analysis", lines=10), | |
| title="Music Understanding (AI)", | |
| description="Analyze music using NVIDIA's Music-Flamingo Audio Language Model.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 17: Song Structure Analysis | |
| structure_interface = gr.Interface( | |
| fn=analyze_music_structure_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| ], | |
| outputs=gr.Textbox(label="Structure Analysis", lines=10), | |
| title="Song Structure Analysis", | |
| description="Analyze song structure and identify sections (verse, chorus, bridge, etc.).", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 18: Cutting Points Suggestions | |
| cutting_points_interface = gr.Interface( | |
| fn=suggest_cutting_points_wrapper, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Dropdown( | |
| choices=["general", "dj_mix", "social_media", "ringtone"], | |
| value="general", | |
| label="Purpose", | |
| ), | |
| ], | |
| outputs=gr.Textbox(label="Cutting Point Suggestions", lines=10), | |
| title="AI Cutting Point Suggestions", | |
| description="Get AI-suggested optimal cutting points for different purposes.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 19: Genre and Style Analysis | |
| genre_interface = gr.Interface( | |
| fn=analyze_genre_and_style_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| ], | |
| outputs=gr.Textbox(label="Genre & Style Analysis", lines=10), | |
| title="Genre & Style Analysis", | |
| description="Detailed analysis of genre, production style, and instrumentation.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 20: Audio Cleaning | |
| cleaning_interface = gr.Interface( | |
| fn=remove_noise_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]), | |
| gr.Dropdown( | |
| choices=["general", "hiss", "hum", "rumble", "background"], | |
| value="general", | |
| label="Noise Type", | |
| ), | |
| gr.Slider( | |
| minimum=0.0, maximum=1.0, value=0.5, step=0.1, label="Sensitivity" | |
| ), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| ], | |
| outputs=gr.Audio(label="Cleaned Audio", type="filepath"), | |
| title="Audio Noise Removal", | |
| description="Remove various types of noise from audio using adaptive filtering and spectral subtraction.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 21: Insert Section | |
| insert_interface = gr.Interface( | |
| fn=insert_section_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Main Audio File", sources=["upload"]), | |
| gr.Audio(type="filepath", label="Section to Insert", sources=["upload"]), | |
| gr.Number(value=5.0, label="Insert Time (seconds)"), | |
| gr.Number(value=0.1, label="Crossfade Duration (seconds)"), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| ], | |
| outputs=gr.Audio(label="Audio with Insertion", type="filepath"), | |
| title="Insert Audio Section", | |
| description="Insert a section from one audio track into another at a precise time position.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 22: Replace Section | |
| replace_interface = gr.Interface( | |
| fn=replace_section_mcp, | |
| inputs=[ | |
| gr.Audio(type="filepath", label="Main Audio File", sources=["upload"]), | |
| gr.Number(value=60.0, label="Start Time (seconds)"), | |
| gr.Number(value=90.0, label="End Time (seconds)"), | |
| gr.Audio(type="filepath", label="Replacement Section", sources=["upload"]), | |
| gr.Number(value=0.1, label="Crossfade Duration (seconds)"), | |
| gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"), | |
| ], | |
| outputs=gr.Audio(label="Audio with Replacement", type="filepath"), | |
| title="Replace Audio Section", | |
| description="Replace a section of an audio track with another audio segment.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Tab 23: Voice Replacement | |
| voice_replacement_interface = gr.Interface( | |
| fn=replace_voice_mcp, | |
| inputs=[ | |
| gr.Audio( | |
| type="filepath", | |
| label="Source Audio (voice to be replaced) - Local file or URL", | |
| sources=["upload"], | |
| ), | |
| gr.Audio( | |
| type="filepath", | |
| label="Target Audio (voice to use) - Local file or URL", | |
| sources=["upload"], | |
| ), | |
| ], | |
| outputs=gr.Audio(label="Voice-Replaced Audio", type="filepath"), | |
| title="Voice Replacement with Seed-VC", | |
| description="Replace voice in source audio with voice from target audio using Seed-VC AI model.", | |
| examples=None, | |
| cache_examples=False, | |
| flagging_mode="never", | |
| ) | |
| # Create TabbedInterface with custom header | |
| tabbed_interface = gr.TabbedInterface( | |
| [ | |
| stem_interface, | |
| pitch_interface, | |
| key_estimation_interface, | |
| shift_to_key_interface, | |
| align_songs_interface, | |
| stretch_interface, | |
| bpm_interface, | |
| selective_interface, | |
| vocal_nonvocal_interface, | |
| medley_interface, | |
| audio_info_interface, | |
| cut_interface, | |
| mute_interface, | |
| extract_interface, | |
| trim_interface, | |
| understand_interface, | |
| structure_interface, | |
| cutting_points_interface, | |
| genre_interface, | |
| cleaning_interface, | |
| insert_interface, | |
| replace_interface, | |
| voice_replacement_interface, | |
| ], | |
| [ | |
| "Stem Separation", | |
| "Pitch Alignment", | |
| "Key Estimation", | |
| "Shift to Key", | |
| "Align Songs by Key", | |
| "Time Stretching", | |
| "BPM Alignment", | |
| "Selective Stems", | |
| "Vocal/Instrumental", | |
| "Medley Creation", | |
| "Audio Information", | |
| "Audio Cutting", | |
| "Mute Windows", | |
| "Extract Segments", | |
| "Trim Audio", | |
| "Music Understanding", | |
| "Song Structure", | |
| "Cutting Points", | |
| "Genre Analysis", | |
| "Audio Cleaning", | |
| "Insert Section", | |
| "Replace Section", | |
| "Voice Replacement", | |
| ], | |
| title="π΅ Music AI Tools - Professional Audio Processing Suite", | |
| ) | |
| # Add custom CSS for header styling | |
| tabbed_interface.head = """ | |
| <style> | |
| .gradio-container { | |
| font-family: 'Inter', system-ui, -apple-system, sans-serif !important; | |
| } | |
| .tab-nav { | |
| border-bottom: 2px solid #e5e7eb !important; | |
| } | |
| .tab-nav button { | |
| font-weight: 500 !important; | |
| } | |
| </style> | |
| """ | |
| # Add header HTML to the interface | |
| header_html = """ | |
| <div style="text-align: center; padding: 30px 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin: 20px auto; max-width: 1200px; box-shadow: 0 10px 30px rgba(0,0,0,0.2);"> | |
| <h1 style="color: white; font-size: 2.8em; margin-bottom: 15px; font-weight: 700; text-shadow: 2px 2px 4px rgba(0,0,0,0.3);"> | |
| π΅ Music AI Tools πΆ | |
| </h1> | |
| <h2 style="color: #f0f0f0; font-size: 1.4em; margin-bottom: 20px; font-weight: 400;"> | |
| Fun Audio Processing Playground | |
| </h2> | |
| <p style="color: #e0e0e0; font-size: 1.1em; max-width: 900px; margin: 0 auto 25px auto; line-height: 1.7; text-align: left; background: rgba(0,0,0,0.2); padding: 20px; border-radius: 10px;"> | |
| <strong style="color: white; font-size: 1.2em;">π§ Cool Audio Tricks:</strong> Stem separation (Demucs), pitch shifting, time stretching, and key alignment<br> | |
| <strong style="color: white; font-size: 1.2em;">πΉ Smart AI Analysis:</strong> Genre detection, structure analysis, and cutting suggestions (Music-Flamingo)<br> | |
| <strong style="color: white; font-size: 1.2em;">ποΈ Fun Audio Editing:</strong> Noise removal, track combination, and precise audio manipulation<br> | |
| <strong style="color: white; font-size: 1.2em;">π€ Awesome AI Tools:</strong> Voice replacement (Seed-VC) and music understanding (Music-Flamingo)<br> | |
| <strong style="color: white; font-size: 1.2em;">π Fast & Powerful:</strong> GPU boost, parallel processing, and live progress updates | |
| </p> | |
| <div style="margin-top: 20px; display: flex; justify-content: center; gap: 10px; flex-wrap: wrap;"> | |
| <span style="background: rgba(255,255,255,0.25); padding: 8px 20px; border-radius: 25px; margin: 5px; color: white; font-weight: 600; backdrop-filter: blur(10px);"> | |
| πΌ 25+ Tools | |
| </span> | |
| <span style="background: rgba(255,255,255,0.25); padding: 8px 20px; border-radius: 25px; margin: 5px; color: white; font-weight: 600; backdrop-filter: blur(10px);"> | |
| π― AI-Powered | |
| </span> | |
| <span style="background: rgba(255,255,255,0.25); padding: 8px 20px; border-radius: 25px; margin: 5px; color: white; font-weight: 600; backdrop-filter: blur(10px);"> | |
| π URL Support | |
| </span> | |
| <span style="background: rgba(255,255,255,0.25); padding: 8px 20px; border-radius: 25px; margin: 5px; color: white; font-weight: 600; backdrop-filter: blur(10px);"> | |
| πͺ Demo Fun | |
| </span> | |
| </div> | |
| <div style="margin-top: 25px; text-align: center; color: rgba(255,255,255,0.9); font-size: 0.9em; line-height: 1.6;"> | |
| <strong style="color: white;">π€ AI Models Used:</strong><br> | |
| π΅ <strong>Stem Separation:</strong> <a href="https://github.com/adefossez/demucs" target="_blank" style="color: #ffd700; text-decoration: underline;">Demucs</a> by Facebook Research<br> | |
| π€ <strong>Voice Replacement:</strong> <a href="https://huggingface.co/spaces/Plachta/Seed-VC" target="_blank" style="color: #ffd700; text-decoration: underline;">Seed-VC</a> on Hugging Face<br> | |
| π§ <strong>Music Understanding:</strong> <a href="https://huggingface.co/spaces/nvidia/music-flamingo" target="_blank" style="color: #ffd700; text-decoration: underline;">Music-Flamingo</a> by NVIDIA<br> | |
| <br> | |
| <strong style="color: white;">ποΈ Audio Processing Libraries:</strong><br> | |
| βοΈ <strong>Audio Analysis:</strong> <a href="https://librosa.org/" target="_blank" style="color: #87ceeb; text-decoration: underline;">Librosa</a> for audio feature extraction<br> | |
| π¬ <strong>Audio Conversion:</strong> <a href="https://ffmpeg.org/" target="_blank" style="color: #87ceeb; text-decoration: underline;">FFmpeg</a> for format conversion and processing | |
| </div> | |
| </div> | |
| """ | |
| # Create a wrapper interface that includes the header | |
| with gr.Blocks() as wrapper_interface: | |
| gr.HTML(header_html) | |
| tabbed_interface.render() | |
| return wrapper_interface | |
if __name__ == "__main__":
    # Script entry point: build the UI via create_interface() and serve it.
    # The launch settings are collected in one mapping so the host, port and
    # MCP flag handed to launch() are visible at a glance.
    launch_settings = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "mcp_server": True,
    }
    interface = create_interface()
    interface.launch(**launch_settings)