Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import os | |
| import subprocess | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Tuple | |
| import librosa | |
| import numpy as np | |
| import soundfile as sf | |
def _load_audio(audio_path: str, mono: bool = False) -> Tuple[np.ndarray, float]:
    """
    Decode an audio file at its native sample rate.

    Thin wrapper around ``librosa.load`` that disables sample-rate conversion
    (``sr=None``) and selects the ``soxr_vhq`` resampler type.

    Args:
        audio_path: Path to audio file or URL
        mono: Passed straight through to ``librosa.load``; the default
            (False) asks librosa not to down-mix the channels

    Returns:
        Tuple of (audio_data, sample_rate) exactly as returned by librosa
    """
    load_options = {"sr": None, "mono": mono, "res_type": "soxr_vhq"}
    samples, sample_rate = librosa.load(audio_path, **load_options)
    return samples, sample_rate
def estimate_key(audio_path: str) -> str:
    """
    Estimate the key of an audio file as its most prominent pitch class.

    The file is decoded with ``librosa.load`` (default settings, ``soxr_mq``
    resampler for speed), a constant-Q chromagram is computed, and the chroma
    bin with the highest mean energy is reported. This is a simple heuristic:
    no major/minor mode is determined, and the strongest pitch class is not
    guaranteed to be the tonic.

    Args:
        audio_path: Path to audio file or URL (supports common formats: WAV, MP3, FLAC)

    Returns:
        One of 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'

    Raises:
        RuntimeError: If loading or analysis fails. The underlying exception
            is chained as ``__cause__``.

    Note:
        Most reliable for music with clear harmonic content; atonal or highly
        percussive material may give arbitrary results.
    """
    # Chroma bin order used to translate the argmax index into a key name.
    keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
    try:
        # Medium-quality resampling is enough for analysis and is faster.
        y, sr = librosa.load(audio_path, res_type="soxr_mq")

        # One row per pitch class; average along axis 1 to collapse the
        # remaining axis (time frames, per librosa's chroma layout).
        chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
        chroma_mean = np.mean(chroma, axis=1)

        key_index = int(np.argmax(chroma_mean))
        return keys[key_index]
    except Exception as e:
        # Chain the cause explicitly so callers can inspect the real failure.
        raise RuntimeError(f"Error estimating key: {str(e)}") from e
def key_to_semitones(key: str, target_key: str = "C") -> int:
    """
    Calculate the smallest semitone shift that moves ``key`` onto ``target_key``.

    Key names are matched case-insensitively, surrounding whitespace is
    ignored, and the common flat spellings (Db, Eb, Gb, Ab, Bb) are accepted
    as aliases of their sharp equivalents.

    Args:
        key: Source key (e.g. 'A', 'f#', 'Bb')
        target_key: Target key to align to (default: 'C')

    Returns:
        Number of semitones to shift, in the range -5..6. A tritone is
        always reported as +6.

    Raises:
        ValueError: If either name is not a recognised key.
    """
    keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
    # Enharmonic flat spellings mapped onto the sharp names used in ``keys``.
    flat_aliases = {"Db": "C#", "Eb": "D#", "Gb": "F#", "Ab": "G#", "Bb": "A#"}

    def _canonical(name):
        # Non-string input can never be a valid key; report it as invalid
        # through the normal ValueError path below.
        if not isinstance(name, str):
            return None
        name = name.strip()
        # Upper-case the note letter, lower-case the accidental ("BB" -> "Bb").
        name = name[:1].upper() + name[1:].lower()
        return flat_aliases.get(name, name)

    source = _canonical(key)
    target = _canonical(target_key)
    if source not in keys or target not in keys:
        raise ValueError("Invalid key name")

    # Wrap into 0..11, then fold anything above a tritone into a downward
    # shift so the audio is moved by the shortest distance.
    semitones = (keys.index(target) - keys.index(source)) % 12
    if semitones > 6:
        semitones -= 12
    return semitones
def align_songs_by_key(
    audio1_path: str,
    audio2_path: str,
    target_key: str = "C",
    output_path: str = "output",
    output_format: str = "wav",
) -> Tuple[str, str]:
    """
    Align two songs to the same musical key by pitch shifting.

    Each file is processed independently by ``shift_to_key``, which estimates
    the file's current key, computes the semitone offset to ``target_key`` and
    writes the shifted audio. Nothing is decoded here.

    Args:
        audio1_path: Path to first audio file (supports WAV, MP3, FLAC)
        audio2_path: Path to second audio file (supports WAV, MP3, FLAC)
        target_key: Target key to align both songs to (default: 'C')
        output_path: Directory to save the aligned audio files
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Tuple of (aligned_audio1_path, aligned_audio2_path) - paths to processed files

    Raises:
        RuntimeError: If processing either file fails. The underlying
            exception is chained as ``__cause__``.

    Note:
        Output names are derived from each input's file stem, so two inputs
        with the same stem map to the same output path and the second result
        replaces the first.
    """
    try:
        aligned1_path = shift_to_key(
            audio1_path, target_key, output_path, output_format
        )
        aligned2_path = shift_to_key(
            audio2_path, target_key, output_path, output_format
        )
        return aligned1_path, aligned2_path
    except Exception as e:
        raise RuntimeError(f"Error aligning audio keys: {str(e)}") from e
def shift_to_key(
    audio_path: str,
    target_key: str,
    output_path: str = "output",
    output_format: str = "wav",
) -> str:
    """
    Shift an audio file to a specific musical key.

    The current key is estimated with ``estimate_key``, the semitone offset to
    ``target_key`` is computed with ``key_to_semitones``, and the audio is
    pitch-shifted with librosa and written to
    ``<output_path>/<stem>_shifted_to_<target_key>.<format>``.

    Args:
        audio_path: Path to audio file or URL (supports WAV, MP3, FLAC)
        target_key: Target key to shift to
        output_path: Directory to save the shifted audio file (created if missing)
        output_format: Output format ('wav' or 'mp3', default: 'wav').
            Any value other than 'mp3' is written as 16-bit PCM WAV data,
            with the given value used as the file extension.

    Returns:
        Path to the pitch-shifted audio file

    Raises:
        RuntimeError: If any step fails (key estimation, decoding, shifting,
            writing, or the ffmpeg conversion). The underlying exception is
            chained as ``__cause__``.
    """
    try:
        # Work out how far the audio has to move.
        current_key = estimate_key(audio_path)
        semitones = key_to_semitones(current_key, target_key)

        # Load at native rate and shift with the highest-quality resampler.
        y, sr = _load_audio(audio_path)
        y_shifted = librosa.effects.pitch_shift(
            y, n_steps=semitones, scale=True, sr=sr, res_type="soxr_vhq"
        )

        audio_filename = Path(audio_path).stem
        os.makedirs(output_path, exist_ok=True)

        # Multi-channel data is transposed before writing; soundfile takes
        # frames on the first axis (see its write() documentation).
        if y_shifted.ndim == 2:
            y_shifted = y_shifted.T

        extension = output_format.lower()
        final_audio_path = os.path.join(
            output_path,
            f"{audio_filename}_shifted_to_{target_key}.{extension}",
        )

        if extension == "mp3":
            # Reserve a temp path and close the handle right away, so the WAV
            # is written by name without a second open handle on the file.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
                temp_wav_path = temp_wav.name
            try:
                sf.write(temp_wav_path, y_shifted, sr, format="wav", subtype="PCM_16")
                # Convert the intermediate WAV to MP3 using ffmpeg.
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    temp_wav_path,
                    "-c:a",
                    "libmp3lame",
                    "-b:a",
                    "192k",
                    final_audio_path,
                ]
                subprocess.run(cmd, capture_output=True, check=True)
            finally:
                # Always remove the intermediate file, even when writing or
                # the ffmpeg conversion fails.
                if os.path.exists(temp_wav_path):
                    os.unlink(temp_wav_path)
        else:
            sf.write(final_audio_path, y_shifted, sr, format="wav", subtype="PCM_16")
        return final_audio_path
    except Exception as e:
        # Chain the cause explicitly so callers can inspect the real failure.
        raise RuntimeError(f"Error shifting key: {str(e)}") from e
# Command-line entry point: `estimate`, `align` and `shift` sub-commands.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Pitch alignment tools for audio files"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Estimate key of a single file
    estimate_parser = subparsers.add_parser(
        "estimate", help="Estimate the key of an audio file"
    )
    estimate_parser.add_argument("audio", help="Path to audio file")

    # Align two songs by key
    align_parser = subparsers.add_parser("align", help="Align two songs to same key")
    align_parser.add_argument("audio1", help="Path to first audio file")
    align_parser.add_argument("audio2", help="Path to second audio file")
    align_parser.add_argument(
        "--target-key", default="C", help="Target key to align to (default: C)"
    )
    align_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )

    # Shift single file to key
    shift_parser = subparsers.add_parser("shift", help="Shift audio to specific key")
    shift_parser.add_argument("audio", help="Path to audio file")
    shift_parser.add_argument("target_key", help="Target key to shift to")
    shift_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )

    args = parser.parse_args()

    try:
        if args.command == "estimate":
            key = estimate_key(args.audio)
            print(f"Estimated key: {key}")
        elif args.command == "align":
            aligned1, aligned2 = align_songs_by_key(
                args.audio1, args.audio2, args.target_key, output_format=args.format
            )
            print(f"Aligned audio 1: {aligned1}")
            print(f"Aligned audio 2: {aligned2}")
        elif args.command == "shift":
            output = shift_to_key(
                args.audio, args.target_key, output_format=args.format
            )
            print(f"Shifted audio saved to: {output}")
        else:
            # No sub-command given.
            parser.print_help()
    except Exception as e:
        print(f"Error: {e}")
        # Re-raise with the original traceback; the uncaught exception makes
        # the interpreter exit with a non-zero status on its own.
        raise