music-mcp / tools /pitch_alignment.py
frascuchon's picture
frascuchon HF Staff
audio_path documented
14e5437
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Tuple
import librosa
import numpy as np
import soundfile as sf
def _load_audio(audio_path: str, mono: bool = False) -> Tuple[np.ndarray, float]:
    """
    Decode an audio file with librosa, keeping all channels unless asked otherwise.

    Args:
        audio_path: Path to audio file or URL
        mono: When True the signal is down-mixed to a single channel;
            when False (default) the channels are left as they are

    Returns:
        Tuple of (audio_data, sample_rate) exactly as returned by ``librosa.load``

    Note:
        ``sr=None`` is passed to librosa, which per librosa's documentation keeps
        the file's native sampling rate instead of resampling to librosa's default.
    """
    return librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
def estimate_key(audio_path: str) -> str:
    """
    Estimate the musical key of an audio file from its chroma features.

    The audio is loaded with librosa's default settings, a constant-Q chromagram
    is computed, the chroma energy is averaged over time and the pitch class with
    the highest mean energy is returned. This is a simple "strongest pitch class"
    heuristic: it does not distinguish major from minor and it may pick a
    prominent non-tonic note (for example the dominant) on some material.

    Args:
        audio_path: Path to audio file or URL (supports common formats: WAV, MP3, FLAC)

    Returns:
        Estimated key as string, one of
        'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'

    Raises:
        RuntimeError: If the file cannot be loaded or analysed. The underlying
            exception is chained as ``__cause__``.

    Note:
        Uses medium quality resampling for faster analysis
        Most accurate for music with clear harmonic content
        May be less accurate for atonal or highly percussive music
    """
    # Chroma bin order used by librosa: index 0 is C, rising by one semitone.
    keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

    try:
        # Medium quality resampling is enough for analysis and noticeably faster.
        y, sr = librosa.load(audio_path, res_type="soxr_mq")

        # One row per pitch class, one column per analysis frame.
        chroma = librosa.feature.chroma_cqt(y=y, sr=sr)

        # Average each pitch class over time and keep the strongest one.
        chroma_mean = np.mean(chroma, axis=1)
        key_index = int(np.argmax(chroma_mean))

        return keys[key_index]
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Error estimating key: {str(e)}") from e
def key_to_semitones(key: str, target_key: str = "C") -> int:
    """
    Calculate the smallest semitone shift that moves ``key`` onto ``target_key``.

    Key names are matched case-insensitively and surrounding whitespace is
    ignored. Besides the sharp spellings ('C', 'C#', ..., 'B') the common flat
    spellings 'Db', 'Eb', 'Gb', 'Ab' and 'Bb' are accepted and treated as their
    enharmonic sharps.

    Args:
        key: Source key
        target_key: Target key to align to

    Returns:
        Number of semitones to shift, in the range -5..6. Positive values shift
        up, negative values shift down; a tritone is always reported as +6.

    Raises:
        ValueError: If either key is not a recognised key name.
    """
    keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
    flats = {"Db": "C#", "Eb": "D#", "Gb": "F#", "Ab": "G#", "Bb": "A#"}

    def _pitch_class(name):
        # Normalise to "Letter + lowercase accidental", then fold flats to sharps.
        if isinstance(name, str):
            cleaned = name.strip()
            cleaned = cleaned[:1].upper() + cleaned[1:].lower()
            cleaned = flats.get(cleaned, cleaned)
            if cleaned in keys:
                return keys.index(cleaned)
        raise ValueError(f"Invalid key name: {name!r}")

    key_index = _pitch_class(key)
    target_index = _pitch_class(target_key)

    # Upward distance on the 12-tone circle, 0..11.
    semitones = (target_index - key_index) % 12

    # Prefer the shorter direction: anything above a tritone is cheaper going down.
    if semitones > 6:
        semitones -= 12

    return semitones
def align_songs_by_key(
    audio1_path: str,
    audio2_path: str,
    target_key: str = "C",
    output_path: str = "output",
    output_format: str = "wav",
) -> Tuple[str, str]:
    """
    Align two songs to the same musical key by pitch shifting.

    Each track is handed to ``shift_to_key`` in turn, first ``audio1_path`` and
    then ``audio2_path``, with the same target key, output directory and format.

    Args:
        audio1_path: Path to first audio file (supports WAV, MP3, FLAC)
        audio2_path: Path to second audio file (supports WAV, MP3, FLAC)
        target_key: Target key to align both songs to (default: 'C')
        output_path: Directory to save the aligned audio files
        output_format: Output format ('wav' or 'mp3', default: 'wav')

    Returns:
        Tuple of (aligned_audio1_path, aligned_audio2_path) - paths to processed files

    Raises:
        RuntimeError: If processing either track fails; the underlying exception
            is chained as ``__cause__``. Because the tracks are processed in
            order, the first output may already exist when the second one fails.
    """
    try:
        # shift_to_key estimates the key, computes the shift and loads the audio
        # itself, so the files are not decoded here as well.
        aligned1_path = shift_to_key(
            audio1_path, target_key, output_path, output_format
        )
        aligned2_path = shift_to_key(
            audio2_path, target_key, output_path, output_format
        )
        return aligned1_path, aligned2_path
    except Exception as e:
        raise RuntimeError(f"Error aligning audio keys: {str(e)}") from e
def shift_to_key(
    audio_path: str,
    target_key: str,
    output_path: str = "output",
    output_format: str = "wav",
) -> str:
    """
    Shift an audio file to a specific musical key.

    The current key is estimated with ``estimate_key``, the semitone offset to
    ``target_key`` is computed with ``key_to_semitones``, and the audio is pitch
    shifted by that amount and written to
    ``<output_path>/<input stem>_shifted_to_<target_key>.<format>``.

    Args:
        audio_path: Path to audio file or URL (supports WAV, MP3, FLAC)
        target_key: Target key to shift to
        output_path: Directory to save the shifted audio file (created if missing)
        output_format: Output format ('wav' or 'mp3', case-insensitive, default: 'wav')

    Returns:
        Path to the pitch-shifted audio file

    Raises:
        RuntimeError: On any failure (unsupported ``output_format``, unreadable
            input, invalid key, ffmpeg error, ...). The underlying exception is
            chained as ``__cause__``.

    Note:
        MP3 output is produced by running the ``ffmpeg`` executable with the
        ``libmp3lame`` encoder, so ffmpeg must be available on the PATH.
    """
    try:
        # Reject unknown formats up front: otherwise WAV data would be written
        # under a misleading extension after all the expensive work is done.
        fmt = output_format.lower()
        if fmt not in ("wav", "mp3"):
            raise ValueError(
                f"Unsupported output format: {output_format!r} (expected 'wav' or 'mp3')"
            )

        # Estimate current key
        current_key = estimate_key(audio_path)

        # Calculate semitone shift
        semitones = key_to_semitones(current_key, target_key)

        # Load and shift audio
        y, sr = _load_audio(audio_path)
        y_shifted = librosa.effects.pitch_shift(
            y, n_steps=semitones, scale=True, sr=sr, res_type="soxr_vhq"
        )

        audio_filename = Path(audio_path).stem
        os.makedirs(output_path, exist_ok=True)

        # Multi-channel audio is transposed before writing: librosa arrays are
        # channel-first while soundfile expects frames-first.
        if y_shifted.ndim == 2:
            y_shifted = y_shifted.T

        final_audio_path = os.path.join(
            output_path,
            f"{audio_filename}_shifted_to_{target_key}.{fmt}",
        )

        if fmt == "mp3":
            # For MP3, save as WAV first then convert. The temp file is created
            # and closed before being written by name, so no open handle blocks
            # the write (this matters on Windows).
            fd, temp_wav_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            try:
                sf.write(temp_wav_path, y_shifted, sr, format="wav", subtype="PCM_16")

                # Convert to MP3 using ffmpeg
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    temp_wav_path,
                    "-c:a",
                    "libmp3lame",
                    "-b:a",
                    "192k",
                    final_audio_path,
                ]
                subprocess.run(cmd, capture_output=True, check=True)
            finally:
                # Always remove the intermediate WAV, even if ffmpeg failed.
                os.unlink(temp_wav_path)
        else:
            sf.write(final_audio_path, y_shifted, sr, format="wav", subtype="PCM_16")

        return final_audio_path
    except Exception as e:
        raise RuntimeError(f"Error shifting key: {str(e)}") from e
if __name__ == "__main__":
    # Command-line entry point with three sub-commands:
    #   estimate <audio>                      print the estimated key
    #   align <audio1> <audio2> [options]     shift both files to one key
    #   shift <audio> <target_key> [options]  shift a single file
    import argparse

    parser = argparse.ArgumentParser(
        description="Pitch alignment tools for audio files"
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Estimate key of a single file
    estimate_parser = subparsers.add_parser(
        "estimate", help="Estimate the key of an audio file"
    )
    estimate_parser.add_argument("audio", help="Path to audio file")

    # Align two songs by key
    align_parser = subparsers.add_parser("align", help="Align two songs to same key")
    align_parser.add_argument("audio1", help="Path to first audio file")
    align_parser.add_argument("audio2", help="Path to second audio file")
    align_parser.add_argument(
        "--target-key", default="C", help="Target key to align to (default: C)"
    )
    align_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )

    # Shift single file to key
    shift_parser = subparsers.add_parser("shift", help="Shift audio to specific key")
    shift_parser.add_argument("audio", help="Path to audio file")
    shift_parser.add_argument("target_key", help="Target key to shift to")
    shift_parser.add_argument(
        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
    )

    args = parser.parse_args()

    try:
        if args.command == "estimate":
            key = estimate_key(args.audio)
            print(f"Estimated key: {key}")
        elif args.command == "align":
            aligned1, aligned2 = align_songs_by_key(
                args.audio1, args.audio2, args.target_key, output_format=args.format
            )
            print(f"Aligned audio 1: {aligned1}")
            print(f"Aligned audio 2: {aligned2}")
        elif args.command == "shift":
            output = shift_to_key(
                args.audio, args.target_key, output_format=args.format
            )
            print(f"Shifted audio saved to: {output}")
        else:
            # No sub-command given.
            parser.print_help()
    except Exception as e:
        print(f"Error: {e}")
        # Re-raise so the traceback is shown and the interpreter exits with a
        # non-zero status; nothing placed after this line would ever run.
        raise