Spaces:

frascuchon
/

music-mcp

Running on CPU Upgrade

App Files Files Community

music-mcp / tools /pitch_alignment.py

frascuchon HF Staff

audio_path documented

14e5437 19 days ago

raw

history blame contribute delete

8.87 kB

	import os
	import subprocess
	import tempfile
	from pathlib import Path
	from typing import Tuple

	import librosa
	import numpy as np
	import soundfile as sf


	def _load_audio(audio_path: str, mono: bool = False) -> Tuple[np.ndarray, float]:
	    """
	    Decode an audio file with librosa without changing its sample rate.

	    Args:
	        audio_path: Path to audio file or URL
	        mono: When True the channels are mixed down to mono; when False
	            (the default) the channel layout of the file is kept

	    Returns:
	        Tuple of (audio_data, sample_rate) exactly as produced by librosa.load
	    """
	    # sr=None asks librosa not to resample; "soxr_vhq" is the resampler
	    # quality setting handed through to librosa.
	    return librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")


	def estimate_key(audio_path: str) -> str:
	    """
	    Estimate the musical key of an audio file from its chroma features.

	    The file is decoded with librosa, a constant-Q chromagram is computed, the
	    chroma energy is averaged over time, and the name of the pitch class with
	    the highest average energy is returned. No major/minor distinction is made:
	    the result is the dominant pitch class, used here as an approximation of
	    the tonic.

	    Args:
	        audio_path: Path to audio file or URL (supports common formats: WAV, MP3, FLAC)

	    Returns:
	        Estimated key as string, one of
	        'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'

	    Raises:
	        RuntimeError: If loading or analysing the audio fails; the original
	            exception is attached as ``__cause__``.

	    Note:
	        Uses medium quality resampling ("soxr_mq") for faster analysis.
	        Most accurate for music with clear harmonic content.
	        May be less accurate for atonal or highly percussive music.
	    """
	    # Pitch-class names in the order used to index the chroma rows (row 0 -> "C").
	    keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

	    try:
	        # Medium quality resampling is enough for analysis and faster to run.
	        y, sr = librosa.load(audio_path, res_type="soxr_mq")

	        # One chroma row per pitch class, one column per analysis frame.
	        chroma = librosa.feature.chroma_cqt(y=y, sr=sr)

	        # Average each pitch class over time and keep the strongest one.
	        chroma_mean = np.mean(chroma, axis=1)
	        key_index = int(np.argmax(chroma_mean))

	        return keys[key_index]

	    except Exception as e:
	        # Chain explicitly so callers can inspect the underlying failure.
	        raise RuntimeError(f"Error estimating key: {str(e)}") from e


	def key_to_semitones(key: str, target_key: str = "C") -> int:
	    """
	    Calculate the smallest semitone shift that moves one key onto another.

	    Key names are matched after stripping surrounding whitespace and
	    upper-casing the note letter. Besides the twelve sharp-based names
	    ('C', 'C#', ..., 'B'), the flat spellings 'Db', 'Eb', 'Gb', 'Ab' and 'Bb'
	    are accepted and treated as their enharmonic sharp equivalents.

	    Args:
	        key: Source key
	        target_key: Target key to align to (default: 'C')

	    Returns:
	        Number of semitones to shift, in the range -5..6. A tritone is always
	        reported as +6.

	    Raises:
	        ValueError: If either key name is not recognised.
	    """
	    keys = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
	    # Flat spellings and the sharp name that denotes the same pitch class.
	    flats = {"Db": "C#", "Eb": "D#", "Gb": "F#", "Ab": "G#", "Bb": "A#"}

	    def _pitch_class(name) -> int:
	        """Return the 0-11 index of ``name`` or raise ValueError."""
	        if isinstance(name, str):
	            cleaned = name.strip()
	            # Only the note letter is case-normalised; the accidental must
	            # already be '#' or 'b'.
	            canonical = cleaned[:1].upper() + cleaned[1:]
	            canonical = flats.get(canonical, canonical)
	            if canonical in keys:
	                return keys.index(canonical)
	        raise ValueError(f"Invalid key name: {name!r}")

	    key_index = _pitch_class(key)
	    target_index = _pitch_class(target_key)

	    # Wrap into 0..11, then fold the upper half down so that the shift with
	    # the smallest magnitude is chosen (e.g. +7 becomes -5).
	    semitones = (target_index - key_index) % 12
	    if semitones > 6:
	        semitones -= 12

	    return semitones


	def align_songs_by_key(
	    audio1_path: str,
	    audio2_path: str,
	    target_key: str = "C",
	    output_path: str = "output",
	    output_format: str = "wav",
	) -> Tuple[str, str]:
	    """
	    Align two songs to the same musical key by pitch shifting.

	    Each input is handed to shift_to_key, which estimates its current key,
	    computes the required semitone shift, loads the audio and writes the
	    shifted result. Nothing is decoded here, so each file is loaded only by
	    shift_to_key.

	    Args:
	        audio1_path: Path to first audio file (supports WAV, MP3, FLAC)
	        audio2_path: Path to second audio file (supports WAV, MP3, FLAC)
	        target_key: Target key to align both songs to (default: 'C')
	        output_path: Directory to save the aligned audio files
	        output_format: Output format ('wav' or 'mp3', default: 'wav')

	    Returns:
	        Tuple of (aligned_audio1_path, aligned_audio2_path) - paths to processed files

	    Raises:
	        RuntimeError: If processing either file fails; the original exception
	            is attached as ``__cause__``.

	    Note:
	        shift_to_key names its output after the input file's stem, so two
	        inputs that share a stem (e.g. ``a/song.wav`` and ``b/song.wav``)
	        resolve to the same output path and the second result replaces the
	        first.
	    """
	    try:
	        aligned1_path = shift_to_key(
	            audio1_path, target_key, output_path, output_format
	        )
	        aligned2_path = shift_to_key(
	            audio2_path, target_key, output_path, output_format
	        )

	        return aligned1_path, aligned2_path

	    except Exception as e:
	        raise RuntimeError(f"Error aligning audio keys: {str(e)}") from e


	def _export_mp3(samples: np.ndarray, sample_rate: float, destination: str) -> None:
	    """Encode ``samples`` as a 192 kbit/s MP3 at ``destination`` using the ffmpeg CLI."""
	    # Reserve a temporary path and close the handle immediately: the file is
	    # then written and read purely by name, and removed in ``finally``.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
	        wav_path = temp_wav.name

	    try:
	        sf.write(wav_path, samples, sample_rate, format="wav", subtype="PCM_16")

	        cmd = [
	            "ffmpeg",
	            "-y",
	            "-i",
	            wav_path,
	            "-c:a",
	            "libmp3lame",
	            "-b:a",
	            "192k",
	            destination,
	        ]
	        try:
	            subprocess.run(cmd, capture_output=True, check=True)
	        except subprocess.CalledProcessError as e:
	            # Surface ffmpeg's own diagnostics instead of only the exit status.
	            detail = e.stderr.decode(errors="replace").strip() if e.stderr else ""
	            raise RuntimeError(
	                f"ffmpeg MP3 encoding failed (exit status {e.returncode}): {detail}"
	            ) from e
	    finally:
	        # Always remove the intermediate WAV, including when encoding fails.
	        os.unlink(wav_path)


	def shift_to_key(
	    audio_path: str,
	    target_key: str,
	    output_path: str = "output",
	    output_format: str = "wav",
	) -> str:
	    """
	    Shift an audio file to a specific musical key.

	    The current key is estimated with estimate_key, the semitone distance to
	    ``target_key`` is computed with key_to_semitones, and the audio is pitch
	    shifted by that amount and written as 16-bit PCM WAV or as MP3.

	    Args:
	        audio_path: Path to audio file or URL (supports WAV, MP3, FLAC)
	        target_key: Target key to shift to
	        output_path: Directory to save the shifted audio file (created if missing)
	        output_format: Output format ('wav' or 'mp3', case-insensitive, default: 'wav')

	    Returns:
	        Path to the pitch-shifted audio file, named
	        ``<input stem>_shifted_to_<target_key>.<format>`` inside ``output_path``

	    Raises:
	        RuntimeError: If the output format is unsupported or any processing
	            step fails; the original exception is attached as ``__cause__``.
	    """
	    try:
	        # Reject unknown formats before doing any work: otherwise WAV data
	        # would be written under a misleading file extension.
	        fmt = output_format.lower()
	        if fmt not in ("wav", "mp3"):
	            raise ValueError(
	                f"Unsupported output format '{output_format}' (expected 'wav' or 'mp3')"
	            )

	        # Estimate current key
	        current_key = estimate_key(audio_path)

	        # Calculate semitone shift
	        semitones = key_to_semitones(current_key, target_key)

	        # Load and shift audio
	        y, sr = _load_audio(audio_path)
	        y_shifted = librosa.effects.pitch_shift(
	            y, n_steps=semitones, scale=True, sr=sr, res_type="soxr_vhq"
	        )

	        # Multi-channel audio comes back from librosa as (channels, samples);
	        # soundfile expects (frames, channels), so transpose before writing.
	        if y_shifted.ndim == 2:
	            y_shifted = y_shifted.T

	        os.makedirs(output_path, exist_ok=True)
	        final_audio_path = os.path.join(
	            output_path,
	            f"{Path(audio_path).stem}_shifted_to_{target_key}.{fmt}",
	        )

	        if fmt == "mp3":
	            _export_mp3(y_shifted, sr, final_audio_path)
	        else:
	            sf.write(final_audio_path, y_shifted, sr, format="wav", subtype="PCM_16")

	        return final_audio_path

	    except Exception as e:
	        raise RuntimeError(f"Error shifting key: {str(e)}") from e


	def _build_parser():
	    """Create the argument parser with the ``estimate``, ``align`` and ``shift`` sub-commands."""
	    import argparse

	    parser = argparse.ArgumentParser(
	        description="Pitch alignment tools for audio files"
	    )
	    subparsers = parser.add_subparsers(dest="command", help="Available commands")

	    # Estimate key of a single file
	    estimate_parser = subparsers.add_parser(
	        "estimate", help="Estimate the key of an audio file"
	    )
	    estimate_parser.add_argument("audio", help="Path to audio file")

	    # Align two songs by key
	    align_parser = subparsers.add_parser("align", help="Align two songs to same key")
	    align_parser.add_argument("audio1", help="Path to first audio file")
	    align_parser.add_argument("audio2", help="Path to second audio file")
	    align_parser.add_argument(
	        "--target-key", default="C", help="Target key to align to (default: C)"
	    )
	    align_parser.add_argument(
	        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
	    )

	    # Shift single file to key
	    shift_parser = subparsers.add_parser("shift", help="Shift audio to specific key")
	    shift_parser.add_argument("audio", help="Path to audio file")
	    shift_parser.add_argument("target_key", help="Target key to shift to")
	    shift_parser.add_argument(
	        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
	    )

	    return parser


	def main(argv=None) -> int:
	    """
	    Run the command-line interface.

	    Args:
	        argv: Argument list to parse; when None, argparse reads sys.argv[1:]

	    Returns:
	        Process exit status: 0 on success (or when only help was printed),
	        1 when the selected command raised an error
	    """
	    import sys

	    parser = _build_parser()
	    args = parser.parse_args(argv)

	    try:
	        if args.command == "estimate":
	            key = estimate_key(args.audio)
	            print(f"Estimated key: {key}")
	        elif args.command == "align":
	            aligned1, aligned2 = align_songs_by_key(
	                args.audio1, args.audio2, args.target_key, output_format=args.format
	            )
	            print(f"Aligned audio 1: {aligned1}")
	            print(f"Aligned audio 2: {aligned2}")
	        elif args.command == "shift":
	            output = shift_to_key(
	                args.audio, args.target_key, output_format=args.format
	            )
	            print(f"Shifted audio saved to: {output}")
	        else:
	            # No sub-command given.
	            parser.print_help()
	    except Exception as e:
	        # Top-level boundary: report on stderr and signal failure through the
	        # exit status instead of re-raising (which made the exit call unreachable).
	        print(f"Error: {e}", file=sys.stderr)
	        return 1

	    return 0


	if __name__ == "__main__":
	    raise SystemExit(main())