Jai-D's picture
Upload folder using huggingface_hub
bfc6d2a verified
"""Audio download and preprocessing for D9c inference."""
import tempfile
from pathlib import Path
from typing import Tuple
import librosa
import numpy as np
import requests
# Default sample rate for MERT/MuQ (hardcoded to avoid import issues)
TARGET_SR = 24000
class AudioDownloadError(Exception):
    """Signal that fetching the remote audio resource did not succeed."""
class AudioProcessingError(Exception):
    """Signal that decoding or validating the audio data did not succeed."""
def _guess_audio_suffix(content_type: str, audio_url: str) -> str:
    """Pick a temp-file suffix from the response content type or the URL path."""
    # Local import keeps this block self-contained.
    from urllib.parse import urlparse

    # Match on the URL *path* only, so a query string or fragment
    # (e.g. a signed URL ending in "?token=...") does not hide the extension.
    try:
        url_path = urlparse(audio_url).path.lower()
    except ValueError:
        url_path = ""
    media_type = content_type.lower()

    # Order matters: it mirrors the precedence mp3 -> wav -> flac.
    for marker, suffix in (("mpeg", ".mp3"), ("wav", ".wav"), ("flac", ".flac")):
        if marker in media_type or url_path.endswith(suffix):
            return suffix
    # Unknown type: fall back to .mp3.
    return ".mp3"


def download_and_preprocess_audio(
    audio_url: str,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
    timeout: int = 60,
) -> Tuple[np.ndarray, float]:
    """Download audio from a URL and preprocess it for MERT/MuQ.

    The response body is streamed to a temporary file, decoded with
    ``librosa.load`` (mono, resampled to ``target_sr``), and the temporary
    file is removed on every exit path.

    Args:
        audio_url: URL to download audio from.
        target_sr: Target sample rate (24kHz for MERT/MuQ).
        max_duration: Maximum audio duration in seconds.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        Tuple of (audio_array, duration_seconds).

    Raises:
        AudioDownloadError: If the request fails, returns an error status,
            or the connection breaks while the body is being streamed.
        AudioProcessingError: If decoding fails or the duration is outside
            the accepted range (1.0s to ``max_duration``).
    """
    # NOTE(review): audio_url is fetched as given; if it can come from
    # untrusted callers, consider restricting the allowed hosts.
    temp_path = None
    try:
        try:
            # Using the response as a context manager releases the streamed
            # connection even when writing the body fails part-way through.
            with requests.get(audio_url, timeout=timeout, stream=True) as response:
                response.raise_for_status()
                suffix = _guess_audio_suffix(
                    response.headers.get("content-type", ""), audio_url
                )
                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
                    # Record the path before writing so the outer ``finally``
                    # can remove a partially written file.
                    temp_path = Path(f.name)
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.RequestException as e:
            raise AudioDownloadError(f"Failed to download audio: {e}") from e

        try:
            audio, sr = librosa.load(temp_path, sr=target_sr, mono=True)
            duration = len(audio) / sr
        except Exception as e:
            raise AudioProcessingError(f"Failed to process audio: {e}") from e

        if duration > max_duration:
            raise AudioProcessingError(
                f"Audio too long: {duration:.1f}s > {max_duration}s limit"
            )
        if duration < 1.0:
            raise AudioProcessingError(
                f"Audio too short: {duration:.1f}s < 1.0s minimum"
            )
        return audio, duration
    finally:
        if temp_path is not None:
            temp_path.unlink(missing_ok=True)
def load_audio_from_file(
    audio_path: Path,
    target_sr: int = TARGET_SR,
) -> Tuple[np.ndarray, float]:
    """Decode a local audio file via ``librosa.load``.

    Args:
        audio_path: Path of the file to read.
        target_sr: Sample rate passed to ``librosa.load``.

    Returns:
        Tuple of (samples, duration_seconds), where the duration is the
        number of samples divided by the sample rate librosa reports.
    """
    samples, rate = librosa.load(audio_path, mono=True, sr=target_sr)
    return samples, len(samples) / rate
def preprocess_audio_from_bytes(
    audio_bytes: bytes,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
) -> Tuple[np.ndarray, float]:
    """Preprocess audio from raw bytes (e.g., base64 decoded).

    Args:
        audio_bytes: Encoded audio file contents held in memory.
        target_sr: Sample rate passed to ``librosa.load``.
        max_duration: Maximum audio duration in seconds.

    Returns:
        Tuple of (audio_array, duration_seconds).

    Raises:
        AudioProcessingError: If the input is empty, decoding fails, or the
            duration is outside the accepted range (1.0s to ``max_duration``).
    """
    import io

    # Fail fast with a clear message instead of an opaque decoder error.
    if not audio_bytes:
        raise AudioProcessingError("Failed to process audio bytes: empty input")

    try:
        audio, sr = librosa.load(io.BytesIO(audio_bytes), sr=target_sr, mono=True)
        duration = len(audio) / sr
    except Exception as e:
        # Chain the cause so the underlying decoder error stays in the traceback.
        raise AudioProcessingError(f"Failed to process audio bytes: {e}") from e

    if duration > max_duration:
        raise AudioProcessingError(
            f"Audio too long: {duration:.1f}s > {max_duration}s limit"
        )
    if duration < 1.0:
        raise AudioProcessingError(
            f"Audio too short: {duration:.1f}s < 1.0s minimum"
        )
    return audio, duration