| """Audio download and preprocessing for D9c inference.""" |
|
|
| import tempfile |
| from pathlib import Path |
| from typing import Tuple |
|
|
| import librosa |
| import numpy as np |
| import requests |
|
|
| |
# Default sample rate in Hz (24 kHz) that the loaders in this module pass to
# librosa as the resampling target.
TARGET_SR = 24000
|
|
|
|
class AudioDownloadError(Exception):
    """Signals that fetching the audio over the network did not succeed."""
|
|
|
|
class AudioProcessingError(Exception):
    """Signals that decoding or validating the audio did not succeed."""
|
|
|
|
def _suffix_for_audio(content_type: str, audio_url: str) -> str:
    """Pick a temp-file extension from the Content-Type header or URL ending."""
    for marker, suffix in (("mpeg", ".mp3"), ("wav", ".wav"), ("flac", ".flac")):
        if marker in content_type or audio_url.endswith(suffix):
            return suffix
    # Anything unrecognised is written out as .mp3, the existing fallback.
    return ".mp3"


def download_and_preprocess_audio(
    audio_url: str,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
    timeout: int = 60,
) -> Tuple[np.ndarray, float]:
    """Download audio from URL and preprocess for MERT/MuQ.

    The response body is streamed into a temporary file, decoded to mono at
    ``target_sr`` with librosa, and checked against the duration limits. The
    temporary file is removed on every exit path, including failed downloads.

    Args:
        audio_url: URL to download audio from
        target_sr: Target sample rate (24kHz for MERT/MuQ)
        max_duration: Maximum audio duration in seconds
        timeout: Download timeout in seconds

    Returns:
        Tuple of (audio_array, duration_seconds)

    Raises:
        AudioDownloadError: If the request fails, returns an error status, or
            the connection breaks while the body is being streamed
        AudioProcessingError: If the file cannot be decoded, or its duration
            is above ``max_duration`` or below 1.0 second
    """
    temp_path = None
    try:
        try:
            # Using the response as a context manager releases the streamed
            # connection even when the transfer fails part-way through.
            with requests.get(audio_url, timeout=timeout, stream=True) as response:
                response.raise_for_status()
                suffix = _suffix_for_audio(
                    response.headers.get("content-type", ""), audio_url
                )
                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
                    # Record the path before writing so the outer `finally`
                    # can delete a partially written file.
                    temp_path = Path(f.name)
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.RequestException as e:
            raise AudioDownloadError(f"Failed to download audio: {e}") from e

        # Only the decode step is wrapped broadly; validation happens below so
        # its own AudioProcessingError never needs to be re-raised.
        try:
            audio, sr = librosa.load(temp_path, sr=target_sr, mono=True)
        except Exception as e:
            raise AudioProcessingError(f"Failed to process audio: {e}") from e
    finally:
        if temp_path is not None:
            temp_path.unlink(missing_ok=True)

    duration = len(audio) / sr

    if duration > max_duration:
        raise AudioProcessingError(
            f"Audio too long: {duration:.1f}s > {max_duration}s limit"
        )

    if duration < 1.0:
        raise AudioProcessingError(
            f"Audio too short: {duration:.1f}s < 1.0s minimum"
        )

    return audio, duration
|
|
|
|
def load_audio_from_file(
    audio_path: Path,
    target_sr: int = TARGET_SR,
) -> Tuple[np.ndarray, float]:
    """Decode a local audio file to mono at ``target_sr``.

    No duration limits are applied here, and any error raised by librosa
    propagates to the caller unchanged.

    Args:
        audio_path: Path of the audio file to read
        target_sr: Sample rate handed to librosa as the resampling target

    Returns:
        Tuple of (audio_array, duration_seconds)
    """
    samples, sample_rate = librosa.load(audio_path, sr=target_sr, mono=True)
    return samples, len(samples) / sample_rate
|
|
|
|
def preprocess_audio_from_bytes(
    audio_bytes: bytes,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
) -> Tuple[np.ndarray, float]:
    """Preprocess audio from raw bytes (e.g., base64 decoded).

    The bytes are decoded in memory to mono at ``target_sr`` with librosa and
    then checked against the duration limits.

    Args:
        audio_bytes: Encoded audio file contents
        target_sr: Sample rate handed to librosa as the resampling target
        max_duration: Maximum audio duration in seconds

    Returns:
        Tuple of (audio_array, duration_seconds)

    Raises:
        AudioProcessingError: If the payload is empty, cannot be decoded, or
            its duration is above ``max_duration`` or below 1.0 second
    """
    import io

    # Reject an empty payload up front with a clear message instead of
    # surfacing whatever the decoder backend reports for zero bytes.
    if not audio_bytes:
        raise AudioProcessingError("Failed to process audio bytes: empty input")

    # Only the decode step is wrapped broadly; validation happens below so its
    # own AudioProcessingError never needs to be re-raised.
    try:
        audio, sr = librosa.load(io.BytesIO(audio_bytes), sr=target_sr, mono=True)
    except Exception as e:
        raise AudioProcessingError(f"Failed to process audio bytes: {e}") from e

    duration = len(audio) / sr

    if duration > max_duration:
        raise AudioProcessingError(
            f"Audio too long: {duration:.1f}s > {max_duration}s limit"
        )

    if duration < 1.0:
        raise AudioProcessingError(
            f"Audio too short: {duration:.1f}s < 1.0s minimum"
        )

    return audio, duration
|
|