| | """ |
| | Audio Preprocessing Module |
| | |
| | Enhances audio quality before source separation and transcription. |
| | |
| | Preprocessing Steps: |
| | 1. Spectral denoising - Remove background noise and artifacts |
| | 2. Peak normalization - Normalize volume to consistent level |
| | 3. High-pass filtering - Remove rumble and DC offset |
| | 4. Resampling - Ensure consistent sample rate |
| | |
| | Target: +2-5% accuracy improvement on noisy/compressed YouTube audio |
| | """ |
| |
|
| | from pathlib import Path |
| | from typing import Optional |
| | import numpy as np |
| | import librosa |
| | import soundfile as sf |
| |
|
| |
|
| | class AudioPreprocessor: |
| | """ |
| | Audio preprocessing for improving transcription accuracy. |
| | |
| | Mitigates common issues with YouTube audio: |
| | - Compression artifacts (lossy codecs) |
| | - Background noise (ambient, microphone noise) |
| | - Inconsistent levels (quiet vs loud recordings) |
| | - Low-frequency rumble (not musical, degrades separation) |
| | """ |
| |
|
| | def __init__( |
| | self, |
| | enable_denoising: bool = True, |
| | enable_normalization: bool = True, |
| | enable_highpass: bool = True, |
| | target_sample_rate: int = 44100 |
| | ): |
| | """ |
| | Initialize audio preprocessor. |
| | |
| | Args: |
| | enable_denoising: Enable spectral denoising |
| | enable_normalization: Enable peak normalization |
| | enable_highpass: Enable high-pass filter (remove rumble) |
| | target_sample_rate: Target sample rate (Hz) |
| | """ |
| | self.enable_denoising = enable_denoising |
| | self.enable_normalization = enable_normalization |
| | self.enable_highpass = enable_highpass |
| | self.target_sample_rate = target_sample_rate |
| |
|
| | def preprocess( |
| | self, |
| | audio_path: Path, |
| | output_dir: Optional[Path] = None |
| | ) -> Path: |
| | """ |
| | Preprocess audio file for improved transcription quality. |
| | |
| | Args: |
| | audio_path: Input audio file |
| | output_dir: Output directory (default: same as input) |
| | |
| | Returns: |
| | Path to preprocessed audio file |
| | """ |
| | if output_dir is None: |
| | output_dir = audio_path.parent |
| | output_dir.mkdir(parents=True, exist_ok=True) |
| |
|
| | output_path = output_dir / f"{audio_path.stem}_preprocessed.wav" |
| |
|
| | print(f" Preprocessing audio: {audio_path.name}") |
| |
|
| | |
| | y, sr = librosa.load(str(audio_path), sr=None, mono=False) |
| |
|
| | |
| | if y.ndim == 2: |
| | print(f" Input: stereo, {sr}Hz") |
| | is_stereo = True |
| | else: |
| | print(f" Input: mono, {sr}Hz") |
| | is_stereo = False |
| | y = np.expand_dims(y, axis=0) |
| |
|
| | |
| | if self.enable_denoising: |
| | print(f" Applying spectral denoising...") |
| | y = self._denoise(y, sr, is_stereo) |
| |
|
| | |
| | if self.enable_normalization: |
| | print(f" Normalizing volume...") |
| | y = self._normalize(y) |
| |
|
| | |
| | if self.enable_highpass: |
| | print(f" Applying high-pass filter (30Hz cutoff)...") |
| | y = self._highpass_filter(y, sr) |
| |
|
| | |
| | if sr != self.target_sample_rate: |
| | print(f" Resampling: {sr}Hz → {self.target_sample_rate}Hz") |
| | y = self._resample(y, sr, self.target_sample_rate) |
| | sr = self.target_sample_rate |
| |
|
| | |
| | if not is_stereo: |
| | y = y[0] |
| |
|
| | |
| | sf.write(output_path, y.T if is_stereo else y, sr) |
| | print(f" ✓ Preprocessed audio saved: {output_path.name}") |
| |
|
| | return output_path |
| |
|
| | def _denoise(self, y: np.ndarray, sr: int, is_stereo: bool) -> np.ndarray: |
| | """ |
| | Apply spectral denoising using noisereduce library. |
| | |
| | Args: |
| | y: Audio data (channels, samples) |
| | sr: Sample rate |
| | is_stereo: Whether audio is stereo |
| | |
| | Returns: |
| | Denoised audio |
| | """ |
| | try: |
| | import noisereduce as nr |
| | except ImportError: |
| | print(f" ⚠ noisereduce not installed, skipping denoising") |
| | return y |
| |
|
| | |
| | y_denoised = np.zeros_like(y) |
| |
|
| | for ch in range(y.shape[0]): |
| | y_denoised[ch] = nr.reduce_noise( |
| | y=y[ch], |
| | sr=sr, |
| | stationary=True, |
| | prop_decrease=0.8 |
| | ) |
| |
|
| | return y_denoised |
| |
|
| | def _normalize(self, y: np.ndarray, target_db: float = -1.0) -> np.ndarray: |
| | """ |
| | Normalize audio to target peak level. |
| | |
| | Args: |
| | y: Audio data |
| | target_db: Target peak level in dB (default: -1dB = almost full scale) |
| | |
| | Returns: |
| | Normalized audio |
| | """ |
| | |
| | peak = np.abs(y).max() |
| |
|
| | if peak == 0: |
| | return y |
| |
|
| | |
| | target_linear = 10 ** (target_db / 20.0) |
| | gain = target_linear / peak |
| |
|
| | return y * gain |
| |
|
| | def _highpass_filter( |
| | self, |
| | y: np.ndarray, |
| | sr: int, |
| | cutoff_hz: float = 30.0 |
| | ) -> np.ndarray: |
| | """ |
| | Apply high-pass filter to remove low-frequency rumble. |
| | |
| | Args: |
| | y: Audio data (channels, samples) |
| | sr: Sample rate |
| | cutoff_hz: Cutoff frequency (Hz) |
| | |
| | Returns: |
| | Filtered audio |
| | """ |
| | from scipy.signal import butter, sosfilt |
| |
|
| | |
| | sos = butter(4, cutoff_hz, 'hp', fs=sr, output='sos') |
| |
|
| | |
| | y_filtered = np.zeros_like(y) |
| |
|
| | for ch in range(y.shape[0]): |
| | y_filtered[ch] = sosfilt(sos, y[ch]) |
| |
|
| | return y_filtered |
| |
|
| | def _resample( |
| | self, |
| | y: np.ndarray, |
| | orig_sr: int, |
| | target_sr: int |
| | ) -> np.ndarray: |
| | """ |
| | Resample audio to target sample rate. |
| | |
| | Args: |
| | y: Audio data (channels, samples) |
| | orig_sr: Original sample rate |
| | target_sr: Target sample rate |
| | |
| | Returns: |
| | Resampled audio |
| | """ |
| | y_resampled = np.zeros((y.shape[0], int(y.shape[1] * target_sr / orig_sr))) |
| |
|
| | for ch in range(y.shape[0]): |
| | y_resampled[ch] = librosa.resample( |
| | y[ch], |
| | orig_sr=orig_sr, |
| | target_sr=target_sr |
| | ) |
| |
|
| | return y_resampled |
| |
|
| |
|
| | if __name__ == "__main__": |
| | |
| | import argparse |
| |
|
| | parser = argparse.ArgumentParser(description="Test Audio Preprocessor") |
| | parser.add_argument("audio_file", type=str, help="Path to audio file") |
| | parser.add_argument("--output", type=str, default="./output_audio", |
| | help="Output directory for preprocessed audio") |
| | parser.add_argument("--no-denoise", action="store_true", |
| | help="Disable denoising") |
| | parser.add_argument("--no-normalize", action="store_true", |
| | help="Disable normalization") |
| | parser.add_argument("--no-highpass", action="store_true", |
| | help="Disable high-pass filter") |
| | args = parser.parse_args() |
| |
|
| | preprocessor = AudioPreprocessor( |
| | enable_denoising=not args.no_denoise, |
| | enable_normalization=not args.no_normalize, |
| | enable_highpass=not args.no_highpass |
| | ) |
| |
|
| | audio_path = Path(args.audio_file) |
| | output_dir = Path(args.output) |
| |
|
| | |
| | output_path = preprocessor.preprocess(audio_path, output_dir) |
| | print(f"\n✓ Preprocessing complete: {output_path}") |
| |
|