# daasime's picture
# Add SOP Audio Analyzer app files
# ebba35f
"""
Audio preprocessor - normalize audio for analysis.
"""
import torch
import torchaudio
import numpy as np
from pathlib import Path
from typing import Tuple, Optional
import tempfile
import os
class AudioPreprocessor:
    """Normalize audio to a standard format for analysis.

    Waveforms are handled as 2-D tensors indexed ``[channel, sample]``:
    the first dimension is read as the channel count and the second as the
    number of samples throughout this class.
    """

    # Sample rate (Hz) every normalized waveform is resampled to.
    TARGET_SAMPLE_RATE = 16000
    # Channel count after normalization (mono).
    TARGET_CHANNELS = 1

    def __init__(self):
        """Create a preprocessor; it holds no per-instance state."""

    def load_audio(self, audio_path: str) -> Tuple[torch.Tensor, int]:
        """Load an audio file from disk.

        Args:
            audio_path: Path of the audio file to read.

        Returns:
            Tuple of (waveform, sample_rate) as returned by
            ``torchaudio.load``.
        """
        # Use soundfile backend to avoid torchcodec dependency
        waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
        return waveform, sample_rate

    def normalize(self, waveform: torch.Tensor, sample_rate: int) -> Tuple[torch.Tensor, int]:
        """Normalize audio to mono, 16 kHz, peak amplitude 0.95.

        Args:
            waveform: Audio tensor indexed ``[channel, sample]``.
            sample_rate: Sample rate of ``waveform`` in Hz.

        Returns:
            Tuple of (normalized_waveform, target_sample_rate).
        """
        # Down-mix to mono by averaging the channels.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample only when the input is not already at the target rate.
        if sample_rate != self.TARGET_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate,
                new_freq=self.TARGET_SAMPLE_RATE,
            )
            waveform = resampler(waveform)

        # Peak-normalize; an all-zero (silent) signal is left untouched to
        # avoid dividing by zero.
        max_amp = waveform.abs().max()
        if max_amp > 0:
            waveform = waveform / max_amp * 0.95

        return waveform, self.TARGET_SAMPLE_RATE

    def process_file(self, audio_path: str, output_path: Optional[str] = None) -> Tuple[torch.Tensor, int, dict]:
        """Load and normalize an audio file, optionally saving the result.

        Args:
            audio_path: Path of the audio file to read.
            output_path: If given (and non-empty), the normalized audio is
                written to this path.

        Returns:
            Tuple of (waveform, sample_rate, metadata), where ``metadata``
            records the original sample rate, channel count and duration
            plus the normalized sample rate and duration.
        """
        waveform, orig_sr = self.load_audio(audio_path)
        orig_duration = waveform.shape[1] / orig_sr
        orig_channels = waveform.shape[0]

        waveform, sample_rate = self.normalize(waveform, orig_sr)

        # Delegate to save_audio so the parent directory is created the same
        # way as for direct callers of save_audio.
        if output_path:
            self.save_audio(waveform, sample_rate, output_path)

        metadata = {
            'original_sample_rate': orig_sr,
            'original_channels': orig_channels,
            'original_duration': orig_duration,
            'normalized_sample_rate': sample_rate,
            'normalized_duration': waveform.shape[1] / sample_rate,
        }
        return waveform, sample_rate, metadata

    def get_duration(self, waveform: torch.Tensor, sample_rate: int) -> float:
        """Return the duration of ``waveform`` in seconds."""
        return waveform.shape[1] / sample_rate

    def save_audio(self, waveform: torch.Tensor, sample_rate: int, output_path: str):
        """Save audio to ``output_path``, creating its parent directory.

        Args:
            waveform: Audio tensor to write.
            sample_rate: Sample rate of ``waveform`` in Hz.
            output_path: Destination file path.
        """
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when one is named.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        torchaudio.save(output_path, waveform, sample_rate)

    def extract_segment(self, waveform: torch.Tensor, sample_rate: int,
                        start: float, end: float) -> torch.Tensor:
        """Extract the samples between ``start`` and ``end`` seconds.

        Args:
            waveform: Audio tensor indexed ``[channel, sample]``.
            sample_rate: Sample rate of ``waveform`` in Hz.
            start: Segment start time in seconds.
            end: Segment end time in seconds.

        Returns:
            The slice of ``waveform`` covering the requested interval. Times
            before the beginning of the audio are clamped to 0, and an
            interval that ends before it starts yields an empty segment.
        """
        # Clamp to non-negative indices: a negative index would otherwise be
        # interpreted by slicing as an offset from the END of the waveform.
        start_sample = max(0, int(start * sample_rate))
        end_sample = max(start_sample, int(end * sample_rate))
        return waveform[:, start_sample:end_sample]