|
import os |
|
import json |
|
import pandas as pd |
|
from datetime import datetime |
|
from pathlib import Path |
|
import wave |
|
import struct |
|
from io import BytesIO |
|
from dotenv import load_dotenv |
|
from huggingface_hub import HfApi |
|
import logging |
|
from database_manager import store_metadata, engine |
|
from language_config import get_all_languages |
|
from sqlalchemy import text |
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
load_dotenv() |
|
|
|
|
|
BASE_DIR = Path('datasets') |
|
|
|
def should_save_locally(): |
|
"""Check if we should save files locally""" |
|
return os.getenv("SAVE_LOCALLY", "true").lower() == "true" |
|
|
|
class AudioDatasetPreparator: |
|
def __init__(self, transcripts, user_id='anonymous'): |
|
self.transcripts = transcripts |
|
self.user_id = user_id |
|
self.speaker_name = "" |
|
self.gender = "" |
|
self.language = "" |
|
self.country = "" |
|
self.state = "" |
|
self.city = "" |
|
self.age_group = "" |
|
self.accent = "" |
|
self.domain = "GEN" |
|
self.subdomain = "GEN" |
|
|
|
|
|
if should_save_locally(): |
|
self._initialize_storage() |
|
|
|
def _initialize_storage(self): |
|
"""Initialize storage directories""" |
|
try: |
|
|
|
BASE_DIR.mkdir(exist_ok=True) |
|
|
|
|
|
self.recordings_df = pd.DataFrame(columns=[ |
|
'user_id', 'audio_filename', 'transcription', |
|
'speaker_name', 'speaker_id', 'audio_path', |
|
'sampling_rate', 'duration', 'language', |
|
'gender', 'country', 'state', 'city', 'verified', |
|
'username', 'timestamp', 'age_group', 'accent' |
|
]) |
|
|
|
logger.info("Storage initialized successfully") |
|
except Exception as e: |
|
logger.error(f"Error initializing storage: {e}") |
|
raise |
|
|
|
def _get_language_df(self, language): |
|
"""Get or create language-specific DataFrame""" |
|
if language not in self.language_dfs: |
|
|
|
lang_dir = BASE_DIR / language |
|
lang_dir.mkdir(exist_ok=True) |
|
|
|
audio_dir = lang_dir / 'audio' |
|
audio_dir.mkdir(exist_ok=True) |
|
|
|
parquet_path = lang_dir / f"{language}.parquet" |
|
|
|
if parquet_path.exists(): |
|
self.language_dfs[language] = pd.read_parquet(parquet_path) |
|
else: |
|
self.language_dfs[language] = pd.DataFrame(columns=self.recordings_df.columns) |
|
|
|
return self.language_dfs[language] |
|
|
|
def add_metadata(self, recording_data): |
|
"""Only handle local file operations if needed""" |
|
if not should_save_locally(): |
|
return |
|
|
|
def save_audio(self, pcm_data, sample_rate, filename, bits_per_sample=16, channels=1, already_processed=False): |
|
"""Save audio file in language-specific audio folder using built-in wave module with processing""" |
|
if not should_save_locally(): |
|
return None |
|
|
|
try: |
|
|
|
lang_dir = BASE_DIR / self.language |
|
audio_dir = lang_dir / 'audio' |
|
audio_dir.mkdir(parents=True, exist_ok=True) |
|
|
|
|
|
filepath = audio_dir / filename |
|
|
|
|
|
if not already_processed and isinstance(pcm_data, (bytes, bytearray)): |
|
|
|
bytes_per_sample = bits_per_sample // 8 |
|
num_samples = len(pcm_data) // bytes_per_sample |
|
|
|
if bits_per_sample == 16: |
|
|
|
import array |
|
int16_data = array.array('h') |
|
int16_data.frombytes(pcm_data) |
|
|
|
|
|
samples_per_second = sample_rate * channels |
|
fade_in_samples = min(int(samples_per_second * 0.3), int(num_samples * 0.1)) |
|
end_trim_samples = min(int(samples_per_second * 0.15), int(num_samples * 0.05)) |
|
fade_out_samples = min(int(samples_per_second * 0.15), int(num_samples * 0.04)) |
|
|
|
|
|
for i in range(fade_in_samples): |
|
fade_ratio = i / fade_in_samples |
|
|
|
smooth_fade = fade_ratio * fade_ratio * fade_ratio |
|
int16_data[i] = int(int16_data[i] * smooth_fade) |
|
|
|
|
|
trimmed_length = max(0, num_samples - end_trim_samples) |
|
|
|
|
|
fade_out_start = trimmed_length - fade_out_samples |
|
for i in range(fade_out_samples): |
|
if fade_out_start + i >= trimmed_length: |
|
break |
|
fade_ratio = 1 - (i / fade_out_samples) |
|
|
|
smooth_fade = fade_ratio * fade_ratio * fade_ratio |
|
int16_data[fade_out_start + i] = int(int16_data[fade_out_start + i] * smooth_fade) |
|
|
|
|
|
pcm_data = int16_data[:trimmed_length].tobytes() |
|
logger.info(f"Server-side processing: applied 300ms fade-in, 150ms end trim and 150ms fade-out") |
|
|
|
|
|
with wave.open(str(filepath), 'wb') as wav_file: |
|
|
|
wav_file.setnchannels(channels) |
|
wav_file.setsampwidth(bits_per_sample // 8) |
|
wav_file.setframerate(sample_rate) |
|
|
|
|
|
wav_file.writeframes(pcm_data) |
|
|
|
logger.info(f"Saved audio file: {filepath}") |
|
return str(filepath) |
|
except Exception as e: |
|
logger.error(f"Error saving audio file: {e}") |
|
return None |