# Coild / prepare_dataset.py
# loko-dev's picture
# Add domain and subdomain handling in dataset preparation and admin routes; enhance data submission with new fields
# 92a1315
import os
import json
import pandas as pd
from datetime import datetime
from pathlib import Path
import wave
import struct
from io import BytesIO
from dotenv import load_dotenv
from huggingface_hub import HfApi
import logging
from database_manager import store_metadata, engine
from language_config import get_all_languages
from sqlalchemy import text
# Configure logging: set up the root logger at INFO level and create the
# module-level logger used by everything below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Load environment variables through python-dotenv (presumably from a local
# .env file — confirm deployment setup) so that os.getenv lookups such as
# SAVE_LOCALLY can see them.
load_dotenv()
# Define base directories. The path is relative, so it resolves against the
# current working directory of the running process.
BASE_DIR = Path('datasets') # Base directory for all data
def should_save_locally():
    """Report whether files should be persisted on the local filesystem.

    The decision comes from the ``SAVE_LOCALLY`` environment variable, which
    defaults to ``"true"`` when unset. The comparison is case-insensitive and
    only the exact word ``true`` enables local saving.
    """
    flag = os.getenv("SAVE_LOCALLY", "true")
    return flag.lower() == "true"
class AudioDatasetPreparator:
    """Hold speaker metadata and write recordings to local storage.

    Local file operations only happen when ``should_save_locally()`` is
    true; otherwise the file-writing methods return without touching disk.
    """

    # Column layout of the recordings frame; also used for any new
    # per-language frame created by ``_get_language_df``.
    # NOTE(review): ``domain`` and ``subdomain`` are tracked on the instance
    # but are not among these columns — confirm whether they should be
    # persisted as well.
    RECORDING_COLUMNS = (
        'user_id', 'audio_filename', 'transcription',
        'speaker_name', 'speaker_id', 'audio_path',
        'sampling_rate', 'duration', 'language',
        'gender', 'country', 'state', 'city', 'verified',
        'username', 'timestamp', 'age_group', 'accent',
    )

    def __init__(self, transcripts, user_id='anonymous'):
        """Create a preparator with empty speaker metadata.

        Args:
            transcripts: Stored as-is on ``self.transcripts``.
            user_id: Identifier of the contributing user.
        """
        self.transcripts = transcripts
        self.user_id = user_id
        self.speaker_name = ""
        self.gender = ""
        self.language = ""
        self.country = ""
        self.state = ""
        self.city = ""
        self.age_group = ""
        self.accent = ""
        self.domain = "GEN"  # Default domain
        self.subdomain = "GEN"  # Default subdomain

        # Always create the state that _get_language_df relies on, even
        # when local saving is disabled, so it can never hit an
        # AttributeError on a missing attribute.
        self.language_dfs = {}
        self.recordings_df = self._new_recordings_frame()

        # Initialize storage if needed
        if should_save_locally():
            self._initialize_storage()

    def _new_recordings_frame(self):
        """Return an empty DataFrame with the ``RECORDING_COLUMNS`` layout."""
        return pd.DataFrame(columns=list(self.RECORDING_COLUMNS))

    def _initialize_storage(self):
        """Create the base data directory and reset the recordings frame.

        Raises:
            Exception: Any error is logged and then re-raised unchanged.
        """
        try:
            BASE_DIR.mkdir(parents=True, exist_ok=True)
            self.recordings_df = self._new_recordings_frame()
            logger.info("Storage initialized successfully")
        except Exception as e:
            logger.error("Error initializing storage: %s", e)
            raise

    def _get_language_df(self, language):
        """Get or create the DataFrame cached for ``language``.

        On first use the ``<BASE_DIR>/<language>/audio`` directories are
        created and ``<language>.parquet`` is loaded if it exists; otherwise
        an empty frame with the recordings columns is created.
        """
        if language not in self.language_dfs:
            lang_dir = BASE_DIR / language
            # parents=True also creates BASE_DIR and lang_dir, which may not
            # exist yet if storage was never initialized.
            (lang_dir / 'audio').mkdir(parents=True, exist_ok=True)

            parquet_path = lang_dir / f"{language}.parquet"
            if parquet_path.exists():
                self.language_dfs[language] = pd.read_parquet(parquet_path)
            else:
                self.language_dfs[language] = pd.DataFrame(
                    columns=self.recordings_df.columns
                )
        return self.language_dfs[language]

    def add_metadata(self, recording_data):
        """Only handle local file operations if needed.

        No local metadata handling is implemented here: the method returns
        ``None`` whether or not local saving is enabled.
        """
        if not should_save_locally():
            return

    @staticmethod
    def _apply_fade_and_trim(pcm_data, sample_rate, channels):
        """Apply fade-in, end trim and fade-out to 16-bit little-endian PCM.

        All lengths are computed in whole frames (one sample per channel) so
        every channel of a frame gets the same gain and the result never
        ends in a partial frame. For mono input with an even byte count the
        output is identical to per-sample processing.

        Args:
            pcm_data: ``bytes`` or ``bytearray`` of interleaved int16 samples.
                Trailing bytes that do not form a complete frame are dropped.
            sample_rate: Frames per second.
            channels: Number of interleaved channels (must be >= 1).

        Returns:
            The processed PCM data as ``bytes``.

        Raises:
            ValueError: If ``channels`` is less than 1.
        """
        import array
        import sys

        if channels < 1:
            raise ValueError("channels must be a positive integer")

        # Drop any trailing partial frame; array.frombytes rejects input
        # whose length is not a multiple of the item size.
        frame_bytes = 2 * channels
        usable_bytes = len(pcm_data) - (len(pcm_data) % frame_bytes)

        samples = array.array('h')
        samples.frombytes(pcm_data[:usable_bytes])

        # WAV PCM is little-endian while array('h') uses native order.
        swap_bytes = sys.byteorder == 'big'
        if swap_bytes:
            samples.byteswap()

        num_frames = len(samples) // channels
        # Each stage is capped to a fraction of the clip so short recordings
        # are not swallowed by the fades.
        fade_in_frames = min(int(sample_rate * 0.3), int(num_frames * 0.1))     # 300ms fade in
        end_trim_frames = min(int(sample_rate * 0.15), int(num_frames * 0.05))  # 150ms end trim
        fade_out_frames = min(int(sample_rate * 0.15), int(num_frames * 0.04))  # 150ms fade out

        def scale_frame(frame_index, gain):
            start = frame_index * channels
            for offset in range(start, start + channels):
                samples[offset] = int(samples[offset] * gain)

        # Step 1: cubic ease-in over the first frames.
        for frame in range(fade_in_frames):
            ratio = frame / fade_in_frames
            scale_frame(frame, ratio * ratio * ratio)

        # Step 2: length that remains after removing the end trim.
        kept_frames = max(0, num_frames - end_trim_frames)

        # Step 3: cubic ease-out ending exactly at the trim point. The caps
        # above (4% fade-out, 5% trim) keep the start index non-negative.
        fade_out_start = kept_frames - fade_out_frames
        for step in range(fade_out_frames):
            ratio = 1 - (step / fade_out_frames)
            scale_frame(fade_out_start + step, ratio * ratio * ratio)

        # Step 4: cut off the trimmed tail and serialize.
        trimmed = samples[:kept_frames * channels]
        if swap_bytes:
            trimmed.byteswap()
        return trimmed.tobytes()

    def save_audio(self, pcm_data, sample_rate, filename, bits_per_sample=16, channels=1, already_processed=False):
        """Save PCM data as a WAV file in the language-specific audio folder.

        The file is written to ``<BASE_DIR>/<self.language>/audio/<filename>``
        with the standard-library ``wave`` module. Unless ``already_processed``
        is true, 16-bit ``bytes``/``bytearray`` input first gets a fade-in,
        an end trim and a fade-out; other inputs are written unchanged.

        Args:
            pcm_data: Raw PCM frames to write.
            sample_rate: Frames per second.
            filename: Name of the output file inside the audio folder.
            bits_per_sample: Sample width in bits.
            channels: Number of interleaved channels.
            already_processed: Skip server-side processing when true.

        Returns:
            The path of the written file as a string, or ``None`` when local
            saving is disabled or any error occurs (the error is logged).
        """
        if not should_save_locally():
            return None

        try:
            audio_dir = BASE_DIR / self.language / 'audio'
            audio_dir.mkdir(parents=True, exist_ok=True)

            # NOTE(review): filename is joined unchecked — confirm callers
            # never pass absolute paths or ".." segments.
            filepath = audio_dir / filename

            # Apply server-side processing if not already done client-side
            needs_processing = (
                not already_processed
                and bits_per_sample == 16
                and isinstance(pcm_data, (bytes, bytearray))
            )
            if needs_processing:
                pcm_data = self._apply_fade_and_trim(pcm_data, sample_rate, channels)
                logger.info("Server-side processing: applied 300ms fade-in, 150ms end trim and 150ms fade-out")

            with wave.open(str(filepath), 'wb') as wav_file:
                wav_file.setnchannels(channels)
                wav_file.setsampwidth(bits_per_sample // 8)
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(pcm_data)

            logger.info("Saved audio file: %s", filepath)
            return str(filepath)
        except Exception as e:
            # Best-effort by design: callers receive None instead of an error.
            logger.error("Error saving audio file: %s", e)
            return None