# pdf_explainer/src/processors/simple_audio_processor.py
"""Simplified audio generation functionality that delegates complex processing to the TTS API."""
from typing import Tuple, Optional
import gradio as gr
import numpy as np
class SimpleAudioProcessor:
"""Simplified audio processor that uses the enhanced TTS API for complex processing."""
def __init__(self):
"""Initialize the simple audio processor."""
pass
    def generate_audio(self, explanation_text: str, progress=None) -> Tuple[Tuple[int, np.ndarray], dict]:
        """Generate TTS audio for explanations using the enhanced TTS API.

        This method sends the full text to the TTS API, which handles:
        - Text chunking
        - Parallel processing
        - Audio concatenation
        all on the server side with GPU acceleration.

        Args:
            explanation_text: The text to convert to audio.
            progress: Optional progress callback.

        Returns:
            Tuple of (audio_result, update_dict), where audio_result is (sample_rate, audio_data).
        """
        if not explanation_text or not explanation_text.strip():
            raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")

        try:
            clean_text = explanation_text.strip()

            if progress:
                progress(0.1, desc="Sending text to TTS API for processing...")

            # Import the simplified audio generation function.
            from .generate_simple_tts_audio import generate_simple_tts_audio

            # Generate audio using the new simplified API call.
            audio_result = generate_simple_tts_audio(clean_text, progress=progress)

            if progress:
                progress(1.0, desc="Audio generation complete!")

            return audio_result, gr.update(visible=True)
        except Exception as e:
            raise gr.Error(f"Error generating audio: {str(e)}")
    def get_processing_info(self, text: str) -> dict:
        """Get basic information about the text to be processed."""
        if not text or not text.strip():
            return {"error": "No text provided"}

        text_length = len(text.strip())
        estimated_chunks = max(1, text_length // 800)  # Rough estimate
        estimated_time = text_length * 0.05  # Rough estimate: 0.05 seconds per character

        return {
            "processing_mode": "server_side_parallel",
            "text_length": text_length,
            "estimated_chunks": estimated_chunks,
            "estimated_time_seconds": estimated_time,
            "estimated_time_readable": (
                f"{estimated_time:.1f} seconds"
                if estimated_time < 60
                else f"{estimated_time / 60:.1f} minutes"
            ),
            "note": "Processing handled by TTS API with GPU acceleration",
        }
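

# A minimal usage sketch, assuming the enhanced TTS API behind generate_simple_tts_audio
# is reachable and that this module is run as part of its package (for example via
# `python -m src.processors.simple_audio_processor`) so the relative import resolves.
# The sample text and variable names below are illustrative only.
if __name__ == "__main__":
    processor = SimpleAudioProcessor()
    sample_text = "Photosynthesis converts light energy into chemical energy stored in glucose."

    # Preview the server-side processing estimates before committing to a full TTS call.
    print(processor.get_processing_info(sample_text))

    # Generate audio; gr.Error is raised on empty input or on an API failure.
    (sample_rate, audio_data), visibility_update = processor.generate_audio(sample_text)
    print(f"Generated {audio_data.shape[0]} samples at {sample_rate} Hz")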