# pdf_explainer/src/processors/simple_audio_processor.py
"""Simplified audio generation functionality that delegates complex processing to the TTS API."""
from typing import Tuple, Optional
import gradio as gr
import numpy as np
class SimpleAudioProcessor:
"""Simplified audio processor that uses the enhanced TTS API for complex processing."""
def __init__(self):
"""Initialize the simple audio processor."""
pass
    def generate_audio(self, explanation_text: str, progress=None) -> Tuple[Tuple[int, np.ndarray], dict]:
        """Generate TTS audio for explanations using the enhanced TTS API.

        This method sends the full text to the TTS API, which handles:
        - Text chunking
        - Parallel processing
        - Audio concatenation
        all on the server side with GPU acceleration.

        Args:
            explanation_text: The text to convert to audio.
            progress: Optional progress callback.

        Returns:
            Tuple of (audio_result, update_dict), where audio_result is (sample_rate, audio_data).
        """
        if not explanation_text or not explanation_text.strip():
            raise gr.Error("No explanations available to convert to audio. Please generate explanations first.")

        try:
            clean_text = explanation_text.strip()

            if progress:
                progress(0.1, desc="Sending text to TTS API for processing...")

            # Import the simplified audio generation function.
            from .generate_simple_tts_audio import generate_simple_tts_audio

            # Generate audio using the new simplified API call.
            audio_result = generate_simple_tts_audio(clean_text, progress=progress)

            if progress:
                progress(1.0, desc="Audio generation complete!")

            return audio_result, gr.update(visible=True)
        except Exception as e:
            raise gr.Error(f"Error generating audio: {str(e)}")
    def get_processing_info(self, text: str) -> dict:
        """Get basic information about the text to be processed."""
        if not text or not text.strip():
            return {"error": "No text provided"}

        text_length = len(text.strip())
        estimated_chunks = max(1, text_length // 800)  # Rough estimate
        estimated_time = text_length * 0.05  # Rough estimate: 0.05 seconds per character

        return {
            "processing_mode": "server_side_parallel",
            "text_length": text_length,
            "estimated_chunks": estimated_chunks,
            "estimated_time_seconds": estimated_time,
            "estimated_time_readable": (
                f"{estimated_time:.1f} seconds"
                if estimated_time < 60
                else f"{estimated_time / 60:.1f} minutes"
            ),
            "note": "Processing handled by TTS API with GPU acceleration",
        }
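

# A minimal usage sketch, assuming the enhanced TTS API behind generate_simple_tts_audio
# is reachable and that this module is run as part of its package (for example via
# `python -m src.processors.simple_audio_processor`) so the relative import resolves.
# The sample text and variable names below are illustrative only.
if __name__ == "__main__":
    processor = SimpleAudioProcessor()
    sample_text = "Photosynthesis converts light energy into chemical energy stored in glucose."

    # Preview the server-side processing estimates before committing to a full TTS call.
    print(processor.get_processing_info(sample_text))

    # Generate audio; gr.Error is raised on empty input or on an API failure.
    (sample_rate, audio_data), visibility_update = processor.generate_audio(sample_text)
    print(f"Generated {audio_data.shape[0]} samples at {sample_rate} Hz")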