Spaces:

Edmon02
/

SpeechT5_hy

Runtime error

File size: 12,859 Bytes

"""
Optimized SpeechT5 Armenian TTS Application
==========================================

High-performance Gradio application with advanced optimization features.
"""

import gradio as gr
import numpy as np
import logging
import time
from typing import Tuple, Optional
import os
import sys

# Configure logging before anything else can emit a record.  Calling
# logging.error() on an unconfigured root logger installs a default handler,
# after which basicConfig() is a no-op: the format below would be ignored and
# INFO-level messages dropped whenever the first import attempt fails.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Add src to path for imports
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.join(current_dir, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

try:
    from src.pipeline import TTSPipeline
except ImportError as e:
    logger.error("Failed to import pipeline: %s", e)
    # Fallback import attempt; if this one fails too, the ImportError
    # propagates and the application does not start.
    sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
    from src.pipeline import TTSPipeline

# Global pipeline instance; stays None until initialize_pipeline() succeeds.
tts_pipeline: Optional[TTSPipeline] = None


def initialize_pipeline():
    """Create the global TTS pipeline and apply production optimizations.

    The pipeline is built and optimized in a local variable and only
    published to the module-level ``tts_pipeline`` once both steps have
    succeeded, so a failure never leaves a half-initialized pipeline visible
    to the request handlers.

    Returns:
        True if the pipeline was created and optimized, False otherwise.
        Failures are logged (with traceback) instead of being raised.
    """
    global tts_pipeline

    try:
        logger.info("Initializing TTS Pipeline...")
        pipeline = TTSPipeline(
            model_checkpoint="Edmon02/TTS_NB_2",
            max_chunk_length=200,  # Optimal for 5-20s clips
            crossfade_duration=0.1,
            use_mixed_precision=True
        )

        # Apply production optimizations
        pipeline.optimize_for_production()
    except Exception as e:
        # Startup boundary: report the full traceback so the cause of a
        # failed start can be diagnosed from the logs.
        logger.exception("Failed to initialize TTS pipeline: %s", e)
        return False

    tts_pipeline = pipeline
    logger.info("TTS Pipeline initialized successfully")
    return True


def predict(text: str, speaker: str, 
           enable_chunking: bool = True,
           apply_processing: bool = True) -> Tuple[int, np.ndarray]:
    """
    Synthesize speech for ``text`` and return it in Gradio's numpy audio form.

    This function never raises: on empty input, a missing pipeline, or any
    exception during synthesis it returns an empty 16 kHz int16 clip and
    logs the reason (with traceback for unexpected errors).

    Args:
        text: Input text to synthesize
        speaker: Speaker selection label, e.g. "BDL (male)"; everything
            before the first "(" is used as the speaker code
        enable_chunking: Whether to enable intelligent chunking
        apply_processing: Whether to apply audio post-processing

    Returns:
        Tuple of (sample_rate, audio_array)
    """
    start_time = time.time()

    # Single fallback value shared by every failure path below.
    silence = (16000, np.zeros(0, dtype=np.int16))

    try:
        # Validate inputs
        if not text or not text.strip():
            logger.warning("Empty text provided")
            return silence

        if tts_pipeline is None:
            logger.error("TTS pipeline not initialized")
            return silence

        # Extract speaker code from selection
        speaker_code = speaker.split("(")[0].strip()

        # Log request
        logger.info("Processing request: %d chars, speaker: %s",
                    len(text), speaker_code)

        # Synthesize speech
        sample_rate, audio = tts_pipeline.synthesize(
            text=text,
            speaker=speaker_code,
            enable_chunking=enable_chunking,
            apply_audio_processing=apply_processing
        )

        # Log performance.  The guards keep the statistics from raising
        # (empty audio, zero sample rate) and thereby discarding audio that
        # was synthesized successfully.
        total_time = time.time() - start_time
        has_duration = len(audio) > 0 and sample_rate > 0
        audio_duration = len(audio) / sample_rate if has_duration else 0
        rtf = total_time / audio_duration if audio_duration > 0 else float('inf')

        logger.info("Request completed in %.3fs (RTF: %.2f)", total_time, rtf)

        return sample_rate, audio

    except Exception as e:
        # Request boundary: keep the UI responsive, but record the traceback.
        logger.exception("Prediction failed: %s", e)
        return silence


def get_performance_info() -> str:
    """Return the pipeline's performance statistics as Markdown text.

    Returns:
        "Pipeline not initialized" when no pipeline exists, a bullet list of
        the collected statistics otherwise, or an "Error getting performance
        info: ..." message if the statistics cannot be read or formatted.
    """
    pipeline = tts_pipeline
    if pipeline is None:
        return "Pipeline not initialized"

    try:
        stats = pipeline.get_performance_stats()

        # Lines are built top to bottom so that a missing key or an
        # unformattable value surfaces for the first offending entry.
        report = ["**Performance Statistics:**"]
        report.append(
            f"- Total Inferences: {stats['pipeline_stats']['total_inferences']}"
        )
        report.append(
            f"- Average Processing Time: {stats['pipeline_stats']['avg_processing_time']:.3f}s"
        )
        report.append(
            f"- Translation Cache Size: {stats['text_processor_stats']['translation_cache_size']}"
        )
        report.append(
            f"- Model Inferences: {stats['model_stats']['total_inferences']}"
        )
        report.append(
            f"- Average Model Time: {stats['model_stats'].get('avg_inference_time', 0):.3f}s"
        )
        return "\n".join(report)

    except Exception as e:
        return f"Error getting performance info: {e}"


def health_check() -> str:
    """Summarize the pipeline's health report as a one-line status message.

    Returns:
        A status line prefixed with an emoji: operational, degraded, a system
        error (with the report's ``error`` text when present), or a failure
        notice if the health check itself raises.
    """
    pipeline = tts_pipeline
    if pipeline is None:
        return "❌ Pipeline not initialized"

    try:
        report = pipeline.health_check()
        state = report["status"]

        if state == "healthy":
            return "✅ All systems operational"
        if state == "degraded":
            return "⚠️ Some components have issues"

        # Any other status is treated as an error.
        return f"❌ System error: {report.get('error', 'Unknown error')}"

    except Exception as e:
        return f"❌ Health check failed: {e}"


# Application metadata
# TITLE is rendered by create_interface() as the top-level Markdown heading
# of the page; DESCRIPTION is rendered as a Markdown block directly below it.
TITLE = "🎤 SpeechT5 Armenian TTS - Optimized"

# Markdown source; headings, bold markers and bullet lists are part of the
# text shown to the user.
DESCRIPTION = """
# High-Performance Armenian Text-to-Speech

This is an **optimized version** of SpeechT5 for Armenian language synthesis, featuring:

### 🚀 **Performance Optimizations**
- **Intelligent Text Chunking**: Handles long texts by splitting them intelligently at sentence boundaries
- **Caching**: Translation and embedding caching for faster repeated requests
- **Mixed Precision**: GPU optimization with FP16 inference when available
- **Crossfading**: Smooth audio transitions between chunks for natural-sounding longer texts

### 🎯 **Advanced Features**
- **Smart Text Processing**: Automatic number-to-word conversion with Armenian translation
- **Audio Post-Processing**: Noise gating, normalization, and dynamic range optimization
- **Robust Error Handling**: Graceful fallbacks and comprehensive logging
- **Real-time Performance Monitoring**: Track processing times and system health

### 📝 **Usage Tips**
- **Short texts** (< 200 chars): Processed directly for maximum speed
- **Long texts**: Automatically chunked with overlap for seamless audio
- **Numbers**: Automatically converted to Armenian words
- **Performance**: Enable chunking for texts longer than a few sentences

### 🎵 **Audio Quality**
- Sample Rate: 16 kHz
- Optimized for natural prosody and clear pronunciation
- Cross-fade transitions for multi-chunk synthesis

The model was trained on short clips (5-20s) but uses advanced algorithms to handle longer texts effectively.
"""

# Rows shown in the "Example Texts" section. Each row is
# [text, speaker, enable_chunking, apply_processing], in the same order as the
# input components passed to gr.Examples in create_interface().
EXAMPLES = [
    # Short examples for quick testing
    ["Բարև ձեզ, ինչպե՞ս եք:", "BDL (male)", True, True],
    ["Այսօր գեղեցիկ օր է:", "BDL (male)", False, True],
    
    # Medium examples demonstrating chunking
    ["Հայաստանն ունի հարուստ պատմություն և մշակույթ: Երևանը մայրաքաղաքն է, որն ունի 2800 տարվա պատմություն:", "BDL (male)", True, True],
    
    # Long example with numbers
    ["Արարատ լեռը բարձրությունը 5165 մետր է: Այն Հայաստանի խորհրդանիշն է և գտնվում է Թուրքիայի տարածքում: Լեռան վրա ըստ Աստվածաշնչի՝ կանգնել է Նոյի տապանը 40 օրվա ջրհեղեղից հետո:", "BDL (male)", True, True],
    
    # Technical example
    ["Մեքենայի շարժիչը 150 ձիուժ է և 2.0 լիտր ծավալ ունի: Այն կարող է արագացնել 0-ից 100 կմ/ժ 8.5 վայրկյանում:", "BDL (male)", True, True],
]

# Custom CSS for better styling
# Passed to gr.Blocks(css=...) in create_interface().
# NOTE(review): apart from .gradio-container, the classes below
# (.performance-info, .health-status, .status-*) are not attached to any
# component in create_interface() (no elem_classes are set there) — confirm
# whether they are still needed.
CUSTOM_CSS = """
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
}

.performance-info {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 15px;
    border-radius: 10px;
    color: white;
    margin: 10px 0;
}

.health-status {
    padding: 10px;
    border-radius: 8px;
    margin: 10px 0;
    font-weight: bold;
}

.status-healthy { background-color: #d4edda; color: #155724; }
.status-warning { background-color: #fff3cd; color: #856404; }
.status-error { background-color: #f8d7da; color: #721c24; }
"""


def create_interface():
    """Create and configure the Gradio interface.

    Builds a ``gr.Blocks`` app consisting of:

    * an input column (text box, speaker selector, chunking and processing
      checkboxes, generate button),
    * a status column (health and performance text boxes plus a refresh
      button),
    * an audio output and a list of clickable examples.

    ``predict`` is wired to the generate button; ``health_check`` and
    ``get_performance_info`` fill the status boxes on refresh and on page
    load.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    
    with gr.Blocks(
        theme=gr.themes.Soft(),
        css=CUSTOM_CSS,
        title="SpeechT5 Armenian TTS"
    ) as interface:
        
        # Header
        gr.Markdown(f"# {TITLE}")
        gr.Markdown(DESCRIPTION)
        
        with gr.Row():
            # Left column (scale=2): everything the user edits before generating.
            with gr.Column(scale=2):
                # Main input controls
                text_input = gr.Textbox(
                    label="📝 Input Text (Armenian)",
                    placeholder="Մուտքագրեք ձեր տեքստը այստեղ...",
                    lines=3,
                    max_lines=10
                )
                
                with gr.Row():
                    # The label's text before "(" is what predict() uses as
                    # the speaker code.
                    speaker_input = gr.Radio(
                        label="🎭 Speaker",
                        choices=["BDL (male)"],
                        value="BDL (male)"
                    )
                
                with gr.Row():
                    chunking_checkbox = gr.Checkbox(
                        label="🧩 Enable Intelligent Chunking",
                        value=True,
                        info="Automatically split long texts for better quality"
                    )
                    processing_checkbox = gr.Checkbox(
                        label="🎚️ Apply Audio Processing",
                        value=True,
                        info="Apply noise gating, normalization, and crossfading"
                    )
                
                # Generate button
                generate_btn = gr.Button(
                    "🎤 Generate Speech",
                    variant="primary",
                    size="lg"
                )
            
            # Right column (scale=1): read-only status widgets.
            with gr.Column(scale=1):
                # System information panel
                gr.Markdown("### 📊 System Status")
                
                health_display = gr.Textbox(
                    label="Health Status",
                    value="Initializing...",
                    interactive=False,
                    max_lines=1
                )
                
                performance_display = gr.Textbox(
                    label="Performance Stats",
                    value="No data yet",
                    interactive=False,
                    max_lines=8
                )
                
                refresh_btn = gr.Button("🔄 Refresh Stats", size="sm")
        
        # Output
        audio_output = gr.Audio(
            label="🔊 Generated Speech",
            type="numpy",
            interactive=False
        )
        
        # Examples section
        gr.Markdown("### 💡 Example Texts")
        
        # Use simpler Examples component to avoid schema issues
        # The inputs order must match the column order of the EXAMPLES rows.
        # NOTE(review): with cache_examples=False, clicking an example
        # presumably only fills the inputs and does not call `fn` — confirm
        # against the Gradio version in use.
        examples = gr.Examples(
            examples=EXAMPLES,
            inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox],
            outputs=audio_output,
            fn=predict,
            cache_examples=False,  # Disable caching to avoid schema issues
            label="Click any example to try it:"
        )
        
        # Event handlers
        generate_btn.click(
            fn=predict,
            inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox],
            outputs=[audio_output],
            show_progress="full"
        )
        
        # The callback returns (health text, stats text), matching the order
        # of `outputs`.
        refresh_btn.click(
            fn=lambda: (health_check(), get_performance_info()),
            outputs=[health_display, performance_display],
            show_progress="minimal"
        )
        
        # Auto-refresh health status on load
        # Same callback as the refresh button; replaces the placeholder
        # values ("Initializing...", "No data yet") set above.
        interface.load(
            fn=lambda: (health_check(), get_performance_info()),
            outputs=[health_display, performance_display]
        )
    
    return interface


def main():
    """Run the application: set up the pipeline, build the UI and serve it.

    Exits the process with status 1 if the TTS pipeline cannot be
    initialized; otherwise launches the Gradio interface.
    """
    logger.info("Starting SpeechT5 Armenian TTS Application")

    # The UI cannot work without a pipeline, so bail out early.
    pipeline_ready = initialize_pipeline()
    if not pipeline_ready:
        logger.error("Failed to initialize TTS pipeline - exiting")
        sys.exit(1)

    # Launch settings
    launch_options = {
        "share": False,            # Disable share for HF Spaces
        "inbrowser": False,
        "show_error": True,
        "quiet": False,
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,       # Standard Gradio port
        "max_threads": 4,          # Limit concurrent requests
    }

    app = create_interface()
    app.launch(**launch_options)


if __name__ == "__main__":
    main()