# SpeechT5_hy / archive / app_final.py
# Edmon02's picture
# feat: Implement project organization plan and optimize TTS deployment
# 3f1840e
"""
Armenian TTS - HuggingFace Spaces Compatible
===========================================
Final version optimized for HF Spaces with Gradio 3.x compatibility.
"""
import gradio as gr
import numpy as np
import logging
import os
import sys
# Logging: configure the root logger once, then take this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared module state, written by initialize_tts():
#   pipeline_ready -- True only after initialization finished successfully
#   tts_pipeline   -- the pipeline object, or None while nothing is loaded
pipeline_ready = False
tts_pipeline = None
def initialize_tts():
    """Load the TTS pipeline into the module-level globals.

    Adds the ``src`` directory located next to this file to ``sys.path``,
    imports ``TTSPipeline`` from ``src.pipeline``, constructs it for the
    ``Edmon02/TTS_NB_2`` checkpoint and calls ``optimize_for_production()``
    on it.

    The globals ``tts_pipeline`` and ``pipeline_ready`` are only published
    once every step has succeeded. On any failure they are reset to
    ``None`` / ``False`` so that a half-initialized pipeline is never kept
    alive in the module state.

    Returns:
        bool: True when the pipeline is ready; False when initialization
        failed (the error is logged together with its traceback and is
        not re-raised).
    """
    global tts_pipeline, pipeline_ready

    try:
        # Setup path for imports
        current_dir = os.path.dirname(os.path.abspath(__file__))
        src_path = os.path.join(current_dir, 'src')
        if src_path not in sys.path:
            sys.path.insert(0, src_path)

        # Imported here rather than at module top so that a missing or
        # broken package is caught below instead of aborting the import
        # of this file.
        from src.pipeline import TTSPipeline

        logger.info("Initializing TTS pipeline...")
        pipeline = TTSPipeline(
            model_checkpoint="Edmon02/TTS_NB_2",
            max_chunk_length=200,
            crossfade_duration=0.1,
            use_mixed_precision=True,
        )

        # Apply optimizations
        pipeline.optimize_for_production()
    except Exception as e:
        # Top-level boundary: keep the app running, but record the full
        # traceback so the failure can actually be diagnosed from the logs.
        logger.exception("Failed to initialize TTS pipeline: %s", e)
        tts_pipeline = None
        pipeline_ready = False
        return False

    # Publish only a fully constructed and optimized pipeline.
    tts_pipeline = pipeline
    pipeline_ready = True
    logger.info("TTS pipeline ready!")
    return True
def synthesize_speech(text):
    """
    Turn Armenian text into audio, never raising to the caller.

    Args:
        text (str): Armenian text to synthesize

    Returns:
        tuple: (sample_rate, audio_array). One second of silence when the
        input is empty, blank or not a string; the output of
        create_fallback_audio() when the pipeline is not ready or the
        synthesis call raises.
    """
    # Anything other than a non-blank string produces silence.
    has_content = bool(text) and isinstance(text, str) and bool(text.strip())
    if not has_content:
        return create_silence(1.0)

    # No usable pipeline: answer with placeholder audio instead.
    if not (pipeline_ready and tts_pipeline is not None):
        logger.warning("Pipeline not ready, generating fallback audio")
        return create_fallback_audio(text)

    try:
        logger.info(f"Synthesizing: {text[:50]}...")

        # Fixed synthesis settings used for every request.
        request = {
            "text": text.strip(),
            "speaker": "BDL",
            "enable_chunking": True,
            "apply_audio_processing": True,
        }
        result = tts_pipeline.synthesize(**request)
        rate, waveform = result

        logger.info(f"Generated {len(waveform)} samples at {rate}Hz")
        return rate, waveform
    except Exception as err:
        # Any synthesis failure degrades to the placeholder audio.
        logger.error(f"Synthesis error: {err}")
        return create_fallback_audio(text)
def create_silence(duration_seconds=1.0):
    """Return ``(sample_rate, samples)`` for an all-zero int16 clip at 16 kHz."""
    rate_hz = 16000
    # Fractional sample counts are truncated by int().
    n_samples = int(duration_seconds * rate_hz)
    silent_clip = np.zeros((n_samples,), dtype=np.int16)
    return rate_hz, silent_clip
def create_fallback_audio(text):
    """Create a simple placeholder tone whose length follows the text length.

    The clip lasts 0.1 s per character of ``text``, clamped to the range
    0.5 s .. 5.0 s, at a 16 kHz sample rate. It is a 440 Hz sine at 0.3
    amplitude; for texts longer than 20 characters a 660 Hz sine at 0.2
    amplitude is added. A linear fade in/out (a tenth of the clip, at most
    1000 samples) is applied at both ends.

    Args:
        text (str): Text the placeholder stands in for; only its length
            is used.

    Returns:
        tuple: (sample_rate, audio_array) where audio_array is a 1-D
        int16 NumPy array.
    """
    sample_rate = 16000
    base_freq = 440  # A4

    # Calculate duration based on text length
    duration = min(max(len(text) * 0.1, 0.5), 5.0)
    samples = int(duration * sample_rate)

    # Sample instants spaced exactly 1/sample_rate apart. An
    # endpoint-inclusive linspace(0, duration, samples) would space them
    # duration/(samples - 1) apart, so the tone would play slightly sharp.
    t = np.arange(samples) / sample_rate

    # Generate simple tone
    audio = np.sin(2 * np.pi * base_freq * t) * 0.3

    # Add some variation for longer texts (peak stays at 0.5, no clipping)
    if len(text) > 20:
        audio += np.sin(2 * np.pi * (base_freq * 1.5) * t) * 0.2

    # Apply fade in/out to avoid clicks at the clip boundaries
    fade_samples = min(samples // 10, 1000)
    if fade_samples > 0:
        audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
        audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

    # Convert to int16
    return sample_rate, (audio * 32767).astype(np.int16)
# Start-up: load the pipeline once at import time, then pick the banner text.
logger.info("Starting Armenian TTS application...")
init_success = initialize_tts()

# Description shown when the pipeline initialized successfully.
_READY_DESCRIPTION = """
🎤 **Armenian Text-to-Speech System**
Convert Armenian text to natural speech using SpeechT5.
**How to use:**
1. Enter Armenian text in the box below
2. Click Submit to generate speech
3. Play the generated audio
**Tips for best results:**
- Use standard Armenian script (Unicode)
- Include punctuation for natural pauses
- Shorter sentences work better for quality
"""

# Description shown when initialization failed and only placeholder tones
# can be produced.
_TEST_MODE_DESCRIPTION = """
🎤 **Armenian Text-to-Speech System - Test Mode**
The TTS system is running in test mode with limited functionality.
Text input will generate simple audio tones as placeholders.
"""

app_status = (
    "🟢 TTS System Ready"
    if init_success
    else "🟡 Test Mode (Limited Functionality)"
)
app_description = _READY_DESCRIPTION if init_success else _TEST_MODE_DESCRIPTION
# Build the Gradio interface using Gradio 3.x syntax.
# Input: a three-line text box with an Armenian placeholder prompt.
_text_input = gr.inputs.Textbox(
    lines=3,
    placeholder="Մուտքագրեք ձեր հայերեն տեքստը այստեղ...",
    label="Armenian Text Input",
)

# Output: an audio player fed by the value synthesize_speech returns.
_audio_output = gr.outputs.Audio(label="Generated Speech")

# Sample sentences offered as ready-made examples in the UI.
_example_sentences = [
    "Բարև ձեզ, ինչպե՞ս եք:",
    "Այսօր գեղեցիկ օր է:",
    "Շնորհակալություն:",
    "Հայաստան իմ սիրելի երկիրն է:",
    "Երևանը Հայաստանի մայրաքաղաքն է:",
]

demo = gr.Interface(
    fn=synthesize_speech,
    inputs=_text_input,
    outputs=_audio_output,
    title=f"🇦🇲 Armenian Text-to-Speech {app_status}",
    description=app_description,
    examples=_example_sentences,
    theme="default",
    allow_screenshot=False,
    allow_flagging="never",
)
# Script entry point: serve the interface only when run directly.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # bind on all network interfaces
        "server_port": 7860,
        "share": False,
        "debug": False,
        "quiet": False,
    }
    demo.launch(**launch_options)