# SpeechT5_hy / app.py  (Hugging Face Space by Edmon02)
# Commit 3f1840e: "feat: Implement project organization plan and optimize TTS deployment"
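"""
Armenian TTS - Minimal HF Spaces Version
========================================
Minimal Gradio front end for the Armenian SpeechT5 text-to-speech pipeline,
kept deliberately small to avoid compatibility issues. If the pipeline cannot
be initialized, the app still starts in a test mode that returns a simple
tone instead of synthesized speech.
"""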
import gradio as gr
import numpy as np
import logging
import os
import sys
# Module-level logger shared by every function in this file. basicConfig sets
# the root logger up at INFO level so the start-up and per-request messages
# emitted below are shown.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def setup_pipeline():
    """Build the TTS pipeline, degrading gracefully when it cannot be loaded.

    The ``src`` directory next to this file is placed at the front of
    ``sys.path``, ``TTSPipeline`` is imported from ``src.pipeline``, an
    instance is created for the ``Edmon02/TTS_NB_2`` checkpoint and
    ``optimize_for_production()`` is called on it.

    Returns:
        tuple: ``(pipeline, True)`` on success, or ``(None, False)`` if the
        import, construction or optimization step raised. The failure is
        logged together with its traceback.
    """
    try:
        here = os.path.dirname(os.path.abspath(__file__))
        src_dir = os.path.join(here, "src")
        # NOTE(review): presumably the modules under src/ import each other
        # by bare name, which is why src/ itself goes on sys.path - confirm.
        if src_dir not in sys.path:
            sys.path.insert(0, src_dir)

        # Imported lazily so that a missing or broken dependency only
        # disables real synthesis instead of crashing the app at import time.
        from src.pipeline import TTSPipeline

        pipeline = TTSPipeline(
            model_checkpoint="Edmon02/TTS_NB_2",
            max_chunk_length=200,
            use_mixed_precision=True,
        )
        pipeline.optimize_for_production()
    except Exception as exc:
        # Deliberate catch-all at the start-up boundary: any failure puts the
        # app into fallback mode. logger.exception keeps the traceback, which
        # a plain error message would lose.
        logger.exception("Pipeline initialization failed: %s", exc)
        return None, False

    logger.info("TTS pipeline initialized successfully")
    return pipeline, True
# Sample rate (Hz) of every clip this module generates itself (silence and
# the placeholder tone); real synthesis reports its own rate.
_FALLBACK_SAMPLE_RATE = 16000


def _silence(seconds):
    """Return ``(sample_rate, samples)`` holding *seconds* of int16 silence."""
    count = int(seconds * _FALLBACK_SAMPLE_RATE)
    return _FALLBACK_SAMPLE_RATE, np.zeros(count, dtype=np.int16)


def _fallback_tone(text):
    """Return a short decaying 440 Hz tone whose length scales with *text*.

    *text* must be non-empty. The clip lasts 0.08 s per character, capped at
    4 s, and is returned as ``(sample_rate, int16 samples)``.
    """
    duration = min(len(text) * 0.08, 4.0)
    count = int(duration * _FALLBACK_SAMPLE_RATE)
    # Sample instants are exactly k / sample_rate, so the tone plays back at
    # its nominal pitch (an endpoint-inclusive linspace stretches the axis).
    t = np.arange(count) / _FALLBACK_SAMPLE_RATE

    # A4 fundamental plus two quieter harmonics; the gains sum to 0.35, so
    # the int16 conversion below cannot overflow.
    tone = np.zeros(count)
    for multiple, gain in ((1, 0.2), (2, 0.1), (3, 0.05)):
        tone += gain * np.sin(2 * np.pi * 440 * multiple * t)

    tone *= np.exp(-2 * t)  # exponential decay envelope
    return _FALLBACK_SAMPLE_RATE, (tone * 32767).astype(np.int16)


def tts_process(text):
    """Turn *text* into audio for the Gradio interface; never raises on bad input.

    Reads the module globals ``tts_pipeline`` and ``pipeline_available``.

    Args:
        text: The text to speak. Anything that is not a non-blank string is
            treated as empty input.

    Returns:
        tuple: ``(sample_rate, samples)``.
        * Empty or invalid input: one second of silence at 16 kHz.
        * Pipeline unavailable: a short placeholder tone at 16 kHz.
        * Otherwise: whatever ``tts_pipeline.synthesize`` returns, or half a
          second of silence at 16 kHz if synthesis raises.
    """
    if not isinstance(text, str) or not text.strip():
        return _silence(1.0)
    text = text.strip()

    if not pipeline_available or tts_pipeline is None:
        logger.info("Using fallback for text: %s...", text[:30])
        return _fallback_tone(text)

    try:
        logger.info("Synthesizing: %s...", text[:50])
        sample_rate, audio = tts_pipeline.synthesize(
            text=text,
            speaker="BDL",
            enable_chunking=True,
            apply_audio_processing=True,
        )
        logger.info("Successfully generated %d samples", len(audio))
        return sample_rate, audio
    except Exception as exc:
        # Best-effort boundary for the UI: report the failure with its
        # traceback and hand back silence rather than an error.
        logger.exception("TTS synthesis failed: %s", exc)
        return _silence(0.5)
# One-time start-up: build the pipeline at import time and keep the result in
# the module globals that tts_process reads.
logger.info("Initializing Armenian TTS application...")
tts_pipeline, pipeline_available = setup_pipeline()

# The UI text reflects whether real synthesis or the tone fallback is active.
title, description = (
    (
        "🇦🇲 Armenian Text-to-Speech (Ready)",
        "Convert Armenian text to speech using SpeechT5.",
    )
    if pipeline_available
    else (
        "🇦🇲 Armenian TTS (Test Mode)",
        "TTS system in test mode - will generate simple audio tones.",
    )
)
# Bare text-in / audio-out Gradio interface around tts_process, labelled with
# the title and description chosen at start-up and pre-filled with three
# Armenian example phrases.
app = gr.Interface(
    title=title,
    description=description,
    fn=tts_process,
    inputs="text",
    outputs="audio",
    examples=["Բարև ձեզ", "Շնորհակալություն", "Ինչպե՞ս եք"],
)
# Start the web server only when run as a script: listen on all interfaces,
# port 7860, without creating a public share link.
if __name__ == "__main__":
    app.launch(share=False, server_name="0.0.0.0", server_port=7860)