import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import re
from pydub import AudioSegment # Required for audio duration, needs ffmpeg installed

# Get all available voices
async def get_voices():
    """
    Query the Edge TTS service for its voice catalogue.

    Returns a dict whose keys are human-readable dropdown labels of the form
    "<ShortName> - <Locale> (<Gender>)" and whose values are the matching
    voice ShortName.
    """
    catalogue = await edge_tts.list_voices()

    labelled = {}
    for entry in catalogue:
        short_name = entry['ShortName']
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = short_name
    return labelled

# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """
    Converts text to speech using Edge TTS and saves it to a temporary MP3 file.

    Args:
        text: The text to synthesize. Empty, whitespace-only or None input is rejected.
        voice: The dropdown label of the chosen voice ("<ShortName> - ..."); only the
            part before the first " - " is sent to Edge TTS.
        rate: Speech rate adjustment in percent (int, or a whole-valued float).
        pitch: Pitch adjustment in Hz (int, or a whole-valued float).

    Returns:
        A 3-tuple ``(audio_path, text, warning)``. On success ``audio_path`` is the
        path of the generated MP3, ``text`` is the original input (used later for
        SRT generation) and ``warning`` is None. On invalid input the first two
        items are None and the third is the result of ``gr.Warning(...)``.

    Raises:
        Whatever ``edge_tts.Communicate.save`` raises; the temporary file is removed
        before the exception propagates.
    """
    # Guard against None as well as blank strings so validation never raises.
    if not text or not text.strip():
        return None, None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, None, gr.Warning("Please select a voice.")

    # Extract the short name from the selected voice string
    voice_short_name = voice.split(" - ")[0]

    # Format rate and pitch for the Edge TTS API. The "+d" format spec rejects
    # floats, so coerce slider values (which may arrive as e.g. 10.0) to int first.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"

    # Initialize the Edge TTS communicator
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Reserve a unique path, then close our handle before the audio is written to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except Exception:
        # delete=False means nobody else will clean this up; do not leave an
        # empty or partial file behind when synthesis fails.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    return tmp_path, text, None  # Return audio path, original text, and no warning

def format_time(ms):
    """
    Render a millisecond count as an SRT timestamp (HH:MM:SS,mmm).

    Each field is truncated toward zero with int(); any fractional part of
    the millisecond remainder is dropped.
    """
    remainder = ms
    fields = []
    # Peel off hours, minutes and seconds in turn, largest unit first.
    for unit_ms in (3_600_000, 60_000, 1_000):
        fields.append(int(remainder / unit_ms))
        remainder %= unit_ms

    hours, minutes, seconds = fields
    milliseconds = int(remainder % 1_000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"

def generate_srt(text_input, audio_filepath):
    """
    Write an approximate SRT subtitle file for the given text and audio.

    The text is cut into segments at '.', '!', '?', ',' and newlines. The audio's
    total duration (read via pydub, which needs ffmpeg) is then shared out between
    the segments in proportion to their character counts; the final segment is
    pinned to end exactly at the audio duration.

    Returns the path of the written .srt file (the audio path with its extension
    replaced), or None when either argument is empty, the audio duration cannot
    be read, or the text yields no segments.

    Note: timings are estimates only; no analysis of pauses in the audio is done.
    """
    if not (text_input and audio_filepath):
        return None

    try:
        # pydub reports the length of an AudioSegment in milliseconds
        audio_duration_ms = len(AudioSegment.from_file(audio_filepath))
    except Exception as e:
        print(f"Error getting audio duration with pydub: {e}. SRT generation requires ffmpeg.")
        # Without a duration there is nothing to distribute, so give up on the SRT
        return None

    # Each match is a run of non-delimiter characters plus any delimiters that follow it.
    pieces = []
    for raw in re.findall(r'[^.!?,\n]+[.!?,\n]*', text_input):
        cleaned = raw.strip()
        if cleaned:
            pieces.append(cleaned)

    if not pieces:
        return None

    char_total = sum(map(len, pieces))
    if char_total == 0:  # Defensive: avoids dividing by zero below
        return None

    final_index = len(pieces) - 1
    lines = []
    cursor_ms = 0
    for index, piece in enumerate(pieces):
        if index == final_index:
            # Last cue always closes at the true end of the audio
            stop_ms = audio_duration_ms
        else:
            # Share of the audio proportional to this piece's share of the characters
            stop_ms = cursor_ms + (len(piece) / char_total) * audio_duration_ms

        lines.extend((
            str(index + 1),
            f"{format_time(cursor_ms)} --> {format_time(stop_ms)}",
            piece,
            "",  # Blank line terminates the cue
        ))
        cursor_ms = stop_ms

    # The SRT file sits beside the audio file, sharing its base name
    base_path, _ = os.path.splitext(audio_filepath)
    srt_filename = f"{base_path}.srt"
    with open(srt_filename, "w", encoding="utf-8") as srt_file:
        srt_file.write("\n".join(lines))

    return srt_filename

# Gradio interface function (wraps async functions and handles SRT generation)
def tts_interface(text, voice, rate, pitch):
    """
    Synchronous entry point used by Gradio.

    Runs the async text_to_speech coroutine to completion, then, if an audio
    file was produced, derives an SRT file for it via generate_srt.

    Returns a tuple of (audio path, SRT path, warning); the SRT path is None
    whenever no audio was produced.
    """
    result = asyncio.run(text_to_speech(text, voice, rate, pitch))
    audio_path, original_text, warning = result

    # Subtitles only make sense when there is audio to time them against
    srt_path = generate_srt(original_text, audio_path) if audio_path else None

    return audio_path, srt_path, warning

# Create Gradio application
async def create_demo():
    """
    Build the Gradio Interface for the TTS + SRT application.

    Awaits get_voices() to populate the voice dropdown, then wires
    tts_interface to four inputs (text, voice, rate, pitch) and three outputs
    (audio, SRT file, warning markdown). Returns the constructed gr.Interface;
    it is not launched here.
    """
    voice_map = await get_voices()  # Voice labels are fetched once, at build time

    description = """
    Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.
    
    ✨ **New Feature: Generate SRT Subtitles!** ✨
    
    Automatically generates an SRT (SubRip Subtitle) file from your input text,
    with timings estimated based on sentence segmentation and overall audio duration.
    **Note:** This feature provides approximate timings and does not perform
    advanced audio waveform analysis for precise pause detection.
    
    🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥
    
    Take your content creation to the next level with our cutting-edge Text-to-Video Converter! 
    Transform your words into stunning, professional-quality videos in just a few clicks. 
    
    ✨ Features:
    • Convert text to engaging videos with customizable visuals
    • Choose from 40+ languages and 300+ voices
    • Perfect for creating audiobooks, storytelling, and language learning materials
    • Ideal for educators, content creators, and language enthusiasts
    
    Ready to revolutionize your content? [Click here to try our Text-to-Video Converter now!](https://text2video.wingetgui.com/)
    """

    # Input widgets, in the positional order tts_interface expects them
    text_box = gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here to convert to speech and generate SRT...")
    voice_picker = gr.Dropdown(choices=["", *voice_map], label="Select Voice", value="", type="value")
    rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1)
    pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)

    # Output widgets, matching the (audio, srt, warning) tuple tts_interface returns
    audio_out = gr.Audio(label="Generated Audio", type="filepath")
    srt_out = gr.File(label="Generated SRT Subtitle", type="filepath", file_count="single", visible=True)
    warning_out = gr.Markdown(label="Warning", visible=False)

    return gr.Interface(
        fn=tts_interface,
        inputs=[text_box, voice_picker, rate_slider, pitch_slider],
        outputs=[audio_out, srt_out, warning_out],
        title="Edge TTS Text-to-Speech with SRT Generator",
        description=description,
        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
        analytics_enabled=False,
        allow_flagging=False
    )

# Run the application
if __name__ == "__main__":
    # create_demo() is a coroutine (it awaits the voice list), so it has to be
    # driven to completion to obtain the Interface object before using it.
    demo = asyncio.run(create_demo())
    demo.queue()
    # Launch only when executed as a script, never as a side effect of import.
    demo.launch()