"""Gradio app: Edge TTS text-to-speech with an estimated SRT subtitle file."""

import asyncio
import os
import re
import tempfile

import edge_tts
import gradio as gr
from pydub import AudioSegment  # Required for audio duration, needs ffmpeg installed

# Splits text on '.', '!', '?', ',' and newlines, keeping any trailing
# punctuation attached to the segment it terminates.
_SEGMENT_PATTERN = re.compile(r'[^.!?,\n]+[.!?,\n]*')


# Get all available voices
async def get_voices():
    """Fetch all available voices from the Edge TTS service.

    Returns:
        A dict mapping a display label ("ShortName - Locale (Gender)") to the
        voice's ShortName.
    """
    voices = await edge_tts.list_voices()
    # Format voice names for display in the dropdown
    return {
        f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName']
        for v in voices
    }


# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """Convert text to speech using Edge TTS and save it to a temporary file.

    Args:
        text: The text to synthesize.
        voice: The dropdown label; the part before the first " - " is used as
            the Edge TTS voice short name.
        rate: Speech rate adjustment in percent (coerced to int).
        pitch: Pitch adjustment in Hz (coerced to int).

    Returns:
        A 3-tuple ``(audio_path, text, warning)``. On invalid input the first
        two items are None and the third is the result of ``gr.Warning(...)``;
        on success the third item is None.
    """
    # Guard against None as well as blank input.
    if not text or not text.strip():
        return None, None, gr.Warning("Please enter text to convert.")
    if not voice:
        return None, None, gr.Warning("Please select a voice.")

    # Extract the short name from the selected voice string
    voice_short_name = voice.split(" - ")[0]

    # Format rate and pitch for the Edge TTS API. The ":+d" format spec only
    # accepts integers, so coerce in case the slider delivers a float.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"

    # Initialize the Edge TTS communicator
    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    # Reserve a temporary file name; the handle is closed before the audio is
    # written so the file is not held open twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    return tmp_path, text, None  # Return audio path, original text, and no warning


def format_time(ms):
    """Format milliseconds into SRT time format (HH:MM:SS,mmm).

    Fractional milliseconds are truncated; negative values are clamped to 0.
    """
    total_ms = max(0, int(ms))
    hours, total_ms = divmod(total_ms, 3_600_000)
    minutes, total_ms = divmod(total_ms, 60_000)
    seconds, milliseconds = divmod(total_ms, 1_000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def generate_srt(text_input, audio_filepath):
    """Generate a basic SRT file with timings estimated from audio duration.

    The text is split into segments and each segment receives a share of the
    total audio duration proportional to its character count.

    Note: This does not use advanced audio analysis for precise timing of
    pauses. It's an estimation based on character count per segment.
    Requires ffmpeg installed for pydub to read audio duration.

    Args:
        text_input: The text that was synthesized.
        audio_filepath: Path to the generated audio file.

    Returns:
        The path of the written ``.srt`` file (same base name as the audio
        file), or None when inputs are missing, the audio duration cannot be
        read, or the text yields no segments.
    """
    if not text_input or not audio_filepath:
        return None

    try:
        # Load audio to get its total duration using pydub
        audio = AudioSegment.from_file(audio_filepath)
        audio_duration_ms = len(audio)
    except Exception as e:
        # Best-effort: if pydub fails (e.g., ffmpeg not found), skip the SRT
        # rather than failing the whole request.
        print(f"Error getting audio duration with pydub: {e}. SRT generation requires ffmpeg.")
        return None

    # Split text into segments on sentence punctuation, commas and newlines,
    # then drop segments that are empty after stripping.
    segments = [s.strip() for s in _SEGMENT_PATTERN.findall(text_input)]
    segments = [s for s in segments if s]
    if not segments:
        return None

    total_chars = sum(len(s) for s in segments)
    if total_chars == 0:
        # Prevent division by zero if text is somehow empty after stripping
        return None

    srt_content = []
    current_time_ms = 0
    last_index = len(segments) - 1

    for i, segment in enumerate(segments):
        # Estimate duration for the segment based on its character count.
        # This assumes a roughly constant speech rate throughout the audio.
        estimated_segment_duration_ms = (len(segment) / total_chars) * audio_duration_ms

        start_time = current_time_ms
        end_time = current_time_ms + estimated_segment_duration_ms

        # Ensure the last segment's end time matches the total audio duration
        if i == last_index:
            end_time = audio_duration_ms

        # Add SRT entry
        srt_content.append(str(i + 1))
        srt_content.append(f"{format_time(start_time)} --> {format_time(end_time)}")
        srt_content.append(segment)
        srt_content.append("")  # Empty line separates SRT blocks

        current_time_ms = end_time

    # Save the SRT content next to the audio file, with an .srt extension
    srt_filename = f"{os.path.splitext(audio_filepath)[0]}.srt"
    with open(srt_filename, "w", encoding="utf-8") as f:
        f.write("\n".join(srt_content))

    return srt_filename


# Gradio interface function (wraps async functions and handles SRT generation)
def tts_interface(text, voice, rate, pitch):
    """Main interface function for Gradio.

    Calls text_to_speech and then, if audio was produced, generate_srt.

    Returns:
        A 3-tuple ``(audio_path, srt_path, warning)`` matching the three
        output components of the interface.
    """
    # Run the async text_to_speech function
    audio_path, original_text, warning = asyncio.run(
        text_to_speech(text, voice, rate, pitch)
    )

    srt_path = None
    if audio_path:
        # Only attempt SRT generation if audio was successfully created
        srt_path = generate_srt(original_text, audio_path)

    # Return the generated audio, SRT file, and any warnings
    return audio_path, srt_path, warning


# Create Gradio application
async def create_demo():
    """Asynchronously create and configure the Gradio interface.

    Returns:
        The configured ``gr.Interface`` instance (not yet launched).
    """
    voices = await get_voices()  # Fetch voices when the app starts

    description = """
    Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.

    ✨ **New Feature: Generate SRT Subtitles!** ✨
    Automatically generates an SRT (SubRip Subtitle) file from your input text, with timings estimated based on sentence segmentation and overall audio duration.
    **Note:** This feature provides approximate timings and does not perform advanced audio waveform analysis for precise pause detection.

    🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥

    Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
    Transform your words into stunning, professional-quality videos in just a few clicks.

    ✨ Features:
    • Convert text to engaging videos with customizable visuals
    • Choose from 40+ languages and 300+ voices
    • Perfect for creating audiobooks, storytelling, and language learning materials
    • Ideal for educators, content creators, and language enthusiasts

    Ready to revolutionize your content? [Click here to try our Text-to-Video Converter now!](https://text2video.wingetgui.com/)
    """

    demo = gr.Interface(
        fn=tts_interface,  # The function that processes inputs and returns outputs
        inputs=[
            gr.Textbox(
                label="Input Text",
                lines=5,
                placeholder="Enter your text here to convert to speech and generate SRT...",
            ),
            gr.Dropdown(
                choices=[""] + list(voices.keys()),
                label="Select Voice",
                value="",
                type="value",
            ),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            # Output for the SRT file
            gr.File(label="Generated SRT Subtitle", type="filepath", file_count="single", visible=True),
            gr.Markdown(label="Warning", visible=False),  # For displaying warnings
        ],
        title="Edge TTS Text-to-Speech with SRT Generator",
        description=description,
        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
        analytics_enabled=False,
        # Gradio expects the string form ("never"/"auto"/"manual") here; the
        # boolean form is deprecated.
        allow_flagging="never",
    )
    return demo


# Run the application
if __name__ == "__main__":
    # create_demo is a coroutine function: it must be run to obtain the
    # interface before it can be queued and launched.
    demo = asyncio.run(create_demo())
    demo.queue()
    demo.launch()