|
import gradio as gr |
|
import edge_tts |
|
import asyncio |
|
import tempfile |
|
import os |
|
import math |
|
from pydub import AudioSegment |
|
import subprocess |
|
|
|
|
|
async def get_voices(): |
|
voices = await edge_tts.list_voices() |
|
return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices} |
|
|
|
|
|
async def text_to_speech(text, voice, rate, pitch): |
|
if not text.strip() or not voice: |
|
return (None, "Please enter text and select a voice") if not text else (None, "Please select a voice") |
|
|
|
try: |
|
communicate = edge_tts.Communicate( |
|
text, |
|
voice.split(" - ")[0], |
|
rate=f"{rate:+d}%", |
|
pitch=f"{pitch:+d}Hz" |
|
) |
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file: |
|
await communicate.save(tmp_file.name) |
|
return tmp_file.name, None |
|
except Exception as e: |
|
return None, f"Speech generation failed: {str(e)}" |
|
|
|
|
|
def add_background_music(speech_path, bg_music_path): |
|
speech = AudioSegment.from_file(speech_path) |
|
background = AudioSegment.from_file(bg_music_path) - 16 |
|
|
|
if len(background) < len(speech) + 3000: |
|
background = background * math.ceil((len(speech)+3000)/len(background)) |
|
|
|
combined = speech.overlay(background[:len(speech)]) |
|
fade_out = background[len(speech):len(speech)+3000].fade_out(3000) |
|
final_audio = combined + fade_out |
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file: |
|
final_audio.export(tmp_file.name, format="mp3") |
|
|
|
if os.path.exists(speech_path): |
|
os.remove(speech_path) |
|
return tmp_file.name |
|
|
|
|
|
def process_videos(audio_path, video_files): |
|
temp_files = [] |
|
try: |
|
audio_duration = AudioSegment.from_file(audio_path).duration_seconds |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as concat_video: |
|
temp_files.append(concat_video.name) |
|
|
|
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as list_file: |
|
temp_files.append(list_file.name) |
|
list_file.write("\n".join([f"file '{v.name}'" for v in video_files])) |
|
list_file.close() |
|
|
|
subprocess.run([ |
|
"ffmpeg", "-y", |
|
"-f", "concat", |
|
"-safe", "0", |
|
"-i", list_file.name, |
|
"-c", "copy", |
|
concat_video.name |
|
], check=True) |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as final_video: |
|
subprocess.run([ |
|
"ffmpeg", "-y", |
|
"-stream_loop", "-1", |
|
"-i", concat_video.name, |
|
"-i", audio_path, |
|
"-t", str(audio_duration + 3), |
|
"-c:v", "libx264", |
|
"-c:a", "aac", |
|
"-vf", "fade=t=out:st={}:d=3".format(audio_duration), |
|
"-af", "afade=t=out:st={}:d=3".format(audio_duration), |
|
"-shortest", |
|
final_video.name |
|
], check=True) |
|
|
|
return final_video.name |
|
|
|
finally: |
|
|
|
for f in temp_files: |
|
if os.path.exists(f): |
|
os.remove(f) |
|
|
|
|
|
async def tts_interface(text, voice, rate, pitch, bg_music, video_files): |
|
temp_audio = None |
|
try: |
|
|
|
temp_audio, warning = await text_to_speech(text, voice, rate, pitch) |
|
if warning: |
|
return None, None, gr.Warning(warning) |
|
|
|
|
|
if bg_music: |
|
temp_audio = add_background_music(temp_audio, bg_music) |
|
|
|
|
|
video_path = None |
|
if video_files: |
|
video_path = process_videos(temp_audio, video_files) |
|
|
|
for video in video_files: |
|
if hasattr(video, 'name') and os.path.exists(video.name): |
|
os.remove(video.name) |
|
|
|
return temp_audio, video_path, None |
|
|
|
except Exception as e: |
|
return None, None, gr.Warning(f"Processing error: {str(e)}") |
|
finally: |
|
|
|
if temp_audio and os.path.exists(temp_audio): |
|
try: |
|
if video_path and temp_audio != video_path: |
|
os.remove(temp_audio) |
|
except: |
|
pass |
|
|
|
|
|
async def create_demo(): |
|
voices = await get_voices() |
|
|
|
demo = gr.Interface( |
|
fn=tts_interface, |
|
inputs=[ |
|
gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here..."), |
|
gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice"), |
|
gr.Slider(-50, 50, 0, label="Speech Rate (%)"), |
|
gr.Slider(-20, 20, 0, label="Pitch (Hz)"), |
|
gr.Audio(label="Background Music", type="filepath"), |
|
gr.File(label="Upload Videos", file_types=[".mp4", ".mov"], file_count="multiple") |
|
], |
|
outputs=[ |
|
gr.Audio(label="Generated Audio", type="filepath"), |
|
gr.Video(label="Final Video"), |
|
gr.Markdown(visible=False) |
|
], |
|
title="Multi-Video TTS con Bucle", |
|
description=""" |
|
Este script permite crear videos personalizados combinando texto, audio y múltiples clips de video. |
|
Convierte texto en voz usando tecnología avanzada de síntesis de voz (Text-to-Speech), |
|
opcionalmente añade música de fondo para enriquecer el audio generado y procesa varios videos subidos por el usuario |
|
para reproducirlos en secuencia y en bucle infinito. |
|
El resultado final es un video que sincroniza el audio con la concatenación de los clips, |
|
asegurando una transición suave entre ellos y un fade-out al final de cada ciclo. Además, el script está diseñado para |
|
limpiar automáticamente los archivos temporales y los videos originales subidos, evitando acumulación innecesaria en el servidor. |
|
Es ideal para generar contenido dinámico como videos motivacionales, presentaciones automáticas o material promocional. |
|
""", |
|
css="#component-0 {max-width: 800px}" |
|
) |
|
return demo |
|
|
|
async def main(): |
|
demo = await create_demo() |
|
demo.queue() |
|
demo.launch() |
|
|
|
if __name__ == "__main__": |
|
asyncio.run(main()) |