# gnosticdev's picture
# Update app.py
# a241f1d verified
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import math
from pydub import AudioSegment
import subprocess
# Fetch the available voices
async def get_voices():
    """Return a mapping from a display label to the voice's short name.

    The voices come from ``edge_tts.list_voices()``. Each label has the form
    ``"<ShortName> - <Locale> (<Gender>)"`` and maps to that voice's
    ``ShortName``.
    """
    labelled = {}
    for entry in await edge_tts.list_voices():
        short_name = entry['ShortName']
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = short_name
    return labelled
# Text-to-speech conversion
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* with edge-tts and save it to a temporary MP3 file.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text is
            rejected.
        voice: Voice label; only the part before the first ``" - "`` is
            passed to edge-tts.
        rate: Speech-rate offset, sent as a signed percentage (e.g. ``+10%``).
        pitch: Pitch offset, sent as a signed Hz value (e.g. ``-5Hz``).

    Returns:
        ``(path, None)`` on success, where *path* is the generated MP3 file
        (the caller owns and must delete it), or ``(None, message)`` when
        the input is invalid or synthesis fails.
    """
    has_text = bool(text and text.strip())
    if not has_text or not voice:
        # Report exactly what is missing instead of guessing from `text` alone.
        if not has_text and not voice:
            return None, "Please enter text and select a voice"
        if not has_text:
            return None, "Please enter text"
        return None, "Please select a voice"

    output_path = None
    try:
        communicate = edge_tts.Communicate(
            text,
            voice.split(" - ")[0],
            # Sliders may deliver floats; the "+d" format code only accepts ints.
            rate=f"{round(rate):+d}%",
            pitch=f"{round(pitch):+d}Hz",
        )
        # Reserve a file name, then close the handle before writing to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            output_path = tmp_file.name
        await communicate.save(output_path)
        return output_path, None
    except Exception as e:
        # Do not leave a half-written temp file behind on failure.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None, f"Speech generation failed: {str(e)}"
# Add background music (also deletes the original speech file)
def add_background_music(speech_path, bg_music_path):
    """Mix speech with quieter background music and a 3-second music tail.

    The background track is attenuated by 16 dB, looped if it is shorter than
    the speech plus the tail, overlaid on the speech, and then continues alone
    for 3 seconds while fading out. The result is exported to a new temporary
    MP3 file and the original speech file is deleted.

    Args:
        speech_path: Path of the speech audio file; removed on success.
        bg_music_path: Path of the background music file.

    Returns:
        Path of the mixed MP3 file (the caller owns and must delete it).

    Raises:
        ValueError: If the background track has zero length.
    """
    fade_ms = 3000
    speech = AudioSegment.from_file(speech_path)
    background = AudioSegment.from_file(bg_music_path) - 16  # -16 dB gain

    if len(background) == 0:
        # Looping an empty track would divide by zero below.
        raise ValueError("Background music file contains no audio")

    needed_ms = len(speech) + fade_ms
    if len(background) < needed_ms:
        background = background * math.ceil(needed_ms / len(background))

    combined = speech.overlay(background[:len(speech)])
    fade_out = background[len(speech):needed_ms].fade_out(fade_ms)
    final_audio = combined + fade_out

    # Reserve a file name, then close the handle before exporting to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        output_path = tmp_file.name
    try:
        final_audio.export(output_path, format="mp3")
    except Exception:
        # Do not leak the reserved temp file if the export fails.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise

    # Delete the original speech file; the mixed file replaces it.
    if os.path.exists(speech_path):
        os.remove(speech_path)
    return output_path
# Process multiple videos (temporary files are deleted afterwards)
def _concat_list_line(video):
    """Return one ``file '...'`` line for an ffmpeg concat list file."""
    path = str(getattr(video, "name", video))
    # Inside single quotes, a literal quote must be written as '\''.
    escaped = path.replace("'", "'\\''")
    return f"file '{escaped}'"


def process_videos(audio_path, video_files):
    """Concatenate the given videos, loop them, and mux them with the audio.

    The clips are first joined with ffmpeg's concat demuxer (stream copy).
    The joined clip is then looped, combined with *audio_path*, re-encoded
    with libx264/aac, and given 3-second video and audio fade-outs that start
    at the audio's duration.

    Args:
        audio_path: Path of the audio track.
        video_files: Iterable of video paths, or objects whose ``name``
            attribute is a path.

    Returns:
        Path of the final MP4 file (the caller owns and must delete it).

    Raises:
        subprocess.CalledProcessError: If an ffmpeg invocation fails.
    """
    temp_files = []
    try:
        audio_duration = AudioSegment.from_file(audio_path).duration_seconds

        # Concatenate the videos
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as concat_video:
            concat_path = concat_video.name
        temp_files.append(concat_path)

        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as list_file:
            list_path = list_file.name
            temp_files.append(list_path)
            list_file.write("\n".join(_concat_list_line(v) for v in video_files))

        # The list file is closed (and therefore flushed) before ffmpeg reads it.
        subprocess.run([
            "ffmpeg", "-y",
            "-f", "concat",
            "-safe", "0",
            "-i", list_path,
            "-c", "copy",
            concat_path
        ], check=True)

        # Create the final video
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as final_video:
            final_path = final_video.name
        try:
            # NOTE(review): with -shortest and a looped video input, the output
            # probably ends when the audio does, so fades that start at
            # audio_duration may never be visible - confirm the intent.
            subprocess.run([
                "ffmpeg", "-y",
                "-stream_loop", "-1",
                "-i", concat_path,
                "-i", audio_path,
                "-t", str(audio_duration + 3),
                "-c:v", "libx264",
                "-c:a", "aac",
                "-vf", "fade=t=out:st={}:d=3".format(audio_duration),
                "-af", "afade=t=out:st={}:d=3".format(audio_duration),
                "-shortest",
                final_path
            ], check=True)
        except Exception:
            # Do not leak the output file when encoding fails.
            try:
                os.remove(final_path)
            except OSError:
                pass
            raise
        return final_path
    finally:
        # Delete the intermediate temporary files
        for f in temp_files:
            if os.path.exists(f):
                os.remove(f)
# Main function (also deletes the uploaded source videos)
async def tts_interface(text, voice, rate, pitch, bg_music, video_files):
    """Run the full pipeline: speech, optional music, optional video.

    Args:
        text: Text to convert to speech.
        voice: Voice label selected in the dropdown.
        rate: Speech-rate offset in percent.
        pitch: Pitch offset in Hz.
        bg_music: Path of a background music file, or a falsy value for none.
        video_files: Uploaded videos, or a falsy value for none.

    Returns:
        A 3-tuple ``(audio_path, video_path, message)``. On success the
        message is ``None`` and ``video_path`` is ``None`` when no videos
        were given. On invalid input or failure both paths are ``None`` and
        the third item is the result of calling ``gr.Warning``.
    """
    temp_audio = None
    succeeded = False
    try:
        # Generate the main audio
        temp_audio, warning = await text_to_speech(text, voice, rate, pitch)
        if warning:
            return None, None, gr.Warning(warning)

        # Add background music
        if bg_music:
            temp_audio = add_background_music(temp_audio, bg_music)

        # Process videos
        video_path = None
        if video_files:
            video_path = process_videos(temp_audio, video_files)
            # Delete the uploaded source videos. This is best-effort: a
            # cleanup failure must not discard an already finished result.
            for video in video_files:
                if hasattr(video, 'name') and os.path.exists(video.name):
                    try:
                        os.remove(video.name)
                    except OSError:
                        pass

        succeeded = True
        return temp_audio, video_path, None
    except Exception as e:
        return None, None, gr.Warning(f"Processing error: {str(e)}")
    finally:
        # The audio file is part of the returned outputs, so it is only
        # removed when the pipeline failed and the file is not returned.
        if not succeeded and temp_audio:
            try:
                os.remove(temp_audio)
            except OSError:
                pass
# Build the interface (unchanged)
async def create_demo():
    """Build and return the Gradio interface wired to ``tts_interface``.

    The voice dropdown lists an empty entry followed by every label returned
    by ``get_voices()``.
    """
    voices = await get_voices()
    voice_choices = [""] + list(voices.keys())

    input_components = [
        gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here..."),
        gr.Dropdown(choices=voice_choices, label="Select Voice"),
        gr.Slider(-50, 50, 0, label="Speech Rate (%)"),
        gr.Slider(-20, 20, 0, label="Pitch (Hz)"),
        gr.Audio(label="Background Music", type="filepath"),
        gr.File(label="Upload Videos", file_types=[".mp4", ".mov"], file_count="multiple"),
    ]
    output_components = [
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Video(label="Final Video"),
        gr.Markdown(visible=False),
    ]

    # User-facing description; kept verbatim (Spanish) because it is shown at runtime.
    summary = """
Este script permite crear videos personalizados combinando texto, audio y múltiples clips de video.
Convierte texto en voz usando tecnología avanzada de síntesis de voz (Text-to-Speech),
opcionalmente añade música de fondo para enriquecer el audio generado y procesa varios videos subidos por el usuario
para reproducirlos en secuencia y en bucle infinito.
El resultado final es un video que sincroniza el audio con la concatenación de los clips,
asegurando una transición suave entre ellos y un fade-out al final de cada ciclo. Además, el script está diseñado para
limpiar automáticamente los archivos temporales y los videos originales subidos, evitando acumulación innecesaria en el servidor.
Es ideal para generar contenido dinámico como videos motivacionales, presentaciones automáticas o material promocional.
"""

    return gr.Interface(
        fn=tts_interface,
        inputs=input_components,
        outputs=output_components,
        title="Multi-Video TTS con Bucle",
        description=summary,
        css="#component-0 {max-width: 800px}",
    )
async def main():
    """Build the demo, call its ``queue()`` method, and launch it."""
    app = await create_demo()
    app.queue()
    app.launch()
# Script entry point: run the async main() to completion.
if __name__ == "__main__":
    asyncio.run(main())