insta-maker-2 / app.py
hivecorp's picture
Update app.py
8ca57cc verified
raw
history blame
4.88 kB
import gradio as gr
from pydub import AudioSegment
import edge_tts
import os
import wave
import asyncio
import srt
# Measure how long a WAV file plays
def get_audio_length(audio_path):
    """Return the playing time of the WAV file at ``audio_path`` in seconds.

    The duration is the file's frame count divided by its frame rate, both
    read from the WAV header via the standard ``wave`` module.
    """
    with wave.open(audio_path, 'rb') as wav_reader:
        params = wav_reader.getparams()
    return params.nframes / float(params.framerate)
# Generate SRT entries for a text batch
def get_audio_length_for_line(line, chars_per_second=12.0):
    """Estimate how long ``line`` takes to speak, in seconds.

    This is a character-count heuristic, not a measurement of real audio:
    the stripped length of the line is divided by ``chars_per_second``.
    The default rate is a rough guess and should be tuned for the voice and
    speaking rate in use. Blank lines (and a non-positive rate) yield 0.0.
    """
    spoken = line.strip()
    if not spoken or chars_per_second <= 0:
        return 0.0
    return len(spoken) / float(chars_per_second)


def generate_accurate_srt(text, start_time, batch_index):
    """Build one subtitle per non-blank line of ``text``.

    Args:
        text: The batch of script text; each line becomes one subtitle.
        start_time: Offset in seconds at which the first subtitle starts.
        batch_index: Index assigned to the first subtitle; it is incremented
            for every subtitle emitted.

    Returns:
        A tuple ``(srt_entries, end_time)`` where ``srt_entries`` is the list
        of ``srt.Subtitle`` objects and ``end_time`` is the offset in seconds
        at which the last subtitle ends (``start_time`` if nothing was
        emitted).
    """
    srt_entries = []
    current_time = start_time
    for line in text.splitlines():
        # Blank lines carry no spoken content; emitting them would only
        # produce empty, zero-length cues.
        if not line.strip():
            continue
        end_time = current_time + get_audio_length_for_line(line)
        srt_entries.append(
            srt.Subtitle(
                index=batch_index,
                start=srt.timedelta(seconds=current_time),
                end=srt.timedelta(seconds=end_time),
                content=line,
            )
        )
        current_time = end_time
        batch_index += 1
    return srt_entries, current_time
# Process batches and accumulate SRT entries aligned to the generated audio
def _split_script_into_batches(script_text, batch_size):
    """Cut ``script_text`` into consecutive chunks of at most ``batch_size`` characters.

    A chunk is ended at the last newline inside the window when there is one,
    otherwise at the last space or tab, so that words and subtitle lines are
    not cut in half. Only text with no such boundary is cut at the hard limit.
    Concatenating the returned chunks reproduces ``script_text``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    batches = []
    position = 0
    total = len(script_text)
    while position < total:
        end = min(position + batch_size, total)
        if end < total:
            window = script_text[position:end]
            cut = window.rfind("\n")
            if cut <= 0:
                cut = max(window.rfind(" "), window.rfind("\t"))
            if cut > 0:
                end = position + cut + 1
        batches.append(script_text[position:end])
        position = end
    return batches


def _fit_entries_to_duration(srt_entries, batch_start, estimated_end, batch_duration):
    """Rescale a batch's cues in place so they span the measured audio duration."""
    estimated_span = estimated_end - batch_start
    if not srt_entries or estimated_span <= 0 or batch_duration <= 0:
        return
    scale = batch_duration / estimated_span
    origin = srt.timedelta(seconds=batch_start)
    for entry in srt_entries:
        entry.start = origin + (entry.start - origin) * scale
        entry.end = origin + (entry.end - origin) * scale


async def batch_process_srt_and_audio(script_text, voice, batch_size=500, progress=gr.Progress()):
    """Synthesize ``script_text`` in batches and write combined audio plus subtitles.

    Each batch is sent to edge-tts, converted from MP3 to WAV to measure its
    duration, and given subtitle entries via ``generate_accurate_srt``. The
    entries of a batch are rescaled to the measured duration of that batch so
    subtitle times do not drift away from the audio as batches accumulate.

    Args:
        script_text: Full script to synthesize.
        voice: edge-tts voice name.
        batch_size: Maximum number of characters per synthesis request.
        progress: Gradio progress tracker; accepted for interface
            compatibility and not used inside this function.

    Returns:
        The tuple ``("final_subtitles.srt", "final_audio.wav")`` naming the
        files written to the current working directory.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    total_srt_entries = []
    combined_audio = AudioSegment.empty()
    cumulative_time = 0.0  # Real audio time consumed by the batches so far
    batch_index = 1

    for batch_number, batch_text in enumerate(_split_script_into_batches(script_text, batch_size)):
        # Whitespace-only chunks contain nothing to speak or subtitle.
        if not batch_text.strip():
            continue

        mp3_file = f"audio_batch_{batch_number}.mp3"
        wav_file = f"audio_batch_{batch_number}.wav"
        try:
            # Generate audio for the batch and save it as MP3
            tts = edge_tts.Communicate(batch_text, voice, rate="-25%")
            await tts.save(mp3_file)

            # Convert MP3 to WAV so the duration can be read from the header.
            # export() hands back the output file handle; close it so the data
            # is flushed and the file can be deleted afterwards.
            batch_audio = AudioSegment.from_file(mp3_file, format="mp3")
            batch_audio.export(wav_file, format="wav").close()
            batch_duration = get_audio_length(wav_file)
        finally:
            # Remove both temporary files, also when synthesis or conversion fails.
            for temp_path in (mp3_file, wav_file):
                if os.path.exists(temp_path):
                    os.remove(temp_path)

        batch_start = cumulative_time
        srt_entries, estimated_end = generate_accurate_srt(batch_text, batch_start, batch_index)
        _fit_entries_to_duration(srt_entries, batch_start, estimated_end, batch_duration)
        # Advance by the measured audio length, not the estimate, so the next
        # batch starts exactly where its audio starts in the combined file.
        cumulative_time = batch_start + batch_duration

        total_srt_entries.extend(srt_entries)
        combined_audio += batch_audio
        batch_index += len(srt_entries)

    # Export combined audio and SRT
    combined_audio.export("final_audio.wav", format="wav").close()
    # NOTE: the default text encoding is kept here because
    # validate_srt_against_audio re-reads this file with the default encoding.
    with open("final_subtitles.srt", "w") as srt_file:
        srt_file.write(srt.compose(total_srt_entries))

    # Final validation check
    validate_srt_against_audio("final_subtitles.srt", "final_audio.wav")
    return "final_subtitles.srt", "final_audio.wav"
# Check the subtitle file against the total audio length
def validate_srt_against_audio(srt_file_path, audio_file_path):
    """Clamp a subtitle that runs past the end of the audio, rewriting the SRT file.

    The SRT file at ``srt_file_path`` is parsed, compared against the duration
    of the WAV file at ``audio_file_path``, and written back in place.
    """
    max_seconds = get_audio_length(audio_file_path)

    with open(srt_file_path, 'r') as handle:
        raw_srt = handle.read()
    cues = list(srt.parse(raw_srt))

    # NOTE: only the first cue (in file order) whose end exceeds the audio
    # duration is adjusted; any later cues are left as they are.
    overrunning = next(
        (cue for cue in cues if cue.end.total_seconds() > max_seconds),
        None,
    )
    if overrunning is not None:
        overrunning.end = srt.timedelta(seconds=max_seconds)

    with open(srt_file_path, 'w') as handle:
        handle.write(srt.compose(cues))
# Gradio callback: run the pipeline and turn failures into a message
async def process_script(script_text, language, voice):
    """Generate subtitles and audio for ``script_text`` using ``voice``.

    Returns a 4-tuple ``(srt_path, audio_path, audio_path, message)``. On
    success ``message`` is an empty string; on any exception the error is
    printed, the three paths are ``None`` and ``message`` holds a generic
    error text. ``language`` is accepted but not used here.
    """
    try:
        subtitle_path, sound_path = await batch_process_srt_and_audio(script_text, voice)
    except Exception as error:
        print(f"Error: {error}")
        return None, None, None, "An error occurred. Please check the script text and try again."
    else:
        # The audio path is returned twice: once for download, once for playback.
        return subtitle_path, sound_path, sound_path, ""
# Dynamic voice selection based on language
def update_voice_options(language):
    """Return a Gradio update listing the voices available for ``language``.

    The first voice of the language is pre-selected. For a language with no
    configured voices the update carries an empty choice list and no
    selection.
    """
    voices = {
        "en-US": ["en-US-AndrewNeural", "en-US-JennyNeural"],
        "es-ES": ["es-ES-AlvaroNeural", "es-ES-ElviraNeural"],
    }
    available = voices.get(language, [])
    # Guard the empty case: indexing [0] on an empty list would raise IndexError.
    selected = available[0] if available else None
    return gr.update(choices=available, value=selected)
# Gradio app setup
with gr.Blocks() as app:
    gr.Markdown("# Text to Speech with Accurate SRT and Audio Generation")

    language = gr.Dropdown(choices=["en-US", "es-ES"], label="Select Language", value="en-US")
    # Pre-select the first en-US voice so a voice is always submitted, matching
    # what update_voice_options selects when the language changes.
    # NOTE(review): without an explicit value the initial selection may depend
    # on the Gradio version - confirm against the version deployed.
    voice = gr.Dropdown(
        choices=["en-US-AndrewNeural", "en-US-JennyNeural"],
        label="Select Voice",
        value="en-US-AndrewNeural",
    )
    language.change(fn=update_voice_options, inputs=language, outputs=voice)

    script_text = gr.Textbox(label="Enter Script Text", lines=10)

    # Order matches the 4-tuple returned by process_script.
    outputs = [
        gr.File(label="Download SRT File"),
        gr.File(label="Download Audio File"),
        gr.Audio(label="Play Audio"),
        gr.Markdown(label="Error Message"),  # This will display any error messages
    ]

    submit_button = gr.Button("Generate Audio and SRT")
    submit_button.click(process_script, inputs=[script_text, language, voice], outputs=outputs)

app.launch()