# generate_video.py
"""Gradio app that turns an image and a sentence into a still-image video.

The text is synthesised to speech with gTTS purely to measure how long it
takes to speak; the output video repeats the image for a multiple of that
duration. The video written here carries no audio track.
"""
import os
import tempfile

import cv2
import gradio as gr
from gtts import gTTS
from pydub import AudioSegment

# Frame rate of the generated video.
FPS = 60
# The clip lasts this many times the (rounded) spoken duration.
# NOTE(review): the factor of 3 is inherited as-is; confirm it is intended.
DURATION_MULTIPLIER = 3


def text_to_wav(text, file_name):
    """Synthesise *text* as English speech and save it to *file_name*.

    NOTE: despite this function's name, gTTS documents its output as MP3
    data; the extension of *file_name* does not change the encoding.
    """
    language = 'en'
    tts = gTTS(text=text, lang=language, slow=False)
    tts.save(file_name)


def _load_frame(input_image):
    """Return *input_image* as a BGR frame suitable for ``cv2.VideoWriter``.

    Accepts either a filesystem path or an in-memory pixel array (which is
    what Gradio's ``"image"`` input supplies by default, in RGB order).

    Raises:
        ValueError: if no image is given or the file cannot be decoded.
    """
    if input_image is None:
        raise ValueError("No image was provided.")

    if isinstance(input_image, (str, os.PathLike)):
        frame = cv2.imread(os.fspath(input_image))
        # cv2.imread signals failure by returning None rather than raising.
        if frame is None:
            raise ValueError(f"Could not read image file: {input_image!r}")
        return frame

    # OpenCV encodes frames in BGR order, so convert from the array layout.
    if input_image.ndim == 2:
        return cv2.cvtColor(input_image, cv2.COLOR_GRAY2BGR)
    if input_image.shape[2] == 4:
        return cv2.cvtColor(input_image, cv2.COLOR_RGBA2BGR)
    return cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)


def _speech_duration_seconds(text):
    """Return how long *text* takes to speak, in seconds."""
    # A unique temp file keeps concurrent requests from overwriting each
    # other's audio and avoids leaving files in the working directory.
    fd, audio_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    try:
        text_to_wav(text, audio_path)
        audio = AudioSegment.from_file(audio_path)
        # len() of an AudioSegment is its duration in milliseconds.
        return len(audio) / 1000.0
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)


def generate_video(input_image, input_text):
    """Create a still-image MP4 whose length is derived from spoken text.

    Args:
        input_image: Path to an image file, or a pixel array as delivered
            by Gradio's ``"image"`` input.
        input_text: Text whose spoken duration determines the video length.

    Returns:
        Path of the generated ``.mp4`` file (a new temporary file per call).

    Raises:
        ValueError: if the text is empty or the image is missing/unreadable.
        RuntimeError: if the video file cannot be opened for writing.
    """
    if not input_text or not input_text.strip():
        raise ValueError("input_text must contain some text to speak.")

    # Validate the image before making the text-to-speech request.
    frame = _load_frame(input_image)

    duration_seconds = _speech_duration_seconds(input_text)
    video_duration_seconds = round(duration_seconds) * DURATION_MULTIPLIER
    # Very short audio rounds to 0 s; always write at least one frame so the
    # output is a valid video.
    frame_count = max(1, int(FPS * video_duration_seconds))

    fd, video_file = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)

    height, width = frame.shape[:2]
    video = cv2.VideoWriter(
        video_file,
        cv2.VideoWriter_fourcc(*'mp4v'),
        FPS,
        (width, height),
    )
    # VideoWriter does not raise when it cannot open its target; check it.
    if not video.isOpened():
        video.release()
        os.remove(video_file)
        raise RuntimeError("Could not open a video file for writing.")

    try:
        for _ in range(frame_count):
            video.write(frame)
    finally:
        video.release()

    return video_file


if __name__ == "__main__":
    iface = gr.Interface(
        fn=generate_video,
        inputs=["image", "text"],
        outputs="video",
    )
    # share=True asks Gradio to create a publicly reachable link.
    iface.launch(share=True)