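"""Gradio app: turn text into a narrated slideshow video.

Pipeline: split the input text into chunks, synthesize narration with
Coqui TTS, generate one Stable Diffusion image per chunk, then assemble
the images and audio into an MP4 with moviepy.
"""
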
import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from TTS.api import TTS
import moviepy.editor as mp
from PIL import Image
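# Note: `moviepy.editor` exists in moviepy 1.x only; moviepy 2.x removed
# that module (clips are imported from `moviepy` directly), so this script
# assumes a 1.x install, e.g. `pip install "moviepy<2.0"`.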

def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=10):
    """Split text into chunks sized for roughly min_sec..max_sec of narration."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        # Close a chunk once it has reached min_sec and hits either the
        # max duration or a 20-word cap, whichever comes first.
        if current_duration >= min_sec and (current_duration >= max_sec or len(current_chunk) > 20):
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    total_time = sum(min(max(len(chunk.split()) / words_per_second, min_sec), max_sec) for chunk in chunks)
    print(f"Total estimated time for video: {total_time:.2f} seconds")
    return chunks
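# Example of the chunking arithmetic above: at 2.5 words/sec, min_sec=5
# and max_sec=10 correspond to 12.5 and 25 words, and the 20-word cap
# triggers first, so a chunk closes on its 21st word; a 50-word input
# therefore splits into chunks of roughly 21, 21, and 8 words.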

def generate_speech(text):
    # Synthesize narration for the full text at once (not per chunk)
    # with Coqui TTS's LJSpeech Tacotron2-DDC model.
    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path

def generate_images(chunks, image_size=(640, 480)):
    # Generate one Stable Diffusion image per text chunk.
    pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
    pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    image_paths = []
    for i, chunk in enumerate(chunks):
        print(f"Generating image for chunk {i + 1} of {len(chunks)}: {chunk[:50]}...")
        image = pipe(chunk).images[0]
        image = image.resize(image_size)  # SD v1.4 outputs 512x512; resize to the target size
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths
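# Instead of resizing after the fact, the diffusers pipeline call itself
# accepts target dimensions (both must be multiples of 8), e.g.:
#   image = pipe(chunk, height=480, width=640).images[0]
# though SD v1.4 tends to compose best at its native 512x512.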

def create_video(images, durations, speech_path, image_size=(640, 480)):
    # One clip per image, padded with black frames at the start and end.
    clips = [mp.ImageClip(img).set_duration(dur).resize(image_size) for img, dur in zip(images, durations)]
    black_start = mp.ColorClip(image_size, color=(0, 0, 0), duration=1)
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips([black_start] + clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"

def process_text(text, image_size):
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size)
    # Same clamped per-chunk estimate used in estimate_chunk_durations.
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, image_size)
    return video_path

with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator using AI 🎥")
    text_input = gr.Textbox(label="Enter your text")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()

    def handle_request(text, file, image_size):
        # An uploaded .txt file takes precedence over the textbox contents.
        if file is not None:
            with open(file.name, "r") as f:
                text = f.read()
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, image_size_dict[image_size])

    process_btn.click(handle_request, inputs=[text_input, file_input, image_size_input], outputs=output_video)

demo.launch()
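# On Hugging Face Spaces, `demo.launch()` is all that's needed; when
# running locally, `demo.launch(share=True)` also gives a temporary
# public URL.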