import re
import numpy as np
import random
import sys
import torch
from PIL import Image, ImageDraw, ImageFont
from diffusers import DiffusionPipeline, TCDScheduler
from huggingface_hub import hf_hub_download
from gtts import gTTS
from moviepy.editor import ImageSequenceClip, VideoFileClip, concatenate_videoclips, AudioFileClip
import gradio as gr

# Choose among 1, 2, 4 and 8:
num_inference_steps = 8
base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
repo_name = "ByteDance/Hyper-SD"
plural = "s" if num_inference_steps > 1 else ""
ckpt_name = f"Hyper-SDXL-{num_inference_steps}step{plural}-lora.safetensors"
device = "cpu"

pipe = DiffusionPipeline.from_pretrained(base_model_id).to(device)
pipe.load_lora_weights(hf_hub_download(repo_name, ckpt_name))
pipe.fuse_lora()
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
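
# NOTE: the pipeline above stays on CPU, so SDXL inference will be very slow and
# memory-hungry. If a CUDA GPU is available, moving the pipeline with
# `pipe.to("cuda")` (and setting `device = "cuda"`) speeds this up considerably.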

def generate_image(prompt, step_count=num_inference_steps, seed=None):
    # Hyper-SD LoRAs are distilled for a fixed, small number of steps, so the
    # step count should match the fused checkpoint (num_inference_steps = 8 here).
    if seed is None:
        seed = random.randint(0, sys.maxsize)
    generator = torch.Generator(device).manual_seed(seed)
    eta = 0.5
    images = pipe(
        prompt=prompt,
        num_inference_steps=step_count,
        guidance_scale=0.0,
        eta=eta,
        generator=generator,
    ).images
    return images[0]
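
# Overlay the sentence on the generated frame as word-wrapped, centered
# subtitle lines on a black background strip near the bottom of the image.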
def draw_text_on_image(image, text, font_path="arial.ttf", font_size=24):
    image_with_text = image.copy()
    draw = ImageDraw.Draw(image_with_text)
    try:
        font = ImageFont.truetype(font_path, font_size)
    except OSError:
        print(f"Font {font_path} not found. Using default font.")
        font = ImageFont.load_default()

    # Split the text into lines that fit within the image width
    lines = []
    max_width = image.width - 20  # Padding of 10 pixels on each side
    words = text.split()
    while words:
        line = ''
        while words and draw.textlength(f"{line} {words[0]}".strip(), font=font) <= max_width:
            line = f"{line} {words.pop(0)}" if line else words.pop(0)
        if not line:
            # A single word wider than max_width: take it anyway to avoid looping forever
            line = words.pop(0)
        lines.append(line)

    # Calculate total text height
    text_height = sum(draw.textbbox((0, 0), line, font=font)[3] for line in lines)
    # Position text at the bottom of the image, with 20 pixels of padding from the edge
    text_y = image.height - text_height - 20
    for line in lines:
        text_bbox = draw.textbbox((0, 0), line, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        line_height = text_bbox[3] - text_bbox[1]
        text_x = (image.width - text_width) // 2  # Centered horizontally
        # Draw background rectangle for the text
        draw.rectangle([(text_x - 5, text_y - 5), (text_x + text_width + 5, text_y + line_height + 5)], fill="black")
        # Draw the text on top of the rectangle
        draw.text((text_x, text_y), line, font=font, fill="white")
        text_y += line_height + 5  # Move to the next line with some padding
    return image_with_text
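
# Split the story into sentences, then build one short clip per sentence:
# a Hyper-SDXL image with subtitles, narrated with gTTS and assembled via moviepy.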
def process_story(story):
    # Use regular expressions to split the text into sentences
    sentences = re.split(r'(?<=[.!?]) +', story.strip())
    # Initialize the list of per-sentence video clips
    video_clips = []
    fps = 24  # Frames per second
    # Generate images, overlay text, and create audio
    for i, sentence in enumerate(sentences):
        print(f"Sentence {i+1}: {sentence}\n")
        seed = random.randint(0, sys.maxsize)
        image = generate_image(sentence, step_count=num_inference_steps, seed=seed)  # Match the Hyper-SD LoRA's step count
        resized_image = image.resize((256, 256))
        image_with_text = draw_text_on_image(resized_image, sentence)
        # Save the image with text
        image_path = f"sentence_{i+1}.png"
        image_with_text.save(image_path)
        frame = np.array(image_with_text)  # Convert to NumPy array
        # Generate audio for the sentence
        tts = gTTS(sentence, lang='en')
        audio_path = f"sentence_{i+1}.mp3"
        tts.save(audio_path)
        audio_clip = AudioFileClip(audio_path)
        # Create a video clip from the image and set the duration to the audio duration
        video_clip = ImageSequenceClip([frame], fps=fps)
        video_clip = video_clip.set_duration(audio_clip.duration)
        video_clip = video_clip.set_audio(audio_clip)
        # Save the individual video clip
        clip_path = f"sentence_{i+1}.mp4"
        video_clip.write_videofile(clip_path, codec="libx264", audio_codec="aac")
        video_clips.append(video_clip)
        # Clear memory
        del resized_image, image_with_text

    # Concatenate all video clips into a final video
    final_video = concatenate_videoclips(video_clips)
    final_video_path = "story_video.mp4"
    final_video.write_videofile(final_video_path, codec="libx264", audio_codec="aac")
    return final_video_path
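
# Gradio callback: wraps process_story so the Interface can return the video path.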
def generate_story_video(story):
    final_video_path = process_story(story)
    return final_video_path
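
# Minimal Gradio UI: a text box for the story in, the rendered video out.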
iface = gr.Interface(
    fn=generate_story_video,
    inputs="text",
    outputs="video",
    title="Story to Video Generator",
    description="Enter a story and get a video with images and narrated text.",
)

if __name__ == "__main__":
    iface.launch(share=True)