import os

import gradio as gr
import torch
from torch import autocast
from diffusers import StableDiffusionPipeline
from moviepy.editor import AudioFileClip, ImageClip


def process_inputs(prompt, audio):
    image = get_stable_diffusion_image(prompt)
    video = add_static_image_to_audio(image, audio)
    return video


def add_static_image_to_audio(image, audio):
    """Combine the static image at path `image` with the audio file at
    path `audio` into a video, and return the path of the written file."""
    # create the audio clip object
    audio_clip = AudioFileClip(audio)
    # create the image clip object
    image_clip = ImageClip(image)
    # use the set_audio method from the image clip to combine the audio with the image
    video_clip = image_clip.set_audio(audio_clip)
    # make the new clip last as long as the audio clip
    video_clip.duration = audio_clip.duration
    # the frame is static, so 1 FPS is enough
    video_clip.fps = 1
    # write the resulting video clip
    path = "temp/video_out.mp4"
    video_clip.write_videofile(path)
    return path


def get_stable_diffusion_image(prompt):
    model_id = "CompVis/stable-diffusion-v1-4"
    device = "cuda"
    # requires being logged in to the Hugging Face Hub to download the weights
    pipe = StableDiffusionPipeline.from_pretrained(model_id, use_auth_token=True)
    pipe = pipe.to(device)
    # run inference in mixed precision; the pipeline output exposes the
    # generated images via `.images`, so take the first one
    with autocast("cuda"):
        image = pipe(prompt, guidance_scale=7.5).images[0]
    path = "temp/out.jpg"
    image.save(path)
    return path


# make sure the output directory exists before writing to it
os.makedirs("temp", exist_ok=True)

iface = gr.Interface(
    fn=process_inputs,
    inputs=["text", gr.Audio(type="filepath")],
    outputs="video",
)
iface.launch()