import argparse

import gradio as gr
import numpy as np
from torch import autocast
from diffusers import StableDiffusionPipeline
from moviepy.editor import AudioFileClip, ImageClip

parser = argparse.ArgumentParser()
parser.add_argument('--setshare', default=True, action=argparse.BooleanOptionalAction)
args = parser.parse_args()


def process_inputs(prompt, audio):
    image = get_stable_diffusion_image(prompt)
    video = add_static_image_to_audio(image, audio)
    return video


def add_static_image_to_audio(image, audio):
    """Combine a static image with the audio file at `audio` and return
    the path of the resulting video file."""
    # create the audio clip object from the uploaded file path
    audio_clip = AudioFileClip(audio)
    # create the image clip object (ImageClip expects a file path or a
    # numpy array, so convert the PIL image)
    image_clip = ImageClip(np.array(image))
    # use set_audio from the image clip to combine the audio with the image
    video_clip = image_clip.set_audio(audio_clip)
    # make the new clip as long as the audio clip
    video_clip = video_clip.set_duration(audio_clip.duration)
    # write the resulting video clip; a static image only needs 1 fps
    output_path = "output.mp4"
    video_clip.write_videofile(output_path, fps=1)
    return output_path


def get_stable_diffusion_image(prompt):
    model_id = "CompVis/stable-diffusion-v1-4"
    device = "cuda"
    pipe = StableDiffusionPipeline.from_pretrained(model_id, use_auth_token=True)
    pipe = pipe.to(device)
    with autocast("cuda"):
        image = pipe(prompt, guidance_scale=7.5).images[0]
    return image


iface = gr.Interface(
    fn=process_inputs,
    # pass the uploaded audio through as a file path so moviepy can open it
    inputs=["text", gr.Audio(type="filepath")],
    outputs="video",
)
iface.launch(share=args.setshare)
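
# A minimal usage sketch, assuming the script is saved as app.py (the
# filename is an assumption, not from the source):
#
#   python app.py                 # launch with a public Gradio share link
#   python app.py --no-setshare   # local-only launch
#
# --no-setshare is the negative flag that argparse.BooleanOptionalAction
# generates automatically for --setshare.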