# Spaces: TempoFunk / makeavid-sd-jax
#
# Runtime error
#
# File size: 9,227 Bytes
#
# 8c4daf1


import os
from io import BytesIO
import base64
from functools import partial

from PIL import Image, ImageOps
import gradio as gr

from makeavid_sd.inference import InferenceUNetPseudo3D, FlaxDPMSolverMultistepScheduler, jnp


# Switch for the warm-up run guarded by `if _preheat:` further down the file.
_preheat: bool = False

# Parameter combinations that have already been passed through `generate`,
# stored as (no_hint_image, inference_steps, height, width, num_frames).
_seen_compilations = set()

# Single shared inference pipeline used by every request.
# The Hub token is optional: a missing environment variable yields None.
_model = InferenceUNetPseudo3D(
    model_path='TempoFunk/makeavid-sd-jax',
    scheduler_cls=FlaxDPMSolverMultistepScheduler,
    dtype=jnp.float16,
    hf_auth_token=os.environ.get('HUGGING_FACE_HUB_TOKEN'),
)

def _fit_image(picture, mode, size):
    """Convert ``picture`` to ``mode`` and fit it to ``size``.

    ``picture`` may be ``None``, in which case ``None`` is returned. ``size``
    is a ``(width, height)`` tuple. Conversion and resizing are skipped when
    the image already has the requested mode / size.
    """
    if picture is None:
        return None
    if picture.mode != mode:
        picture = picture.convert(mode)
    if picture.size != size:
        picture = ImageOps.fit(picture, size, method = Image.Resampling.LANCZOS)
    return picture


# NOTE: the parameters are deliberately left without type hints; per the
# original author, gradio misbehaves when this callback is annotated.
def generate(
        prompt = 'An elderly man having a great time in the park.',
        neg_prompt = '',
        image = None,
        inference_steps = 20,
        cfg = 12.0,
        seed = 0,
        fps = 24,
        num_frames = 24,
        height = 512,
        width = 512
) -> str:
    """Generate an animation and return it as a base64 WebP data URI.

    Args:
        prompt: Text prompt; it is repeated ``_model.device_count`` times
            before being handed to the model.
        neg_prompt: Negative prompt, forwarded unchanged.
        image: ``None`` or a dict with the keys ``'image'`` (hint image) and
            ``'mask'`` (mask image); either value may itself be ``None``.
            ``None`` is equivalent to ``{'image': None, 'mask': None}``.
        inference_steps: Number of inference steps (truncated to ``int``).
        cfg: Guidance scale, forwarded unchanged.
        seed: Random seed (truncated to ``int``); a negative seed is replaced
            by its absolute value.
        fps: Playback rate; each frame lasts ``round(1000 / fps)`` ms.
        num_frames: Number of frames to generate (truncated to ``int``).
        height: Output height in pixels (truncated to ``int``).
        width: Output width in pixels (truncated to ``int``).

    Returns:
        A ``data:image/webp;base64,...`` string holding the animation.

    Side effects:
        Adds ``(hint_image is None, inference_steps, height, width,
        num_frames)`` to ``_seen_compilations`` once the model call returned.
    """
    # The UI may deliver numeric values as floats; normalise them once here.
    height = int(height)
    width = int(width)
    num_frames = int(num_frames)
    inference_steps = int(inference_steps)
    seed = abs(int(seed))

    if image is None:
        hint_image = None
        mask_image = None
    else:
        hint_image = image['image']
        mask_image = image['mask']
    # Hint is fed as RGB, mask as single-channel 'L', both at output size.
    hint_image = _fit_image(hint_image, 'RGB', (width, height))
    mask_image = _fit_image(mask_image, 'L', (width, height))

    images = _model.generate(
            prompt = [prompt] * _model.device_count,
            neg_prompt = neg_prompt,
            hint_image = hint_image,
            mask_image = mask_image,
            inference_steps = inference_steps,
            cfg = cfg,
            height = height,
            width = width,
            num_frames = num_frames,
            seed = seed
    )
    # Remember this combination so check_if_compiled can recognise it later.
    _seen_compilations.add((hint_image is None, inference_steps, height, width, num_frames))

    # Encode all frames into one animated WebP held in memory; the context
    # manager releases the buffer even if encoding raises.
    with BytesIO() as buffer:
        images[0].save(
                buffer,
                format = 'webp',
                save_all = True,
                append_images = images[1:],
                loop = 0,
                duration = round(1000 / fps),
                allow_mixed = True
        )
        data = base64.b64encode(buffer.getvalue()).decode()
    return 'data:image/webp;base64,' + data

def check_if_compiled(image, inference_steps, height, width, num_frames, message):
    """Return ``message`` unless this parameter combination was already run.

    The lookup key mirrors the tuple that ``generate`` adds to
    ``_seen_compilations``: ``(hint_image is None, inference_steps, height,
    width, num_frames)``. All numeric values are truncated with ``int`` in the
    same way ``generate`` does, so both functions agree on the key.

    Args:
        image: ``None`` or a dict whose ``'image'`` entry is the hint image
            (possibly ``None``).
        inference_steps: Number of inference steps.
        height: Output height.
        width: Output width.
        num_frames: Number of frames.
        message: Text to return when the combination has not been seen yet.

    Returns:
        ``''`` when the combination is in ``_seen_compilations``, otherwise
        ``message`` formatted as a string.
    """
    hint_image = None if image is None else image['image']
    key = (
            hint_image is None,
            int(inference_steps),
            int(height),
            int(width),
            int(num_frames)
    )
    if key in _seen_compilations:
        return ''
    return f"""{message}"""

if _preheat:
    # Optional warm-up: call `generate` once without and once with a hint
    # image. Height, width, fps and frame count fall back to the defaults
    # declared on `generate`.
    print('\npreheating the oven')
    generate(
        prompt='preheating the oven',
        neg_prompt='',
        image={'image': None, 'mask': None},
        inference_steps=20,
        cfg=12.0,
        seed=0,
    )
    print('Entertaining the guests with sailor songs played on an old piano.')
    # Second pass uses an all-black 512x512 RGB image as the hint.
    dada = generate(
        prompt='Entertaining the guests with sailor songs played on an old harmonium.',
        neg_prompt='',
        image={
            'image': Image.new('RGB', size=(512, 512), color=(0, 0, 0)),
            'mask': None,
        },
        inference_steps=20,
        cfg=12.0,
        seed=0,
    )
    print('dinner is ready\n')

# UI definition: an intro row, then a two-column row with the input controls
# on the left and the status messages plus the output image on the right.
with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled = False) as demo:
    variant = 'panel'
    with gr.Row():
        with gr.Column():
            intro1 = gr.Markdown("""
                        # Make-A-Video Stable Diffusion JAX
                        **Please be patient. The model might have to compile with current parameters.**

                        This can take up to 5 minutes on the first run, and 2-3 minutes on later runs.
                        The compilation will be cached and consecutive runs with the same parameters
                        will be much faster.
            """)
        with gr.Column():
            intro2 = gr.Markdown("""
                        The following parameters require the model to compile
                        - Number of frames
                        - Width & Height
                        - Steps
                        - Input image vs. no input image
            """)

    with gr.Row(variant = variant):
        # Left column: action buttons followed by every generation parameter.
        with gr.Column(variant = variant):
            with gr.Row():
                cancel_button = gr.Button(value = 'Cancel')
                submit_button = gr.Button(value = 'Make A Video', variant = 'primary')
            prompt_input = gr.Textbox(
                    label = 'Prompt',
                    value = 'They are dancing in the club while sweat drips from the ceiling.',
                    interactive = True
            )
            neg_prompt_input = gr.Textbox(
                    label = 'Negative prompt (optional)',
                    value = '',
                    interactive = True
            )
            inference_steps_input = gr.Slider(
                label = 'Steps',
                minimum = 1,
                maximum = 100,
                value = 20,
                step = 1
            )
            cfg_input = gr.Slider(
                    label = 'Guidance scale',
                    minimum = 1.0,
                    maximum = 20.0,
                    step = 0.1,
                    value = 15.0,
                    interactive = True
            )
            seed_input = gr.Number(
                    label = 'Random seed',
                    value = 0,
                    interactive = True,
                    precision = 0
            )
            # With the sketch tool the callbacks receive a dict with the keys
            # 'image' and 'mask' (or None when nothing was uploaded), which is
            # the shape `generate` and `check_if_compiled` index into.
            image_input = gr.Image(
                    label = 'Input image (optional)',
                    interactive = True,
                    image_mode = 'RGB',
                    type = 'pil',
                    optional = True,
                    source = 'upload',
                    tool = 'sketch'
            )
            num_frames_input = gr.Slider(
                    label = 'Number of frames to generate',
                    minimum = 1,
                    maximum = 24,
                    step = 1,
                    value = 24
            )
            width_input = gr.Slider(
                    label = 'Width',
                    minimum = 64,
                    maximum = 512,
                    step = 1,
                    value = 448
            )
            height_input = gr.Slider(
                    label = 'Height',
                    minimum = 64,
                    maximum = 512,
                    step = 1,
                    value = 448
            )
            fps_input = gr.Slider(
                    label = 'Output FPS',
                    minimum = 1,
                    maximum = 1000,
                    step = 1,
                    value = 12
            )
        # Right column: compilation notices and the generated animation.
        with gr.Column(variant = variant):
            will_trigger = gr.Markdown('')
            patience = gr.Markdown('')
            image_output = gr.Image(
                    label = 'Output',
                    value = 'example.webp',
                    interactive = False
            )

    # Inputs are listed in the positional order expected by check_if_compiled.
    trigger_inputs =  [ image_input, inference_steps_input, height_input, width_input, num_frames_input ]
    trigger_check_fun = partial(check_if_compiled, message = 'Current parameters will trigger compilation.')
    # Refresh the "will trigger compilation" notice whenever a slider that is
    # part of the compilation key changes.
    # NOTE(review): image_input is part of the key but has no change listener;
    # confirm whether that omission is intentional.
    height_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
    width_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
    num_frames_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
    inference_steps_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
    # Compute the initial notice from the components' default values.
    will_trigger.value = trigger_check_fun(image_input.value, inference_steps_input.value, height_input.value, width_input.value, num_frames_input.value)

    # Submit chain: 1) show the "please be patient" notice if needed,
    # 2) run the generation, 3) refresh the compilation notice.
    patience_event = submit_button.click(
        fn = partial(
                check_if_compiled,
                message = 'Please be patient. The model has to be compiled with current parameters.'
        ),
        inputs = trigger_inputs,
        outputs = patience
    )
    generate_event = patience_event.then(
        fn = generate,
        inputs = [
                prompt_input,
                neg_prompt_input,
                image_input,
                inference_steps_input,
                cfg_input,
                seed_input,
                fps_input,
                num_frames_input,
                height_input,
                width_input
        ],
        outputs = image_output,
        # generate already returns a data URI, so skip output postprocessing.
        postprocess = False
    )
    ev = generate_event.then(
        fn = trigger_check_fun,
        inputs = trigger_inputs,
        outputs = will_trigger
    )
    # A component is not a listener by itself: cancellation has to be
    # registered through `.click(..., cancels=...)`. Every step of the chain
    # is listed so pending steps are cancelled as well, not only the last one.
    cancel_button.click(
        fn = None,
        inputs = None,
        outputs = None,
        cancels = [ patience_event, generate_event, ev ]
    )

# Route events through gradio's queue (concurrency_count=1, max_size=16),
# then start the app.
demo.queue(max_size=16, concurrency_count=1)
demo.launch()