Spaces:

PAIR
/

StreamingT2V

Runtime error

File size: 9,469 Bytes

81022ab
 
 
 
 
 
 
f949b3f
81022ab
 
 
f949b3f
81022ab
 
 
 
f949b3f
81022ab

# General
import os
from os.path import join as opj
import argparse
import datetime
from pathlib import Path
import torch
import gradio as gr
import tempfile
import yaml
from t2v_enhanced.model.video_ldm import VideoLDM

# Utilities
from t2v_enhanced.inference_utils import *
from t2v_enhanced.model_init import *
from t2v_enhanced.model_func import *


# True only when the SPACE_AUTHOR_NAME environment variable equals "PAIR";
# used below to pick the hosted-Space UI banner and launch settings.
on_huggingspace = os.environ.get("SPACE_AUTHOR_NAME") == "PAIR"

parser = argparse.ArgumentParser()
# Public sharing is enabled by default. `--public_access` is kept so existing
# command lines keep working; `--no_public_access` writes to the same
# destination and is the only way to switch sharing off.
parser.add_argument('--public_access', action='store_true', default=True,
                    help="Create a public share link when launching locally (default).")
parser.add_argument('--no_public_access', dest='public_access', action='store_false',
                    help="Do not create a public share link when launching locally.")
parser.add_argument('--where_to_log', type=str, default="gradio_output",
                    help="Directory where results are stored.")
parser.add_argument('--device', type=str, default="cuda",
                    help="Torch device string used for the models.")
args = parser.parse_args()


# Make sure the output directory exists and keep its absolute path around.
Path(args.where_to_log).mkdir(parents=True, exist_ok=True)
result_fol = Path(args.where_to_log).absolute()
device = args.device


# --------------------------
# ----- Configurations -----
# --------------------------
# Absolute path of the StreamingT2V checkpoint (relative part is resolved
# against the current working directory).
ckpt_file_streaming_t2v = Path("t2v_enhanced/checkpoints/streaming_t2v.ckpt").absolute()

# Settings handed to init_v2v_model() and video2video() for the enhancement
# stage. NOTE(review): the exact meaning of each key is defined by those
# helpers, which are not visible in this file.
cfg_v2v = dict(
    downscale=1,
    upscale_size=(1280, 720),
    model_id='damo/Video-to-Video',
    pad=True,
)


# --------------------------
# ----- Initialization -----
# --------------------------
# Base text-to-video model used for the initial short clip.
ms_model = init_modelscope(device)
# ZeroScope is currently disabled (the UI below states only ModelScope is supported).
# zs_model = init_zeroscope(device)
# StreamingT2V model plus its CLI wrapper; results are written below result_fol.
stream_cli, stream_model = init_streamingt2v_model(ckpt_file_streaming_t2v, result_fol)
# Video-to-video enhancement model configured by cfg_v2v.
msxl_model = init_v2v_model(cfg_v2v)

# Random generator shared by all generate() calls. It is created on the device
# selected with --device so it matches the device handed to the models above,
# instead of always being pinned to "cuda".
inference_generator = torch.Generator(device=device)


# -------------------------
# ----- Functionality -----
# -------------------------
def generate(prompt, num_frames, image, model_name_stage1, model_name_stage2, n_prompt, seed, t, image_guidance, where_to_log=result_fol):
    """Generate a long video preview for ``prompt`` and return the path of the result.

    A short clip is produced with ``ms_short_gen`` and then extended by
    ``stream_long_gen``.

    Args:
        prompt: Text prompt; its first 100 characters also seed the output file name.
        num_frames: Dropdown value such as ``"56"`` or ``"80 - only on local"``;
            ``None`` or ``[]`` selects the default of 56 frames.
        image: Unused here (reserved for image-to-video base models).
        model_name_stage1: Unused here.
        model_name_stage2: Unused here.
        n_prompt: Negative prompt forwarded to ``stream_long_gen``.
        seed: Seed for the shared generator; also forwarded to ``stream_long_gen``.
        t: Number of timesteps forwarded to both generation stages.
        image_guidance: Image guidance scale forwarded to ``stream_long_gen``.
        where_to_log: Directory expected to contain the resulting video.

    Returns:
        ``where_to_log`` joined with ``<name>.mp4``. The file itself is
        presumably written by ``stream_long_gen`` — confirm against that helper.
    """
    now = datetime.datetime.now()
    # The prompt comes straight from the web UI and ends up in a file name, so
    # path separators are neutralised together with spaces.
    safe_prompt = prompt[:100].translate(str.maketrans({" ": "_", "/": "_", "\\": "_"}))
    name = safe_prompt + "_" + str(now.time()).replace(":", "_").replace(".", "_")

    # The dropdown yields None (or an empty list) when nothing is selected.
    if num_frames == [] or num_frames is None:
        num_frames = 56
    else:
        # Choices look like "80 - only on local"; the leading token is the count.
        num_frames = int(num_frames.split(" ")[0])

    # Number of autoregressive extension steps: whole 8-frame chunks beyond the
    # first 8 frames. (`num_frames/8-8` produced a negative float, e.g. -1.0
    # for 56 frames.)
    # NOTE(review): assumes the short clip contributes 8 frames and each step
    # adds 8 more — confirm against stream_long_gen.
    n_autoreg_gen = (num_frames - 8) // 8

    # torch.Generator.manual_seed requires an integer seed.
    seed = int(seed)
    inference_generator.manual_seed(seed)
    short_video = ms_short_gen(prompt, ms_model, inference_generator, t, device)
    stream_long_gen(prompt, short_video, n_autoreg_gen, n_prompt, seed, t, image_guidance, name, stream_cli, stream_model)
    video_path = opj(where_to_log, name+".mp4")
    return video_path

def enhance(prompt, input_to_enhance):
    """Run the video-to-video enhancement stage on an existing video.

    Args:
        prompt: Text prompt forwarded to ``video2video``.
        input_to_enhance: The video to enhance (the stage-1 preview in the UI).

    Returns:
        Whatever ``video2video`` returns for the given inputs, using the
        module-level ``result_fol``, ``cfg_v2v`` and ``msxl_model``.
    """
    return video2video(prompt, input_to_enhance, result_fol, cfg_v2v, msxl_model)


# --------------------------
# ----- Gradio-Demo UI -----
# --------------------------
with gr.Blocks() as demo:
    # Page header: title, authors, affiliations, links and a short abstract.
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
            <a href="https://github.com/Picsart-AI-Research/StreamingT2V" style="color:blue;">StreamingT2V</a> 
        </h1>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
        Roberto Henschel<sup>1*</sup>, Levon Khachatryan<sup>1*</sup>, Daniil Hayrapetyan<sup>1*</sup>, Hayk Poghosyan<sup>1</sup>, Vahram Tadevosyan<sup>1</sup>, Zhangyang Wang<sup>1,2</sup>, Shant Navasardyan<sup>1</sup>, Humphrey Shi<sup>1,3</sup>
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
        <sup>1</sup>Picsart AI Research (PAIR), <sup>2</sup>UT Austin, <sup>3</sup>SHI Labs @ Georgia Tech, Oregon & UIUC
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
        *Equal Contribution
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
        [<a href="https://arxiv.org/abs/2403.14773" style="color:blue;">arXiv</a>] 
        [<a href="https://github.com/Picsart-AI-Research/StreamingT2V" style="color:blue;">GitHub</a>]
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
        <b>StreamingT2V</b> is an advanced autoregressive technique that enables the creation of long videos featuring rich motion dynamics without any stagnation. 
        It ensures temporal consistency throughout the video, aligns closely with the descriptive text, and maintains high frame-level image quality. 
        Our demonstrations include successful examples of videos up to <b>1200 frames, spanning 2 minutes</b>, and can be extended for even longer durations. 
        Importantly, the effectiveness of StreamingT2V is not limited by the specific Text2Video model used, indicating that improvements in base models could yield even higher-quality videos.
        </h2>
        </div>
        """)

    # Extra hint, shown only when running on the hosted Space.
    if on_huggingspace:
        gr.HTML("""
        <p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
        <br/>
        <a href="https://huggingface.co/spaces/PAIR/StreamingT2V?duplicate=true">
        <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
        </p>""")

    with gr.Row():
        # Left column: stage 1 (long video preview) inputs, output and options.
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        # No value is preselected; generate() falls back to 56
                        # frames when nothing is chosen.
                        num_frames = gr.Dropdown(
                            [
                                "24",
                                "32",
                                "40",
                                "48",
                                "56",
                                "80 - only on local",
                                "240 - only on local",
                                "600 - only on local",
                                "1200 - only on local",
                                "10000 - only on local",
                            ],
                            label="Number of Video Frames: Default is 56",
                            info="For >80 frames use local workstation!",
                        )
                    with gr.Row():
                        prompt_stage1 = gr.Textbox(label='Textual Prompt', placeholder="Ex: Dog running on the street.")
                    with gr.Row():
                        image_stage1 = gr.Image(label='Image Prompt (only required for I2V base models)', show_label=True, scale=1, show_download_button=True)
                with gr.Column():
                    video_stage1 = gr.Video(label='Long Video Preview', show_label=True, interactive=False, scale=2, show_download_button=True)
            with gr.Row():
                run_button_stage1 = gr.Button("Long Video Preview Generation")

            with gr.Row():
                with gr.Column():
                    with gr.Accordion('Advanced options', open=False):
                        model_name_stage1 = gr.Dropdown(
                            choices=["T2V: ModelScope", "T2V: ZeroScope", "I2V: AnimateDiff"],
                            label="Base Model. Default is ModelScope",
                            info="Currently supports only ModelScope. We will add more options later!",
                        )
                        model_name_stage2 = gr.Dropdown(
                            choices=["ModelScope-XL", "Another", "Another"],
                            label="Enhancement Model. Default is ModelScope-XL",
                            info="Currently supports only ModelScope-XL. We will add more options later!",
                        )
                        n_prompt = gr.Textbox(label="Optional Negative Prompt", value='')
                        seed = gr.Slider(label='Seed', minimum=0, maximum=65536, value=33, step=1)

                        t = gr.Slider(label="Timesteps", minimum=0, maximum=100, value=50, step=1)
                        image_guidance = gr.Slider(label='Image guidance scale', minimum=1, maximum=10, value=9.0, step=1.0)

        # Right column: stage 2 (enhanced video) output and trigger.
        with gr.Column():
            with gr.Row():
                video_stage2 = gr.Video(label='Enhanced Long Video', show_label=True, interactive=False, height=473, show_download_button=True)
            with gr.Row():
                run_button_stage2 = gr.Button("Long Video Enhancement")

    # Page footer: version, caution and bias notices.
    gr.HTML(
        """
        <div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
        <h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
        <b>Version: v1.0</b>
        </h3>
        <h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
        <b>Caution</b>: 
        We would like to raise the awareness of users of this demo of its potential issues and concerns.
        Like previous large foundation models, StreamingT2V could be problematic in some cases; in part because it builds on the pretrained ModelScope model, StreamingT2V can inherit its imperfections.
        So far, we keep all features available for research testing both to show the great potential of the StreamingT2V framework and to collect important feedback to improve the model in the future.
        We welcome researchers and users to report issues with the HuggingFace community discussion feature or email the authors.
        </h3>
        <h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
        <b>Biases and content acknowledgement</b>:
        Beware that StreamingT2V may output content that reinforces or exacerbates societal biases, as well as realistic faces, pornography, and violence. 
        StreamingT2V in this demo is meant only for research purposes.
        </h3>
        </div>
        """)

    # Stage 1: the input order must match the positional parameters of generate().
    inputs_t2v = [prompt_stage1, num_frames, image_stage1, model_name_stage1, model_name_stage2, n_prompt, seed, t, image_guidance]
    run_button_stage1.click(fn=generate, inputs=inputs_t2v, outputs=video_stage1,)

    # Stage 2: enhance the stage-1 preview using the same text prompt.
    inputs_v2v = [prompt_stage1, video_stage1]
    run_button_stage2.click(fn=enhance, inputs=inputs_v2v, outputs=video_stage2,)


# Start the app. Local runs disable the open API on the queue, honour the
# public-access CLI setting and print the third element returned by launch();
# the hosted Space uses a bounded queue and debug mode.
if not on_huggingspace:
    local_demo = demo.queue(api_open=False)
    _, _, link = local_demo.launch(share=args.public_access)
    print(link)
else:
    demo.queue(max_size=20)
    demo.launch(debug=True)