diff --git a/README.md b/README.md index 471fd9ae0f008f8b464455f4a189e7c0fbe5c1fa..2b8799fc69a33f51fa29573df1adcd9db9406b21 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ --- -title: ComfyUI-A10 -emoji: 👕👔👚 -colorFrom: yellow -colorTo: red -sdk: docker -sdk_version: 4.36.1 +title: Open Sora +emoji: ⚡ +colorFrom: blue +colorTo: purple +sdk: gradio +sdk_version: 4.25.0 app_file: app.py pinned: false -license: cc-by-nc-sa-4.0 +license: apache-2.0 --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference \ No newline at end of file +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/app.py b/app.py index 69d015015bd4db8e3fc969c0663634e49d6dda18..547958f92afdf5085e8f996061626d7fe4beaed1 100644 --- a/app.py +++ b/app.py @@ -1,311 +1,653 @@ -import gradio as gr -from PIL import Image -from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline -from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref -from src.unet_hacked_tryon import UNet2DConditionModel -from transformers import ( - CLIPImageProcessor, - CLIPVisionModelWithProjection, - CLIPTextModel, - CLIPTextModelWithProjection, -) -from diffusers import DDPMScheduler,AutoencoderKL -from typing import List +#!/usr/bin/env python +""" +This script runs a Gradio App for the Open-Sora model. -import torch +Usage: + python demo.py +""" + +import argparse +import datetime +import importlib import os -from transformers import AutoTokenizer -import numpy as np -from utils_mask import get_mask_location -from torchvision import transforms -import apply_net -from preprocess.humanparsing.run_parsing import Parsing -from preprocess.openpose.run_openpose import OpenPose -from detectron2.data.detection_utils import convert_PIL_to_numpy,_apply_exif_orientation -from torchvision.transforms.functional import to_pil_image - - -def pil_to_binary_mask(pil_image, threshold=0): - np_image = np.array(pil_image) - grayscale_image = Image.fromarray(np_image).convert("L") - binary_mask = np.array(grayscale_image) > threshold - mask = np.zeros(binary_mask.shape, dtype=np.uint8) - for i in range(binary_mask.shape[0]): - for j in range(binary_mask.shape[1]): - if binary_mask[i,j] == True : - mask[i,j] = 1 - mask = (mask*255).astype(np.uint8) - output_mask = Image.fromarray(mask) - return output_mask - - -base_path = 'yisol/IDM-VTON' -example_path = os.path.join(os.path.dirname(__file__), 'example') - -unet = UNet2DConditionModel.from_pretrained( - base_path, - subfolder="unet", - torch_dtype=torch.float16, -) -unet.requires_grad_(False) -tokenizer_one = AutoTokenizer.from_pretrained( - base_path, - subfolder="tokenizer", - revision=None, - use_fast=False, -) -tokenizer_two = AutoTokenizer.from_pretrained( - base_path, - subfolder="tokenizer_2", - revision=None, - use_fast=False, -) -noise_scheduler = DDPMScheduler.from_pretrained(base_path, subfolder="scheduler") +import subprocess +import sys +from tempfile import NamedTemporaryFile -text_encoder_one = CLIPTextModel.from_pretrained( - base_path, - subfolder="text_encoder", - torch_dtype=torch.float16, -) -text_encoder_two = CLIPTextModelWithProjection.from_pretrained( - base_path, - subfolder="text_encoder_2", - torch_dtype=torch.float16, -) -image_encoder = CLIPVisionModelWithProjection.from_pretrained( - base_path, - subfolder="image_encoder", - torch_dtype=torch.float16, - ) -vae = AutoencoderKL.from_pretrained(base_path, - subfolder="vae", - 
torch_dtype=torch.float16, -) +import spaces +import torch -# "stabilityai/stable-diffusion-xl-base-1.0", -UNet_Encoder = UNet2DConditionModel_ref.from_pretrained( - base_path, - subfolder="unet_encoder", - torch_dtype=torch.float16, -) +import gradio as gr -parsing_model = Parsing(0) -openpose_model = OpenPose(0) - -UNet_Encoder.requires_grad_(False) -image_encoder.requires_grad_(False) -vae.requires_grad_(False) -unet.requires_grad_(False) -text_encoder_one.requires_grad_(False) -text_encoder_two.requires_grad_(False) -tensor_transfrom = transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ] - ) +MODEL_TYPES = ["v1.2-stage3"] +WATERMARK_PATH = "./assets/images/watermark/watermark.png" +CONFIG_MAP = { + "v1.2-stage3": "configs/opensora-v1-2/inference/sample.py", +} +HF_STDIT_MAP = {"v1.2-stage3": "hpcai-tech/OpenSora-STDiT-v3"} + + +# ============================ +# Prepare Runtime Environment +# ============================ +def install_dependencies(enable_optimization=False): + """ + Install the required dependencies for the demo if they are not already installed. + """ + + def _is_package_available(name) -> bool: + try: + importlib.import_module(name) + return True + except (ImportError, ModuleNotFoundError): + return False + + if enable_optimization: + # install flash attention + if not _is_package_available("flash_attn"): + subprocess.run( + f"{sys.executable} -m pip install flash-attn --no-build-isolation", + env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, + shell=True, + ) + + # install apex for fused layernorm + if not _is_package_available("apex"): + subprocess.run( + f'{sys.executable} -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git', + shell=True, + ) -pipe = TryonPipeline.from_pretrained( - base_path, - unet=unet, - vae=vae, - feature_extractor= CLIPImageProcessor(), - text_encoder = text_encoder_one, - text_encoder_2 = text_encoder_two, - tokenizer = tokenizer_one, - tokenizer_2 = tokenizer_two, - scheduler = noise_scheduler, - image_encoder=image_encoder, - torch_dtype=torch.float16, -) -pipe.unet_encoder = UNet_Encoder - -def start_tryon(dict,garm_img,garment_des,is_checked,is_checked_crop,denoise_steps,seed): - device = "cuda" - - openpose_model.preprocessor.body_estimation.model.to(device) - pipe.to(device) - pipe.unet_encoder.to(device) - - garm_img= garm_img.convert("RGB").resize((768,1024)) - human_img_orig = dict["background"].convert("RGB") - - if is_checked_crop: - width, height = human_img_orig.size - target_width = int(min(width, height * (3 / 4))) - target_height = int(min(height, width * (4 / 3))) - left = (width - target_width) / 2 - top = (height - target_height) / 2 - right = (width + target_width) / 2 - bottom = (height + target_height) / 2 - cropped_img = human_img_orig.crop((left, top, right, bottom)) - crop_size = cropped_img.size - human_img = cropped_img.resize((768,1024)) - else: - human_img = human_img_orig.resize((768,1024)) + # install ninja + if not _is_package_available("ninja"): + subprocess.run(f"{sys.executable} -m pip install ninja", shell=True) + # install xformers + if not _is_package_available("xformers"): + subprocess.run( + f"{sys.executable} -m pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers", + shell=True, + ) - if is_checked: - keypoints = openpose_model(human_img.resize((384,512))) - model_parse, 
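Each optional dependency above is installed at runtime only when it cannot be imported, always through the current interpreter (`sys.executable`). A minimal sketch of that guarded-install pattern, using a hypothetical `ensure_package` helper (not part of this patch), in case further optional packages need the same treatment:

    import importlib
    import subprocess
    import sys

    def ensure_package(import_name, pip_spec=None):
        """Install `pip_spec` (defaults to `import_name`) only when `import_name` is missing."""
        try:
            importlib.import_module(import_name)
        except (ImportError, ModuleNotFoundError):
            # use the running interpreter's pip so the package lands in the same environment
            subprocess.run(f"{sys.executable} -m pip install {pip_spec or import_name}", shell=True)

    # example mirroring the helper above:
    # ensure_package("ninja")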
_ = parsing_model(human_img.resize((384,512))) - mask, mask_gray = get_mask_location('hd', "upper_body", model_parse, keypoints) - mask = mask.resize((768,1024)) - else: - mask = pil_to_binary_mask(dict['layers'][0].convert("RGB").resize((768, 1024))) - # mask = transforms.ToTensor()(mask) - # mask = mask.unsqueeze(0) - mask_gray = (1-transforms.ToTensor()(mask)) * tensor_transfrom(human_img) - mask_gray = to_pil_image((mask_gray+1.0)/2.0) - - - human_img_arg = _apply_exif_orientation(human_img.resize((384,512))) - human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR") - - - - args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', 'cuda')) - # verbosity = getattr(args, "verbosity", None) - pose_img = args.func(args,human_img_arg) - pose_img = pose_img[:,:,::-1] - pose_img = Image.fromarray(pose_img).resize((768,1024)) - - with torch.no_grad(): - # Extract the images - with torch.cuda.amp.autocast(): - with torch.no_grad(): - prompt = "model is wearing " + garment_des - negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality" - with torch.inference_mode(): - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = pipe.encode_prompt( - prompt, - num_images_per_prompt=1, - do_classifier_free_guidance=True, - negative_prompt=negative_prompt, - ) - - prompt = "a photo of " + garment_des - negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality" - if not isinstance(prompt, List): - prompt = [prompt] * 1 - if not isinstance(negative_prompt, List): - negative_prompt = [negative_prompt] * 1 - with torch.inference_mode(): - ( - prompt_embeds_c, - _, - _, - _, - ) = pipe.encode_prompt( - prompt, - num_images_per_prompt=1, - do_classifier_free_guidance=False, - negative_prompt=negative_prompt, - ) +# ============================ +# Model-related +# ============================ +def read_config(config_path): + """ + Read the configuration file. 
+ """ + from mmengine.config import Config + return Config.fromfile(config_path) - pose_img = tensor_transfrom(pose_img).unsqueeze(0).to(device,torch.float16) - garm_tensor = tensor_transfrom(garm_img).unsqueeze(0).to(device,torch.float16) - generator = torch.Generator(device).manual_seed(seed) if seed is not None else None - pipe.enable_vae_slicing() - pipe.enable_xformers_memory_efficient_attention() - pipe.enable_vae_tiling() - images = pipe( - prompt_embeds=prompt_embeds.to(device,torch.float16), - negative_prompt_embeds=negative_prompt_embeds.to(device,torch.float16), - pooled_prompt_embeds=pooled_prompt_embeds.to(device,torch.float16), - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device,torch.float16), - num_inference_steps=denoise_steps, - generator=generator, - strength = 1.0, - pose_img = pose_img.to(device,torch.float16), - text_embeds_cloth=prompt_embeds_c.to(device,torch.float16), - cloth = garm_tensor.to(device,torch.float16), - mask_image=mask, - image=human_img, - height=1024, - width=768, - ip_adapter_image = garm_img.resize((768,1024)), - guidance_scale=2.0, - )[0] - - if is_checked_crop: - out_img = images[0].resize(crop_size) - human_img_orig.paste(out_img, (int(left), int(top))) - return human_img_orig, mask_gray - else: - return images[0], mask_gray - # return images[0], mask_gray - -garm_list = os.listdir(os.path.join(example_path,"cloth")) -garm_list_path = [os.path.join(example_path,"cloth",garm) for garm in garm_list] - -human_list = os.listdir(os.path.join(example_path,"human")) -human_list_path = [os.path.join(example_path,"human",human) for human in human_list] - -human_ex_list = [] -for ex_human in human_list_path: - ex_dict= {} - ex_dict['background'] = ex_human - ex_dict['layers'] = None - ex_dict['composite'] = None - human_ex_list.append(ex_dict) - -##default human - -image_blocks = gr.Blocks().queue() -with image_blocks as demo: - with gr.Row(): - with gr.Column(): - imgs = gr.ImageEditor(sources='upload', type="pil", label='Human. Mask with pen or use auto-masking', interactive=True) - with gr.Row(): - is_checked = gr.Checkbox(label="Yes", info="Use auto-generated mask (Takes 5 seconds)",value=True) - with gr.Row(): - is_checked_crop = gr.Checkbox(label="Yes", info="Use auto-crop & resizing",value=False) - - example = gr.Examples( - inputs=imgs, - examples_per_page=10, - examples=human_ex_list - ) - with gr.Column(): - garm_img = gr.Image(label="Garment", sources='upload', type="pil") - with gr.Row(elem_id="prompt-container"): - with gr.Row(): - prompt = gr.Textbox(placeholder="Description of garment ex) Short Sleeve Round Neck T-shirts", show_label=False, elem_id="prompt") - example = gr.Examples( - inputs=garm_img, - examples_per_page=8, - examples=garm_list_path) - with gr.Column(): - # image_out = gr.Image(label="Output", elem_id="output-img", height=400) - masked_img = gr.Image(label="Masked image output", elem_id="masked-img",show_share_button=False) - with gr.Column(): - # image_out = gr.Image(label="Output", elem_id="output-img", height=400) - image_out = gr.Image(label="Output", elem_id="output-img",show_share_button=False) +def build_models(model_type, config, enable_optimization=False): + """ + Build the models for the given model type and configuration. 
+ """ + # build vae + from opensora.registry import MODELS, build_module + vae = build_module(config.vae, MODELS).cuda() + # build text encoder + text_encoder = build_module(config.text_encoder, MODELS) # T5 must be fp32 + text_encoder.t5.model = text_encoder.t5.model.cuda() + # build stdit + # we load model from HuggingFace directly so that we don't need to + # handle model download logic in HuggingFace Space + from opensora.models.stdit.stdit3 import STDiT3 - with gr.Column(): - try_button = gr.Button(value="Try-on") - with gr.Accordion(label="Advanced Settings", open=False): - with gr.Row(): - denoise_steps = gr.Number(label="Denoising Steps", minimum=20, maximum=40, value=30, step=1) - seed = gr.Number(label="Seed", minimum=-1, maximum=2147483647, step=1, value=42) + stdit = STDiT3.from_pretrained(HF_STDIT_MAP[model_type]) + stdit = stdit.cuda() + # build scheduler + from opensora.registry import SCHEDULERS + scheduler = build_module(config.scheduler, SCHEDULERS) - try_button.click(fn=start_tryon, inputs=[imgs, garm_img, prompt, is_checked,is_checked_crop, denoise_steps, seed], outputs=[image_out,masked_img], api_name='tryon') + # hack for classifier-free guidance + text_encoder.y_embedder = stdit.y_embedder - + # move modelst to device + vae = vae.to(torch.bfloat16).eval() + text_encoder.t5.model = text_encoder.t5.model.eval() # t5 must be in fp32 + stdit = stdit.to(torch.bfloat16).eval() + + # clear cuda + torch.cuda.empty_cache() + return vae, text_encoder, stdit, scheduler -image_blocks.launch() +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model-type", + default="v1.2-stage3", + choices=MODEL_TYPES, + help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}", + ) + parser.add_argument("--output", default="./outputs", type=str, help="The path to the output folder") + parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.") + parser.add_argument("--host", default="0.0.0.0", type=str, help="The host to run the Gradio App on.") + parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.") + parser.add_argument( + "--enable-optimization", + action="store_true", + help="Whether to enable optimization such as flash attention and fused layernorm", + ) + return parser.parse_args() + + +# ============================ +# Main Gradio Script +# ============================ +# as `run_inference` needs to be wrapped by `spaces.GPU` and the input can only be the prompt text +# so we can't pass the models to `run_inference` as arguments. 
+# instead, we need to define them globally so that we can access these models inside `run_inference` + +# read config +args = parse_args() +config = read_config(CONFIG_MAP[args.model_type]) +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + +# make outputs dir +os.makedirs(args.output, exist_ok=True) + +# disable torch jit as it can cause failure in gradio SDK +# gradio sdk uses torch with cuda 11.3 +torch.jit._state.disable() + +# set up +install_dependencies(enable_optimization=args.enable_optimization) + +# import after installation +from opensora.datasets import IMG_FPS, save_sample +from opensora.datasets.aspect import get_image_size, get_num_frames +from opensora.models.text_encoder.t5 import text_preprocessing +from opensora.utils.inference_utils import ( + add_watermark, + append_generated, + append_score_to_prompts, + apply_mask_strategy, + collect_references_batch, + dframe_to_frame, + extract_json_from_prompts, + extract_prompts_loop, + get_random_prompt_by_openai, + has_openai_key, + merge_prompt, + prepare_multi_resolution_info, + refine_prompts_by_openai, + split_prompt, + has_openai_key +) +from opensora.utils.misc import to_torch_dtype + +# some global variables +dtype = to_torch_dtype(config.dtype) +device = torch.device("cuda") + +# build model +vae, text_encoder, stdit, scheduler = build_models( + args.model_type, config, enable_optimization=args.enable_optimization +) + + +def run_inference( + mode, + prompt_text, + resolution, + aspect_ratio, + length, + motion_strength, + aesthetic_score, + use_motion_strength, + use_aesthetic_score, + camera_motion, + reference_image, + refine_prompt, + fps, + num_loop, + seed, + sampling_steps, + cfg_scale, +): + if prompt_text is None or prompt_text == "": + gr.Warning("Your prompt is empty, please enter a valid prompt") + return None + + torch.manual_seed(seed) + with torch.inference_mode(): + # ====================== + # 1. 
Preparation arguments + # ====================== + # parse the inputs + # frame_interval must be 1 so we ignore it here + image_size = get_image_size(resolution, aspect_ratio) + + # compute generation parameters + if mode == "Text2Image": + num_frames = 1 + fps = IMG_FPS + else: + num_frames = config.num_frames + num_frames = get_num_frames(length) + + condition_frame_length = int(num_frames / 17 * 5 / 3) + condition_frame_edit = 0.0 + + input_size = (num_frames, *image_size) + latent_size = vae.get_latent_size(input_size) + multi_resolution = "OpenSora" + align = 5 + + # == prepare mask strategy == + if mode == "Text2Image": + mask_strategy = [None] + elif mode == "Text2Video": + if reference_image is not None: + mask_strategy = ["0"] + else: + mask_strategy = [None] + else: + raise ValueError(f"Invalid mode: {mode}") + + # == prepare reference == + if mode == "Text2Image": + refs = [""] + elif mode == "Text2Video": + if reference_image is not None: + # save image to disk + from PIL import Image + + im = Image.fromarray(reference_image) + temp_file = NamedTemporaryFile(suffix=".png") + im.save(temp_file.name) + refs = [temp_file.name] + else: + refs = [""] + else: + raise ValueError(f"Invalid mode: {mode}") + + # == get json from prompts == + batch_prompts = [prompt_text] + batch_prompts, refs, mask_strategy = extract_json_from_prompts(batch_prompts, refs, mask_strategy) + + # == get reference for condition == + refs = collect_references_batch(refs, vae, image_size) + + # == multi-resolution info == + model_args = prepare_multi_resolution_info( + multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype + ) + + # == process prompts step by step == + # 0. split prompt + # each element in the list is [prompt_segment_list, loop_idx_list] + batched_prompt_segment_list = [] + batched_loop_idx_list = [] + for prompt in batch_prompts: + prompt_segment_list, loop_idx_list = split_prompt(prompt) + batched_prompt_segment_list.append(prompt_segment_list) + batched_loop_idx_list.append(loop_idx_list) + + # 1. refine prompt by openai + if refine_prompt: + # check if openai key is provided + if not has_openai_key(): + gr.Warning("OpenAI API key is not provided, the prompt will not be enhanced.") + else: + for idx, prompt_segment_list in enumerate(batched_prompt_segment_list): + batched_prompt_segment_list[idx] = refine_prompts_by_openai(prompt_segment_list) + + # process scores + aesthetic_score = aesthetic_score if use_aesthetic_score else None + motion_strength = motion_strength if use_motion_strength and mode != "Text2Image" else None + camera_motion = None if camera_motion == "none" or mode == "Text2Image" else camera_motion + # 2. append score + for idx, prompt_segment_list in enumerate(batched_prompt_segment_list): + batched_prompt_segment_list[idx] = append_score_to_prompts( + prompt_segment_list, + aes=aesthetic_score, + flow=motion_strength, + camera_motion=camera_motion, + ) + + # 3. clean prompt with T5 + for idx, prompt_segment_list in enumerate(batched_prompt_segment_list): + batched_prompt_segment_list[idx] = [text_preprocessing(prompt) for prompt in prompt_segment_list] + + # 4. 
merge to obtain the final prompt + batch_prompts = [] + for prompt_segment_list, loop_idx_list in zip(batched_prompt_segment_list, batched_loop_idx_list): + batch_prompts.append(merge_prompt(prompt_segment_list, loop_idx_list)) + + # ========================= + # Generate image/video + # ========================= + video_clips = [] + + for loop_i in range(num_loop): + # 4.4 sample in hidden space + batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i) + + # == loop == + if loop_i > 0: + refs, mask_strategy = append_generated( + vae, video_clips[-1], refs, mask_strategy, loop_i, condition_frame_length, condition_frame_edit + ) + + # == sampling == + z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype) + masks = apply_mask_strategy(z, refs, mask_strategy, loop_i, align=align) + + # 4.6. diffusion sampling + # hack to update num_sampling_steps and cfg_scale + scheduler_kwargs = config.scheduler.copy() + scheduler_kwargs.pop("type") + scheduler_kwargs["num_sampling_steps"] = sampling_steps + scheduler_kwargs["cfg_scale"] = cfg_scale + + scheduler.__init__(**scheduler_kwargs) + samples = scheduler.sample( + stdit, + text_encoder, + z=z, + prompts=batch_prompts_loop, + device=device, + additional_args=model_args, + progress=True, + mask=masks, + ) + samples = vae.decode(samples.to(dtype), num_frames=num_frames) + video_clips.append(samples) + + # ========================= + # Save output + # ========================= + video_clips = [val[0] for val in video_clips] + for i in range(1, num_loop): + video_clips[i] = video_clips[i][:, dframe_to_frame(condition_frame_length) :] + video = torch.cat(video_clips, dim=1) + current_datetime = datetime.datetime.now() + timestamp = current_datetime.timestamp() + save_path = os.path.join(args.output, f"output_{timestamp}") + saved_path = save_sample(video, save_path=save_path, fps=24) + torch.cuda.empty_cache() + + # add watermark + # all watermarked videos should have a _watermarked suffix + if mode != "Text2Image" and os.path.exists(WATERMARK_PATH): + watermarked_path = saved_path.replace(".mp4", "_watermarked.mp4") + success = add_watermark(saved_path, WATERMARK_PATH, watermarked_path) + if success: + return watermarked_path + else: + return saved_path + else: + return saved_path + + +@spaces.GPU() +def run_image_inference( + prompt_text, + resolution, + aspect_ratio, + length, + motion_strength, + aesthetic_score, + use_motion_strength, + use_aesthetic_score, + camera_motion, + reference_image, + refine_prompt, + fps, + num_loop, + seed, + sampling_steps, + cfg_scale, +): + return run_inference( + "Text2Image", + prompt_text, + resolution, + aspect_ratio, + length, + motion_strength, + aesthetic_score, + use_motion_strength, + use_aesthetic_score, + camera_motion, + reference_image, + refine_prompt, + fps, + num_loop, + seed, + sampling_steps, + cfg_scale, + ) + + +@spaces.GPU(duration=200) +def run_video_inference( + prompt_text, + resolution, + aspect_ratio, + length, + motion_strength, + aesthetic_score, + use_motion_strength, + use_aesthetic_score, + camera_motion, + reference_image, + refine_prompt, + fps, + num_loop, + seed, + sampling_steps, + cfg_scale, +): + # if (resolution == "480p" and length == "16s") or \ + # (resolution == "720p" and length in ["8s", "16s"]): + # gr.Warning("Generation is interrupted as the combination of 480p and 16s will lead to CUDA out of memory") + # else: + return run_inference( + "Text2Video", + prompt_text, + resolution, + aspect_ratio, + length, + 
motion_strength, + aesthetic_score, + use_motion_strength, + use_aesthetic_score, + camera_motion, + reference_image, + refine_prompt, + fps, + num_loop, + seed, + sampling_steps, + cfg_scale, + ) + + +def generate_random_prompt(): + if "OPENAI_API_KEY" not in os.environ: + gr.Warning("Your prompt is empty and the OpenAI API key is not provided, please enter a valid prompt") + return None + else: + prompt_text = get_random_prompt_by_openai() + return prompt_text + + +def main(): + # create demo + with gr.Blocks() as demo: + with gr.Row(): + with gr.Column(): + gr.HTML( + """ +
+                        <div style="text-align: center;">
+                            <h2>Open-Sora: Democratizing Efficient Video Production for All</h2>
+                        </div>
+ """ + ) + + with gr.Row(): + with gr.Column(): + prompt_text = gr.Textbox(label="Prompt", placeholder="Describe your video here", lines=4) + refine_prompt = gr.Checkbox(value=has_openai_key(), label="Refine prompt with GPT4o", interactive=has_openai_key()) + random_prompt_btn = gr.Button("Random Prompt By GPT4o", interactive=has_openai_key()) + + gr.Markdown("## Basic Settings") + resolution = gr.Radio( + choices=["144p", "240p", "360p", "480p", "720p"], + value="240p", + label="Resolution", + ) + aspect_ratio = gr.Radio( + choices=["9:16", "16:9", "3:4", "4:3", "1:1"], + value="9:16", + label="Aspect Ratio (H:W)", + ) + length = gr.Radio( + choices=["2s", "4s", "8s", "16s"], + value="2s", + label="Video Length", + info="only effective for video generation, 8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time.", + ) + + with gr.Row(): + seed = gr.Slider(value=1024, minimum=1, maximum=2048, step=1, label="Seed") + + sampling_steps = gr.Slider(value=30, minimum=1, maximum=200, step=1, label="Sampling steps") + cfg_scale = gr.Slider(value=7.0, minimum=0.0, maximum=10.0, step=0.1, label="CFG Scale") + + with gr.Row(): + with gr.Column(): + motion_strength = gr.Slider( + value=5, + minimum=0, + maximum=100, + step=1, + label="Motion Strength", + info="only effective for video generation", + ) + use_motion_strength = gr.Checkbox(value=False, label="Enable") + + with gr.Column(): + aesthetic_score = gr.Slider( + value=6.5, + minimum=4, + maximum=7, + step=0.1, + label="Aesthetic", + info="effective for text & video generation", + ) + use_aesthetic_score = gr.Checkbox(value=True, label="Enable") + + camera_motion = gr.Radio( + value="none", + label="Camera Motion", + choices=["none", "pan right", "pan left", "tilt up", "tilt down", "zoom in", "zoom out", "static"], + interactive=True, + ) + + gr.Markdown("## Advanced Settings") + with gr.Row(): + fps = gr.Slider( + value=24, + minimum=1, + maximum=60, + step=1, + label="FPS", + info="This is the frames per seconds for video generation, keep it to 24 if you are not sure", + ) + num_loop = gr.Slider( + value=1, + minimum=1, + maximum=20, + step=1, + label="Number of Loops", + info="This will change the length of the generated video, keep it to 1 if you are not sure", + ) + gr.Markdown("## Reference Image") + reference_image = gr.Image(label="Image (optional)", show_download_button=True) + + with gr.Column(): + output_video = gr.Video(label="Output Video", height="100%") + + with gr.Row(): + image_gen_button = gr.Button("Generate image") + video_gen_button = gr.Button("Generate video") + + image_gen_button.click( + fn=run_image_inference, + inputs=[ + prompt_text, + resolution, + aspect_ratio, + length, + motion_strength, + aesthetic_score, + use_motion_strength, + use_aesthetic_score, + camera_motion, + reference_image, + refine_prompt, + fps, + num_loop, + seed, + sampling_steps, + cfg_scale, + ], + outputs=reference_image, + ) + video_gen_button.click( + fn=run_video_inference, + inputs=[ + prompt_text, + resolution, + aspect_ratio, + length, + motion_strength, + aesthetic_score, + use_motion_strength, + use_aesthetic_score, + camera_motion, + reference_image, + refine_prompt, + fps, + num_loop, + seed, + sampling_steps, + cfg_scale, + ], + outputs=output_video, + ) + random_prompt_btn.click(fn=generate_random_prompt, outputs=prompt_text) + + # launch + demo.launch(server_port=args.port, server_name=args.host, share=args.share) + + +if __name__ == "__main__": + main() diff --git 
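For reference, `parse_args` above exposes `--model-type`, `--output`, `--port`, `--host`, `--share`, and `--enable-optimization`. A typical local launch might look like the line below; the entry-point name follows the Space config (`app_file: app.py`) and the port value is only an example, since `--port` defaults to None:

    python app.py --model-type v1.2-stage3 --host 0.0.0.0 --port 7860 --share --enable-optimization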
a/configs/dit/inference/16x256x256.py b/configs/dit/inference/16x256x256.py index ccb1d796824c0b459b569e44d5ab66543814d748..44818fe095f5f16f960d5e7d0c7f974076aaeaa7 100644 --- a/configs/dit/inference/16x256x256.py +++ b/configs/dit/inference/16x256x256.py @@ -22,10 +22,10 @@ scheduler = dict( num_sampling_steps=20, cfg_scale=4.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/ucf101_labels.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/dit/inference/1x256x256-class.py b/configs/dit/inference/1x256x256-class.py index 24d1c8af390a408bf3d43ef4cd9c87d18d3fea2b..bebaa11e286db0ea7968723909482e18f28a12c3 100644 --- a/configs/dit/inference/1x256x256-class.py +++ b/configs/dit/inference/1x256x256-class.py @@ -22,10 +22,10 @@ scheduler = dict( num_sampling_steps=20, cfg_scale=4.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/imagenet_id.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/dit/inference/1x256x256.py b/configs/dit/inference/1x256x256.py index 31a5b9f1f2f315b19b528b2c4b98cfeb8b213c58..e7cb9a2d20e6ae3a19e468f493f0e125cbb0a33f 100644 --- a/configs/dit/inference/1x256x256.py +++ b/configs/dit/inference/1x256x256.py @@ -23,10 +23,10 @@ scheduler = dict( num_sampling_steps=20, cfg_scale=4.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/imagenet_labels.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/dit/train/16x256x256.py b/configs/dit/train/16x256x256.py index af8ee8768af253ee124e2679706ea4320bb97def..c36e06b65f577d1faa1231886273167d2a611926 100644 --- a/configs/dit/train/16x256x256.py +++ b/configs/dit/train/16x256x256.py @@ -1,16 +1,16 @@ -num_frames = 16 -frame_interval = 3 -image_size = (256, 256) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(256, 256), +) # Define acceleration +num_workers = 4 dtype = "bf16" -grad_checkpoint = False +grad_checkpoint = True plugin = "zero2" sp_size = 1 @@ -18,7 +18,7 @@ sp_size = 1 model = dict( type="DiT-XL/2", from_pretrained="DiT-XL-2-256x256.pt", - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( diff --git a/configs/dit/train/1x256x256.py b/configs/dit/train/1x256x256.py index 667e0a835652d25c41fbd1d7947e65291972f49c..aa5d478d00584cef2188a578048ff0b3dd6990ba 100644 --- a/configs/dit/train/1x256x256.py +++ b/configs/dit/train/1x256x256.py @@ -1,14 +1,15 @@ -num_frames = 1 -frame_interval = 1 -image_size = (256, 256) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = True -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=1, + frame_interval=1, + image_size=(256, 256), + transform_name="center", +) # Define acceleration +num_workers = 4 dtype = "bf16" grad_checkpoint = False plugin = "zero2" @@ -18,7 +19,7 @@ sp_size = 1 model = dict( type="DiT-XL/2", no_temporal_pos_emb=True, - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( diff --git a/configs/latte/inference/16x256x256-class.py b/configs/latte/inference/16x256x256-class.py index c46f4bc362f60effbb80c74e4cea3662d39302a1..8ccf6d43604240e724f0e78f2de3aefa85449277 100644 --- 
a/configs/latte/inference/16x256x256-class.py +++ b/configs/latte/inference/16x256x256-class.py @@ -21,10 +21,10 @@ scheduler = dict( num_sampling_steps=20, cfg_scale=4.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/ucf101_id.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/latte/inference/16x256x256.py b/configs/latte/inference/16x256x256.py index cb502371d39b9324084bcda151d0a168e69fafaf..6bdd58fad5f81bcca29c2d975fd2dd89a4bf7c58 100644 --- a/configs/latte/inference/16x256x256.py +++ b/configs/latte/inference/16x256x256.py @@ -22,10 +22,10 @@ scheduler = dict( num_sampling_steps=20, cfg_scale=4.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/ucf101_labels.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/latte/train/16x256x256.py b/configs/latte/train/16x256x256.py index 0bf6bd4126c8517d526c2af1b75d5af8a1660df0..e087f8a99638a5d5036af94d5b6cecc80a867bc3 100644 --- a/configs/latte/train/16x256x256.py +++ b/configs/latte/train/16x256x256.py @@ -1,14 +1,14 @@ -num_frames = 16 -frame_interval = 3 -image_size = (256, 256) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(256, 256), +) # Define acceleration +num_workers = 4 dtype = "bf16" grad_checkpoint = True plugin = "zero2" @@ -17,7 +17,7 @@ sp_size = 1 # Define model model = dict( type="Latte-XL/2", - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( diff --git a/configs/opensora-v1-1/inference/sample-ref.py b/configs/opensora-v1-1/inference/sample-ref.py new file mode 100644 index 0000000000000000000000000000000000000000..ae80774f3c6d675347f9952cfd8fc8ae02820526 --- /dev/null +++ b/configs/opensora-v1-1/inference/sample-ref.py @@ -0,0 +1,64 @@ +num_frames = 16 +frame_interval = 3 +fps = 24 +image_size = (240, 426) +multi_resolution = "STDiT2" + +# Condition +prompt_path = None +prompt = [ + 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. {"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}', + 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png","mask_strategy": "0"}', + 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,-8,0,8"}', + 'A snowy forest.{"reference_path": "https://cdn.pixabay.com/video/2021/04/25/72171-542991404_large.mp4","mask_strategy": "0,0,0,0,15,0.8"}', + 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}', + '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. 
Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,15"}', +] + +loop = 2 +condition_frame_length = 4 +# ( +# loop id, [the loop index of the condition image or video] +# reference id, [the index of the condition image or video in the reference_path] +# reference start, [the start frame of the condition image or video] +# target start, [the location to insert] +# length, [the number of frames to insert] +# edit_ratio [the edit rate of the condition image or video] +# ) +# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details +# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples + +# Define model +model = dict( + type="STDiT2-XL/2", + from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", + input_sq_size=512, + qk_norm=True, + qk_norm_legacy=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + cache_dir=None, # "/mnt/hdd/cached_models", + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + cache_dir=None, # "/mnt/hdd/cached_models", + model_max_length=200, +) +scheduler = dict( + type="iddpm", + num_sampling_steps=100, + cfg_scale=7.0, + cfg_channel=3, # or None +) +dtype = "bf16" + +# Others +batch_size = 1 +seed = 42 +save_dir = "./samples/samples/" diff --git a/configs/opensora-v1-1/inference/sample.py b/configs/opensora-v1-1/inference/sample.py new file mode 100644 index 0000000000000000000000000000000000000000..c2800466c67caaa30b889ee6977e57a08ae5dbe9 --- /dev/null +++ b/configs/opensora-v1-1/inference/sample.py @@ -0,0 +1,44 @@ +num_frames = 16 +frame_interval = 3 +fps = 24 +image_size = (240, 426) +multi_resolution = "STDiT2" + +# Define model +model = dict( + type="STDiT2-XL/2", + from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", + input_sq_size=512, + qk_norm=True, + qk_norm_legacy=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + cache_dir=None, # "/mnt/hdd/cached_models", + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + cache_dir=None, # "/mnt/hdd/cached_models", + model_max_length=200, +) +scheduler = dict( + type="iddpm", + num_sampling_steps=100, + cfg_scale=7.0, + cfg_channel=3, # or None +) +dtype = "bf16" + +# Condition +prompt_path = "./assets/texts/t2v_samples.txt" +prompt = None # prompt has higher priority than prompt_path + +# Others +batch_size = 1 +seed = 42 +save_dir = "./samples/samples/" diff --git a/configs/opensora-v1-1/train/benchmark.py b/configs/opensora-v1-1/train/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..da6cc0633af7f81316d52c53321700907e91ec2c --- /dev/null +++ b/configs/opensora-v1-1/train/benchmark.py @@ -0,0 +1,102 @@ +# this file is only for batch size search and is not used for training + +# Define dataset +dataset = dict( + type="VariableVideoTextDataset", + data_path=None, + num_frames=None, + frame_interval=3, + image_size=(None, None), + transform_name="resize_crop", +) + +# bucket config format: +# 1. { resolution: {num_frames: (prob, batch_size)} }, in this case batch_size is ignored when searching +# 2. 
{ resolution: {num_frames: (prob, (max_batch_size, ))} }, batch_size is searched in the range [batch_size_start, max_batch_size), batch_size_start is configured via CLI +# 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) +# 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) with step_size (grid search) +# 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used + +bucket_config = { + # == manual search == + # "240p": {128: (1.0, 2)}, # 4.28s/it + # "240p": {64: (1.0, 4)}, + # "240p": {32: (1.0, 8)}, # 4.6s/it + # "240p": {16: (1.0, 16)}, # 4.6s/it + # "480p": {16: (1.0, 4)}, # 4.6s/it + # "720p": {16: (1.0, 2)}, # 5.89s/it + # "256": {1: (1.0, 256)}, # 4.5s/it + # "512": {1: (1.0, 96)}, # 4.7s/it + # "512": {1: (1.0, 128)}, # 6.3s/it + # "480p": {1: (1.0, 50)}, # 4.0s/it + # "1024": {1: (1.0, 32)}, # 6.8s/it + # "1024": {1: (1.0, 20)}, # 4.3s/it + # "1080p": {1: (1.0, 16)}, # 8.6s/it + # "1080p": {1: (1.0, 8)}, # 4.4s/it + # == stage 2 == + # "240p": { + # 16: (1.0, (2, 32)), + # 32: (1.0, (2, 16)), + # 64: (1.0, (2, 8)), + # 128: (1.0, (2, 6)), + # }, + # "256": {1: (1.0, (128, 300))}, + # "512": {1: (0.5, (64, 128))}, + # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)}, + # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)}, # No examples now + # "1024": {1: (0.3, (8, 64))}, + # "1080p": {1: (0.3, (2, 32))}, + # == stage 3 == + "720p": {1: (20, 40), 32: (0.5, (2, 4)), 64: (0.5, (1, 1))}, +} + + +# Define acceleration +num_workers = 4 +num_bucket_build_workers = 16 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="STDiT2-XL/2", + from_pretrained=None, + input_sq_size=512, # pretrained model is trained on 512x512 + qk_norm=True, + qk_norm_legacy=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=4, + local_files_only=True, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=200, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = None +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora-v1-1/train/image.py b/configs/opensora-v1-1/train/image.py new file mode 100644 index 0000000000000000000000000000000000000000..ee43c2ee3a57855a3d443995dda9fd7a07cd7f69 --- /dev/null +++ b/configs/opensora-v1-1/train/image.py @@ -0,0 +1,66 @@ +# Define dataset +dataset = dict( + type="VariableVideoTextDataset", + data_path=None, + num_frames=None, + frame_interval=3, + image_size=(None, None), + transform_name="resize_crop", +) +bucket_config = { # 6s/it + "256": {1: (1.0, 256)}, + "512": {1: (1.0, 80)}, + "480p": {1: (1.0, 52)}, + "1024": {1: (1.0, 20)}, + "1080p": {1: (1.0, 8)}, +} + +# Define acceleration +num_workers = 4 +num_bucket_build_workers = 16 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="STDiT2-XL/2", + from_pretrained=None, + input_sq_size=512, # pretrained model is trained on 512x512 + qk_norm=True, + qk_norm_legacy=True, + enable_flash_attn=True, + 
enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=4, + local_files_only=True, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=200, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 500 +load = None + +batch_size = 10 # only for logging +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora-v1-1/train/image_rflow.py b/configs/opensora-v1-1/train/image_rflow.py new file mode 100644 index 0000000000000000000000000000000000000000..08d52efb47ca3400546186bfe060ee5d9b6327a8 --- /dev/null +++ b/configs/opensora-v1-1/train/image_rflow.py @@ -0,0 +1,88 @@ +# Define dataset +# dataset = dict( +# type="VariableVideoTextDataset", +# data_path=None, +# num_frames=None, +# frame_interval=3, +# image_size=(None, None), +# transform_name="resize_crop", +# ) +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=1, + frame_interval=1, + image_size=(256, 256), + transform_name="center", +) +bucket_config = { # 6s/it + "256": {1: (1.0, 256)}, + "512": {1: (1.0, 80)}, + "480p": {1: (1.0, 52)}, + "1024": {1: (1.0, 20)}, + "1080p": {1: (1.0, 8)}, +} + +# Define acceleration +num_workers = 16 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +# model = dict( +# type="DiT-XL/2", +# from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth", +# # input_sq_size=512, # pretrained model is trained on 512x512 +# enable_flash_attn=True, +# enable_layernorm_kernel=True, +# ) +model = dict( + type="PixArt-XL/2", + space_scale=1.0, + time_scale=1.0, + no_temporal_pos_emb=True, + from_pretrained="PixArt-XL-2-512x512.pth", + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +# model = dict( +# type="DiT-XL/2", +# # space_scale=1.0, +# # time_scale=1.0, +# no_temporal_pos_emb=True, +# # from_pretrained="PixArt-XL-2-512x512.pth", +# from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth", +# enable_flash_attn=True, +# enable_layernorm_kernel=True, +# ) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=200, + shardformer=True, +) +scheduler = dict( + type="rflow", + # timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 10 +log_every = 10 +ckpt_every = 500 +load = None + +batch_size = 100 # only for logging +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora-v1-1/train/stage1.py b/configs/opensora-v1-1/train/stage1.py new file mode 100644 index 0000000000000000000000000000000000000000..dfba99666cf5991630a39e6f73895defea95ca17 --- /dev/null +++ b/configs/opensora-v1-1/train/stage1.py @@ -0,0 +1,78 @@ +# Define dataset +dataset = dict( + type="VariableVideoTextDataset", + data_path=None, + num_frames=None, + frame_interval=3, + image_size=(None, None), + transform_name="resize_crop", +) +# IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%) +bucket_config = { # 1s/it + "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)}, + "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)}, + "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)}, + "512": {1: (0.4, 
12)}, + "1024": {1: (0.3, 3)}, +} +mask_ratios = { + "identity": 0.75, + "quarter_random": 0.025, + "quarter_head": 0.025, + "quarter_tail": 0.025, + "quarter_head_tail": 0.05, + "image_random": 0.025, + "image_head": 0.025, + "image_tail": 0.025, + "image_head_tail": 0.05, +} + +# Define acceleration +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +grad_checkpoint = False +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="STDiT2-XL/2", + from_pretrained=None, + input_sq_size=512, # pretrained model is trained on 512x512 + qk_norm=True, + qk_norm_legacy=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=4, + local_files_only=True, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=200, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 500 +load = None + +batch_size = None +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora-v1-1/train/stage2.py b/configs/opensora-v1-1/train/stage2.py new file mode 100644 index 0000000000000000000000000000000000000000..ce884aa8099d3590dd8770db763c42624810cf81 --- /dev/null +++ b/configs/opensora-v1-1/train/stage2.py @@ -0,0 +1,80 @@ +# Define dataset +dataset = dict( + type="VariableVideoTextDataset", + data_path=None, + num_frames=None, + frame_interval=3, + image_size=(None, None), + transform_name="resize_crop", +) +bucket_config = { # 7s/it + "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)}, + "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)}, + "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)}, + "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)}, + "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)}, + "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)}, + "1024": {1: (0.3, 20)}, + "1080p": {1: (0.4, 8)}, +} +mask_ratios = { + "identity": 0.75, + "quarter_random": 0.025, + "quarter_head": 0.025, + "quarter_tail": 0.025, + "quarter_head_tail": 0.05, + "image_random": 0.025, + "image_head": 0.025, + "image_tail": 0.025, + "image_head_tail": 0.05, +} + +# Define acceleration +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="STDiT2-XL/2", + from_pretrained=None, + input_sq_size=512, # pretrained model is trained on 512x512 + qk_norm=True, + qk_norm_legacy=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=4, + local_files_only=True, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=200, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 500 +load = None + +batch_size = None +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora-v1-1/train/stage3.py b/configs/opensora-v1-1/train/stage3.py new file mode 100644 index 0000000000000000000000000000000000000000..3ccc8a6bbc632889c8ef05e32cb8a6001f61bf14 --- /dev/null 
+++ b/configs/opensora-v1-1/train/stage3.py @@ -0,0 +1,80 @@ +# Define dataset +dataset = dict( + type="VariableVideoTextDataset", + data_path=None, + num_frames=None, + frame_interval=3, + image_size=(None, None), + transform_name="resize_crop", +) +bucket_config = { # 13s/it + "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)}, + "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)}, + "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)}, + "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)}, + "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)}, + "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)}, + "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)}, + "1024": {1: (0.3, 40)}, +} +mask_ratios = { + "identity": 0.75, + "quarter_random": 0.025, + "quarter_head": 0.025, + "quarter_tail": 0.025, + "quarter_head_tail": 0.05, + "image_random": 0.025, + "image_head": 0.025, + "image_tail": 0.025, + "image_head_tail": 0.05, +} + +# Define acceleration +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="STDiT2-XL/2", + from_pretrained=None, + input_sq_size=512, # pretrained model is trained on 512x512 + qk_norm=True, + qk_norm_legacy=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=4, + local_files_only=True, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=200, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 500 +load = None + +batch_size = None +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora-v1-1/train/video.py b/configs/opensora-v1-1/train/video.py new file mode 100644 index 0000000000000000000000000000000000000000..0a068a53ecf0897e0ab7d3f32e18392b7d4ab16d --- /dev/null +++ b/configs/opensora-v1-1/train/video.py @@ -0,0 +1,68 @@ +# Define dataset +dataset = dict( + type="VariableVideoTextDataset", + data_path=None, + num_frames=None, + frame_interval=3, + image_size=(None, None), + transform_name="resize_crop", +) +bucket_config = { # 6s/it + "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}, + "256": {1: (1.0, 256)}, + "512": {1: (0.5, 80)}, + "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)}, + "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now + "1024": {1: (0.3, 20)}, + "1080p": {1: (0.3, 8)}, +} + +# Define acceleration +num_workers = 4 +num_bucket_build_workers = 16 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="STDiT2-XL/2", + from_pretrained=None, + input_sq_size=512, # pretrained model is trained on 512x512 + qk_norm=True, + qk_norm_legacy=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=4, + local_files_only=True, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=200, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="iddpm", + 
timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 500 +load = None + +batch_size = 10 # only for logging +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora-v1-2/inference/sample.py b/configs/opensora-v1-2/inference/sample.py new file mode 100644 index 0000000000000000000000000000000000000000..3e2c623ea181de445a4ee89e865e2ff134e47461 --- /dev/null +++ b/configs/opensora-v1-2/inference/sample.py @@ -0,0 +1,42 @@ +resolution = "240p" +aspect_ratio = "9:16" +num_frames = 51 +fps = 24 +frame_interval = 1 +save_fps = 24 + +save_dir = "./samples/samples/" +seed = 42 +batch_size = 1 +multi_resolution = "STDiT2" +dtype = "bf16" +condition_frame_length = 5 +align = 5 + +model = dict( + type="STDiT3-XL/2", + from_pretrained="hpcai-tech/OpenSora-STDiT-v3", + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", + micro_frame_size=17, + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, +) +scheduler = dict( + type="rflow", + use_timestep_transform=True, + num_sampling_steps=30, + cfg_scale=7.0, +) + +aes = 6.5 +flow = None diff --git a/configs/opensora-v1-2/misc/bs.py b/configs/opensora-v1-2/misc/bs.py new file mode 100644 index 0000000000000000000000000000000000000000..77e7f503208755610edf69c32c18aa644459495c --- /dev/null +++ b/configs/opensora-v1-2/misc/bs.py @@ -0,0 +1,117 @@ +# Dataset settings +dataset = dict( + type="VariableVideoTextDataset", + transform_name="resize_crop", +) + +# == Config 1: Webvid == +# base: (512, 408), 12s/it +grad_checkpoint = True +base = ("512", "408") +base_step_time = 12 +bucket_config = { + "144p": { + 1: (475, 0), + 51: (51, 0), + 102: (27, 0), + 204: (13, 0), + 408: (6, 0), + }, + # --- + "240p": { + 1: (297, 200), # 8.25 + 51: (20, 0), + 102: (10, 0), + 204: (5, 0), + 408: (2, 0), + }, + # --- + "512": { + 1: (141, 0), + 51: (8, 0), + 102: (4, 0), + 204: (2, 0), + 408: (1, 0), + }, + # --- + "480p": { + 1: (89, 0), + 51: (5, 0), + 102: (2, 0), + 204: (1, 0), + }, + # --- + "1024": { + 1: (36, 0), + 51: (1, 0), + }, + # --- + "1080p": {1: (5, 0)}, + # --- + "2048": {1: (5, 0)}, +} + +# == Config 1 == +# base: (512, 408), 16s/it + +# Acceleration settings +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +plugin = "zero2" + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="pretrained_models/vae-pipeline", + micro_frame_size=17, + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="rflow", + use_timestep_transform=True, + sample_method="logit-normal", +) + +# Mask settings +mask_ratios = { + "random": 0.2, + "intepolate": 0.01, + "quarter_random": 0.01, + "quarter_head": 0.01, + "quarter_tail": 0.01, + "quarter_head_tail": 0.01, + "image_random": 0.05, + "image_head": 0.1, + "image_tail": 0.05, + "image_head_tail": 0.05, +} + +# Log settings +seed = 42 +outputs = "outputs" +wandb = False +epochs = 1000 +log_every = 10 +ckpt_every = 500 + +# optimization settings +load = None +grad_clip = 1.0 +lr = 2e-4 +ema_decay = 0.99 +adam_eps = 1e-15 diff --git 
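The v1.2 inference config added above (`configs/opensora-v1-2/inference/sample.py`) is the file that `CONFIG_MAP` points the Gradio demo at: app.py loads it with `mmengine` via `read_config` and then overrides the sampler settings chosen in the UI. A minimal sketch of that flow, assuming the repository layout shown in this patch:

    from mmengine.config import Config

    # load the config that CONFIG_MAP pairs with "v1.2-stage3"
    config = Config.fromfile("configs/opensora-v1-2/inference/sample.py")

    # mirror the app's hack: copy the scheduler kwargs, drop the registry key,
    # and substitute the slider values before re-initialising the scheduler
    scheduler_kwargs = config.scheduler.copy()
    scheduler_kwargs.pop("type")
    scheduler_kwargs["num_sampling_steps"] = 30  # UI "Sampling steps"
    scheduler_kwargs["cfg_scale"] = 7.0          # UI "CFG Scale"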
a/configs/opensora-v1-2/misc/eval_loss.py b/configs/opensora-v1-2/misc/eval_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..f052ad301ae19464e1825726012649e266857c64 --- /dev/null +++ b/configs/opensora-v1-2/misc/eval_loss.py @@ -0,0 +1,49 @@ +num_workers = 8 +dtype = "bf16" +seed = 42 +num_eval_timesteps = 10 + +# Dataset settings +dataset = dict( + type="VariableVideoTextDataset", + transform_name="resize_crop", +) + +bucket_config = { + "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)}, + # --- + "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)}, + # --- + "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)}, + # --- + "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)}, + # --- + "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)}, + # --- + "1080p": {1: (None, 10)}, + # --- + "2048": {1: (None, 5)}, +} + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="pretrained_models/vae-pipeline", + micro_frame_size=17, + micro_batch_size=4, + local_files_only=True, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + local_files_only=True, +) +scheduler = dict(type="rflow") diff --git a/configs/opensora-v1-2/misc/extract.py b/configs/opensora-v1-2/misc/extract.py new file mode 100644 index 0000000000000000000000000000000000000000..b2dab83058d956a6d9f513c2053e9dd7c4791290 --- /dev/null +++ b/configs/opensora-v1-2/misc/extract.py @@ -0,0 +1,62 @@ +# Dataset settings +dataset = dict( + type="VariableVideoTextDataset", + transform_name="resize_crop", +) + +# webvid +bucket_config = { # 12s/it + "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, + # --- + "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, + "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, + # --- + "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, + "512": {1: (0.1, 141)}, + # --- + "480p": {1: (0.1, 89)}, + # --- + "720p": {1: (0.05, 36)}, + "1024": {1: (0.05, 36)}, + # --- + "1080p": {1: (0.1, 5)}, + # --- + "2048": {1: (0.1, 5)}, +} + +# Acceleration settings +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +seed = 42 +outputs = "outputs" +wandb = False + + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained="/mnt/nfs-206/zangwei/opensora/outputs/1091-STDiT3-XL-2/epoch0-global_step8500", + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="pretrained_models/vae-pipeline", + micro_frame_size=17, + micro_batch_size=32, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + shardformer=True, + local_files_only=True, +) + +# feature extraction settings +save_text_features = True +save_compressed_text_features = True +bin_size = 250 # 1GB, 4195 bins +log_time = False diff --git a/configs/opensora-v1-2/misc/feat.py b/configs/opensora-v1-2/misc/feat.py new file mode 100644 index 0000000000000000000000000000000000000000..228cbee3d8b669ea5a1ab1342a4db873e2b4c26b --- 
/dev/null +++ b/configs/opensora-v1-2/misc/feat.py @@ -0,0 +1,94 @@ +# Dataset settings +dataset = dict( + type="VariableVideoTextDataset", + transform_name="resize_crop", + dummy_text_feature=True, +) + +# webvid +bucket_config = { # 12s/it + "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, + # --- + "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, + "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, + # --- + "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, + "512": {1: (0.1, 141)}, + # --- + "480p": {1: (0.1, 89)}, + # --- + "720p": {1: (0.05, 36)}, + "1024": {1: (0.05, 36)}, + # --- + "1080p": {1: (0.1, 5)}, + # --- + "2048": {1: (0.1, 5)}, +} + +grad_checkpoint = True + +load_text_features = True + +# Acceleration settings +num_workers = 0 +num_bucket_build_workers = 16 +dtype = "bf16" +plugin = "zero2" + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + freeze_y_embedder=True, + skip_y_embedder=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="pretrained_models/vae-pipeline", + micro_frame_size=17, + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="rflow", + use_timestep_transform=True, + sample_method="logit-normal", +) + +# Mask settings +mask_ratios = { + "random": 0.2, + "intepolate": 0.01, + "quarter_random": 0.01, + "quarter_head": 0.01, + "quarter_tail": 0.01, + "quarter_head_tail": 0.01, + "image_random": 0.05, + "image_head": 0.1, + "image_tail": 0.05, + "image_head_tail": 0.05, +} + +# Log settings +seed = 42 +outputs = "outputs" +wandb = False +epochs = 1000 +log_every = 10 +ckpt_every = 1 + +# optimization settings +load = None +grad_clip = 1.0 +lr = 2e-4 +ema_decay = 0.99 +adam_eps = 1e-15 diff --git a/configs/opensora-v1-2/train/adapt.py b/configs/opensora-v1-2/train/adapt.py new file mode 100644 index 0000000000000000000000000000000000000000..94a6ca4a978d9db800c646c09c5cae9061073762 --- /dev/null +++ b/configs/opensora-v1-2/train/adapt.py @@ -0,0 +1,84 @@ +# Dataset settings +dataset = dict( + type="VariableVideoTextDataset", + transform_name="resize_crop", +) +bucket_config = { # 2s/it + "144p": {1: (0.5, 48), 34: (1.0, 2), 51: (1.0, 4), 102: (1.0, 2), 204: (1.0, 1)}, + # --- + "256": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)}, + "240p": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)}, + # --- + "360p": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)}, + "512": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)}, + # --- + "480p": {1: (0.2, 4), 17: (0.3, 1), 68: (0.0, None)}, + # --- + "720p": {1: (0.1, 2)}, + "1024": {1: (0.1, 2)}, + # --- + "1080p": {1: (0.1, 1)}, +} +grad_checkpoint = False + +# Acceleration settings +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +plugin = "zero2" + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="pretrained_models/vae-pipeline", + micro_frame_size=17, + micro_batch_size=4, +) +text_encoder = dict( + 
type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="rflow", + use_timestep_transform=True, + sample_method="logit-normal", +) + +# Mask settings +mask_ratios = { + "random": 0.2, + "intepolate": 0.01, + "quarter_random": 0.01, + "quarter_head": 0.01, + "quarter_tail": 0.01, + "quarter_head_tail": 0.01, + "image_random": 0.05, + "image_head": 0.1, + "image_tail": 0.05, + "image_head_tail": 0.05, +} + +# Log settings +seed = 42 +outputs = "outputs" +wandb = False +epochs = 1000 +log_every = 10 +ckpt_every = 500 + +# optimization settings +load = None +grad_clip = 1.0 +lr = 1e-4 +ema_decay = 0.99 +adam_eps = 1e-15 diff --git a/configs/opensora-v1-2/train/stage1.py b/configs/opensora-v1-2/train/stage1.py new file mode 100644 index 0000000000000000000000000000000000000000..57bb7d21f4ba21936222865dac7fb984c2683813 --- /dev/null +++ b/configs/opensora-v1-2/train/stage1.py @@ -0,0 +1,111 @@ +# Dataset settings +dataset = dict( + type="VariableVideoTextDataset", + transform_name="resize_crop", +) + +# backup +# bucket_config = { # 20s/it +# "144p": {1: (1.0, 100), 51: (1.0, 30), 102: (1.0, 20), 204: (1.0, 8), 408: (1.0, 4)}, +# # --- +# "256": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: (0.3, 4), 408: (0.3, 2)}, +# "240p": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: (0.3, 4), 408: (0.3, 2)}, +# # --- +# "360p": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)}, +# "512": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)}, +# # --- +# "480p": {1: (0.5, 40), 51: (0.3, 6), 102: (0.3, 3), 204: (0.3, 1), 408: (0.0, None)}, +# # --- +# "720p": {1: (0.2, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)}, +# "1024": {1: (0.1, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)}, +# # --- +# "1080p": {1: (0.1, 10)}, +# # --- +# "2048": {1: (0.1, 5)}, +# } + +# webvid +bucket_config = { # 12s/it + "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, + # --- + "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)}, + "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)}, + # --- + "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, + "512": {1: (0.1, 141)}, + # --- + "480p": {1: (0.1, 89)}, + # --- + "720p": {1: (0.05, 36)}, + "1024": {1: (0.05, 36)}, + # --- + "1080p": {1: (0.1, 5)}, + # --- + "2048": {1: (0.1, 5)}, +} + +grad_checkpoint = True + +# Acceleration settings +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +plugin = "zero2" + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + freeze_y_embedder=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline", + micro_frame_size=17, + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="rflow", + use_timestep_transform=True, + sample_method="logit-normal", +) + +# Mask settings +mask_ratios = { + "random": 0.05, + "intepolate": 0.005, + "quarter_random": 0.005, + "quarter_head": 0.005, + "quarter_tail": 0.005, + "quarter_head_tail": 0.005, + "image_random": 0.025, + 
"image_head": 0.05, + "image_tail": 0.025, + "image_head_tail": 0.025, +} + +# Log settings +seed = 42 +outputs = "outputs" +wandb = False +epochs = 1000 +log_every = 10 +ckpt_every = 200 + +# optimization settings +load = None +grad_clip = 1.0 +lr = 1e-4 +ema_decay = 0.99 +adam_eps = 1e-15 +warmup_steps = 1000 diff --git a/configs/opensora-v1-2/train/stage1_feat.py b/configs/opensora-v1-2/train/stage1_feat.py new file mode 100644 index 0000000000000000000000000000000000000000..e0414fc8c5218ad6847275e585bbb0985d0fa841 --- /dev/null +++ b/configs/opensora-v1-2/train/stage1_feat.py @@ -0,0 +1,59 @@ +# Dataset settings +dataset = dict(type="BatchFeatureDataset") +grad_checkpoint = True +num_workers = 4 + +# Acceleration settings +dtype = "bf16" +plugin = "zero2" + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + freeze_y_embedder=True, + skip_y_embedder=True, +) +scheduler = dict( + type="rflow", + use_timestep_transform=True, + sample_method="logit-normal", +) + +vae_out_channels = 4 +model_max_length = 300 +text_encoder_output_dim = 4096 +load_video_features = True +load_text_features = True + +# Mask settings +mask_ratios = { + "random": 0.2, + "intepolate": 0.01, + "quarter_random": 0.01, + "quarter_head": 0.01, + "quarter_tail": 0.01, + "quarter_head_tail": 0.01, + "image_random": 0.05, + "image_head": 0.1, + "image_tail": 0.05, + "image_head_tail": 0.05, +} + +# Log settings +seed = 42 +outputs = "outputs" +wandb = False +epochs = 1000 +log_every = 10 +ckpt_every = 500 + +# optimization settings +load = None +grad_clip = 1.0 +lr = 2e-4 +ema_decay = 0.99 +adam_eps = 1e-15 diff --git a/configs/opensora-v1-2/train/stage2.py b/configs/opensora-v1-2/train/stage2.py new file mode 100644 index 0000000000000000000000000000000000000000..033ae6d85c5cb0d6ff0660429aefc6693a48c28e --- /dev/null +++ b/configs/opensora-v1-2/train/stage2.py @@ -0,0 +1,90 @@ +# Dataset settings +dataset = dict( + type="VariableVideoTextDataset", + transform_name="resize_crop", +) + +# webvid +bucket_config = { # 12s/it + "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)}, + # --- + "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)}, + "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)}, + # --- + "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)}, + "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)}, + # --- + "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)}, + # --- + "720p": {1: (0.1, 36), 51: (0.03, 1)}, + "1024": {1: (0.1, 36), 51: (0.02, 1)}, + # --- + "1080p": {1: (0.01, 5)}, + # --- + "2048": {1: (0.01, 5)}, +} + +grad_checkpoint = True + +# Acceleration settings +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +plugin = "zero2" + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + freeze_y_embedder=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline", + micro_frame_size=17, + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + shardformer=True, + local_files_only=True, +) 
+scheduler = dict( + type="rflow", + use_timestep_transform=True, + sample_method="logit-normal", +) + +# Mask settings +mask_ratios = { + "random": 0.05, + "intepolate": 0.005, + "quarter_random": 0.005, + "quarter_head": 0.005, + "quarter_tail": 0.005, + "quarter_head_tail": 0.005, + "image_random": 0.025, + "image_head": 0.05, + "image_tail": 0.025, + "image_head_tail": 0.025, +} + +# Log settings +seed = 42 +outputs = "outputs" +wandb = False +epochs = 1000 +log_every = 10 +ckpt_every = 200 + +# optimization settings +load = None +grad_clip = 1.0 +lr = 1e-4 +ema_decay = 0.99 +adam_eps = 1e-15 diff --git a/configs/opensora-v1-2/train/stage3.py b/configs/opensora-v1-2/train/stage3.py new file mode 100644 index 0000000000000000000000000000000000000000..2595d4d45548c9c6f505a24d81a5ddd64fc1d990 --- /dev/null +++ b/configs/opensora-v1-2/train/stage3.py @@ -0,0 +1,92 @@ +# Dataset settings +dataset = dict( + type="VariableVideoTextDataset", + transform_name="resize_crop", +) + +# webvid +bucket_config = { # 20s/it + "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)}, + # --- + "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)}, + "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)}, + # --- + "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)}, + "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)}, + # --- + "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)}, + # --- + "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)}, + "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)}, + # --- + "1080p": {1: (0.1, 5)}, + # --- + "2048": {1: (0.05, 5)}, +} + +grad_checkpoint = True + +# Acceleration settings +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +plugin = "zero2" + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + freeze_y_embedder=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline", + micro_frame_size=17, + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="rflow", + use_timestep_transform=True, + sample_method="logit-normal", +) + +# Mask settings +# 25% +mask_ratios = { + "random": 0.01, + "intepolate": 0.002, + "quarter_random": 0.002, + "quarter_head": 0.002, + "quarter_tail": 0.002, + "quarter_head_tail": 0.002, + "image_random": 0.0, + "image_head": 0.22, + "image_tail": 0.005, + "image_head_tail": 0.005, +} + +# Log settings +seed = 42 +outputs = "outputs" +wandb = False +epochs = 1000 +log_every = 10 +ckpt_every = 200 + +# optimization settings +load = None +grad_clip = 1.0 +lr = 1e-4 +ema_decay = 0.99 +adam_eps = 1e-15 +warmup_steps = 1000 diff --git a/configs/opensora/inference/16x256x256.py b/configs/opensora/inference/16x256x256.py index 20d6ea4837cfb7565f932f5c8582f21f2d394d52..4053e1238fd0e5c4cc4acb76f4d923b0c6a21c66 100644 --- a/configs/opensora/inference/16x256x256.py +++ b/configs/opensora/inference/16x256x256.py @@ -7,28 +7,33 @@ model = dict( type="STDiT-XL/2", space_scale=0.5, time_scale=1.0, - enable_flashattn=False, - enable_layernorm_kernel=False, + enable_flash_attn=True, + 
enable_layernorm_kernel=True, from_pretrained="PRETRAINED_MODEL", ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=4, ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, ) scheduler = dict( type="iddpm", num_sampling_steps=100, - cfg_scale=5.0, + cfg_scale=7.0, + cfg_channel=3, # or None ) -dtype = "fp16" +dtype = "bf16" + +# Condition +prompt_path = "./assets/texts/t2v_samples.txt" +prompt = None # prompt has higher priority than prompt_path # Others -batch_size = 2 +batch_size = 1 seed = 42 -prompt_path = "./assets/texts/t2v_samples.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/opensora/inference/16x512x512-rflow.py b/configs/opensora/inference/16x512x512-rflow.py new file mode 100644 index 0000000000000000000000000000000000000000..cf2381053b0d0abf786ec97bf7ba3539def9f0a1 --- /dev/null +++ b/configs/opensora/inference/16x512x512-rflow.py @@ -0,0 +1,35 @@ +num_frames = 16 +fps = 24 // 3 +image_size = (512, 512) + +# Define model +model = dict( + type="STDiT-XL/2", + space_scale=1.0, + time_scale=1.0, + enable_flash_attn=True, + enable_layernorm_kernel=True, + from_pretrained="PRETRAINED_MODEL", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + micro_batch_size=2, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=120, +) +scheduler = dict( + type="rflow", + num_sampling_steps=10, + cfg_scale=7.0, +) +dtype = "bf16" + +# Others +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/t2v_samples.txt" +save_dir = "./outputs/samples/" diff --git a/configs/opensora/inference/16x512x512.py b/configs/opensora/inference/16x512x512.py index 52e3086432030e81b56a3d1b7d0e9b3e8dd80d51..478cb5b482d21f11c42107e477ba4869ddba4b05 100644 --- a/configs/opensora/inference/16x512x512.py +++ b/configs/opensora/inference/16x512x512.py @@ -7,18 +7,18 @@ model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=1.0, - enable_flashattn=True, - enable_layernorm_kernel=False, - from_pretrained="PRETRAINED_MODEL" + enable_flash_attn=True, + enable_layernorm_kernel=True, + from_pretrained="PRETRAINED_MODEL", ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", - micro_batch_size=128, + micro_batch_size=2, ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, ) scheduler = dict( @@ -26,10 +26,10 @@ scheduler = dict( num_sampling_steps=100, cfg_scale=7.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/t2v_samples.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/opensora/inference/64x512x512.py b/configs/opensora/inference/64x512x512.py index b9f9636c9171bb82acb9304b929e68a41c51cd07..03cce23de5d0c190e6f8baadc74e761cfdd39598 100644 --- a/configs/opensora/inference/64x512x512.py +++ b/configs/opensora/inference/64x512x512.py @@ -7,8 +7,8 @@ model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=2 / 3, - enable_flashattn=True, - enable_layernorm_kernel=False, + enable_flash_attn=True, + enable_layernorm_kernel=True, from_pretrained="PRETRAINED_MODEL", ) vae = dict( @@ -18,7 +18,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + 
from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, ) scheduler = dict( @@ -26,10 +26,10 @@ scheduler = dict( num_sampling_steps=100, cfg_scale=7.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 1 seed = 42 prompt_path = "./assets/texts/t2v_samples.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/opensora/train/16x256x256-mask.py b/configs/opensora/train/16x256x256-mask.py new file mode 100644 index 0000000000000000000000000000000000000000..19dcae6d10b62b1cba253b6441a7d9044e16fc95 --- /dev/null +++ b/configs/opensora/train/16x256x256-mask.py @@ -0,0 +1,60 @@ +# Define dataset +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(256, 256), +) + +# Define acceleration +num_workers = 4 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="STDiT-XL/2", + space_scale=0.5, + time_scale=1.0, + from_pretrained="PixArt-XL-2-512x512.pth", + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +mask_ratios = { + "identity": 0.7, + "random": 0.15, + "mask_head": 0.05, + "mask_tail": 0.05, + "mask_head_tail": 0.05, +} +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 8 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora/train/16x256x256-spee-rflow.py b/configs/opensora/train/16x256x256-spee-rflow.py new file mode 100644 index 0000000000000000000000000000000000000000..966c9d012f971ab885352c107110e4dd9ad0c5fe --- /dev/null +++ b/configs/opensora/train/16x256x256-spee-rflow.py @@ -0,0 +1,64 @@ +# Define dataset +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(256, 256), +) + +# Define acceleration +num_workers = 4 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="STDiT-XL/2", + space_scale=0.5, + time_scale=1.0, + # from_pretrained="PixArt-XL-2-512x512.pth", + # from_pretrained = "/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth", + # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth", + from_pretrained="PRETRAINED_MODEL", + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +# mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07] +# mask_ratios = { +# "identity": 0.9, +# "random": 0.06, +# "mask_head": 0.01, +# "mask_tail": 0.01, +# "mask_head_tail": 0.02, +# } +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="rflow", + # timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = True + +epochs = 1 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 16 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora/train/16x256x256-spee.py b/configs/opensora/train/16x256x256-spee.py new file mode 100644 index 0000000000000000000000000000000000000000..4b7278997f499523c63ff8d11def91ecab6dcbdb --- /dev/null +++ b/configs/opensora/train/16x256x256-spee.py @@ -0,0 +1,60 @@ +# Define 
dataset +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(256, 256), +) + +# Define acceleration +num_workers = 4 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="STDiT-XL/2", + space_scale=0.5, + time_scale=1.0, + from_pretrained="PixArt-XL-2-512x512.pth", + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +mask_ratios = { + "identity": 0.5, + "random": 0.29, + "mask_head": 0.07, + "mask_tail": 0.07, + "mask_head_tail": 0.07, +} +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="iddpm-speed", + timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 8 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora/train/16x256x256.py b/configs/opensora/train/16x256x256.py index a64a318f0c72f2786690c2631eb43884898684d8..f7a68a8b32f3cfeef931a0b025e2ce4809027b19 100644 --- a/configs/opensora/train/16x256x256.py +++ b/configs/opensora/train/16x256x256.py @@ -1,14 +1,14 @@ -num_frames = 16 -frame_interval = 3 -image_size = (256, 256) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(256, 256), +) # Define acceleration +num_workers = 0 dtype = "bf16" grad_checkpoint = True plugin = "zero2" @@ -20,7 +20,7 @@ model = dict( space_scale=0.5, time_scale=1.0, from_pretrained="PixArt-XL-2-512x512.pth", - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( @@ -29,7 +29,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) diff --git a/configs/opensora/train/16x512x512.py b/configs/opensora/train/16x512x512.py index 885aad1fed966acddfa9ce609c65b24449cc9c05..c566fd1b7a80b90f45e48f46c1cacbd0036f0fa9 100644 --- a/configs/opensora/train/16x512x512.py +++ b/configs/opensora/train/16x512x512.py @@ -1,16 +1,16 @@ -num_frames = 16 -frame_interval = 3 -image_size = (512, 512) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(512, 512), +) # Define acceleration +num_workers = 4 dtype = "bf16" -grad_checkpoint = False +grad_checkpoint = True plugin = "zero2" sp_size = 1 @@ -20,7 +20,7 @@ model = dict( space_scale=1.0, time_scale=1.0, from_pretrained=None, - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( @@ -30,7 +30,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) diff --git a/configs/opensora/train/360x512x512.py b/configs/opensora/train/360x512x512.py index 7a6f75995b96152a80ad14e6a40f4b1e2482c1e9..62bfd1475a61e0668b46307f1d970bc9c9a8e6d0 100644 --- a/configs/opensora/train/360x512x512.py +++ b/configs/opensora/train/360x512x512.py @@ -1,12 +1,18 @@ -num_frames = 360 -frame_interval = 1 -image_size = (512, 512) - # 
Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=360, + frame_interval=3, + image_size=(512, 512), +) + +# Define acceleration num_workers = 4 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 # Define acceleration dtype = "bf16" @@ -20,7 +26,7 @@ model = dict( space_scale=1.0, time_scale=2 / 3, from_pretrained=None, - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, enable_sequence_parallelism=True, # enable sq here ) @@ -31,7 +37,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) diff --git a/configs/opensora/train/64x512x512-sp.py b/configs/opensora/train/64x512x512-sp.py index b0b9062c987e7e90c75e5e1d2064fe8654e22b46..fd34a2a51fa25b64f8ff32bf87aff504af2cec42 100644 --- a/configs/opensora/train/64x512x512-sp.py +++ b/configs/opensora/train/64x512x512-sp.py @@ -1,17 +1,17 @@ -num_frames = 64 -frame_interval = 2 -image_size = (512, 512) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(512, 512), +) # Define acceleration +num_workers = 4 dtype = "bf16" grad_checkpoint = True -plugin = "zero2-seq" +plugin = "zero2" sp_size = 2 # Define model @@ -20,7 +20,7 @@ model = dict( space_scale=1.0, time_scale=2 / 3, from_pretrained=None, - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, enable_sequence_parallelism=True, # enable sq here ) @@ -30,7 +30,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) diff --git a/configs/opensora/train/64x512x512.py b/configs/opensora/train/64x512x512.py index dfcdcc08d250e0a1d23ece174c023975309d2ae1..e07f8c1ccf362031c26a23cb78edf400dbb8943e 100644 --- a/configs/opensora/train/64x512x512.py +++ b/configs/opensora/train/64x512x512.py @@ -1,14 +1,14 @@ -num_frames = 64 -frame_interval = 2 -image_size = (512, 512) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=64, + frame_interval=3, + image_size=(512, 512), +) # Define acceleration +num_workers = 4 dtype = "bf16" grad_checkpoint = True plugin = "zero2" @@ -20,7 +20,7 @@ model = dict( space_scale=1.0, time_scale=2 / 3, from_pretrained=None, - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( @@ -30,7 +30,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) diff --git a/configs/pixart/inference/16x256x256.py b/configs/pixart/inference/16x256x256.py index 4f7ee91d245560233fdbaa31aa8d9eb7e3a59eb8..5013c08739f54e174ab9394353f6055cca409e96 100644 --- a/configs/pixart/inference/16x256x256.py +++ b/configs/pixart/inference/16x256x256.py @@ -15,18 +15,18 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, ) scheduler = dict( type="dpm-solver", - num_sampling_steps=50, - cfg_scale=5.0, + 
num_sampling_steps=20, + cfg_scale=7.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/t2v_samples.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/pixart/inference/1x1024MS.py b/configs/pixart/inference/1x1024MS.py index 41cc97ad0402d54610302ff6c10a7a4630d1f15b..e6af8c6773b2dde38be7203a98bfa2f59cde8901 100644 --- a/configs/pixart/inference/1x1024MS.py +++ b/configs/pixart/inference/1x1024MS.py @@ -1,7 +1,7 @@ num_frames = 1 fps = 1 image_size = (1920, 512) -multi_resolution = True +multi_resolution = "PixArtMS" # Define model model = dict( @@ -17,7 +17,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, ) scheduler = dict( @@ -25,10 +25,10 @@ scheduler = dict( num_sampling_steps=20, cfg_scale=7.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/t2i_samples.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/pixart/inference/1x20481B.py b/configs/pixart/inference/1x20481B.py new file mode 100644 index 0000000000000000000000000000000000000000..3d5eba996cbac4934e28901c1552d31cf7a95789 --- /dev/null +++ b/configs/pixart/inference/1x20481B.py @@ -0,0 +1,36 @@ +num_frames = 1 +fps = 1 +image_size = (2560, 1536) +# image_size = (2048, 2048) + +model = dict( + type="PixArt-1B/2", + from_pretrained="PixArt-1B-2.pth", + space_scale=4, + no_temporal_pos_emb=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + base_size=2048 // 8, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", + subfolder="vae", +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, +) +scheduler = dict( + type="dpm-solver", + num_sampling_steps=14, + cfg_scale=4.5, +) +dtype = "bf16" + +# Others +batch_size = 1 +seed = 42 +prompt_path = "./assets/texts/t2i_sigma.txt" +save_dir = "./samples/samples/" diff --git a/configs/pixart/inference/1x2048MS.py b/configs/pixart/inference/1x2048MS.py new file mode 100644 index 0000000000000000000000000000000000000000..a0daca4ce04d1f8b7bd9c5c7ccff9003aae348c7 --- /dev/null +++ b/configs/pixart/inference/1x2048MS.py @@ -0,0 +1,36 @@ +num_frames = 1 +fps = 1 +image_size = (2560, 1536) +# image_size = (2048, 2048) + +model = dict( + type="PixArt-XL/2", + from_pretrained="PixArt-Sigma-XL-2-2K-MS.pth", + space_scale=4, + no_temporal_pos_emb=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + base_size=2048 // 8, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", + subfolder="vae", +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, +) +scheduler = dict( + type="dpm-solver", + num_sampling_steps=14, + cfg_scale=4.5, +) +dtype = "bf16" + +# Others +batch_size = 1 +seed = 42 +prompt_path = "./assets/texts/t2i_sigma.txt" +save_dir = "./samples/samples/" diff --git a/configs/pixart/inference/1x256x256.py b/configs/pixart/inference/1x256x256.py index 381c89cb0bbce0ba0a20c2a963fac64b5a0a1e50..16f92602b6fab414726aad3a2cd3b79b0ee5abed 100644 --- a/configs/pixart/inference/1x256x256.py +++ b/configs/pixart/inference/1x256x256.py @@ -16,18 +16,18 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + 
from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, ) scheduler = dict( type="dpm-solver", - num_sampling_steps=30, + num_sampling_steps=20, cfg_scale=7.0, ) -dtype = "fp16" +dtype = "bf16" # Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/t2i_samples.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/pixart/inference/1x512x512-rflow.py b/configs/pixart/inference/1x512x512-rflow.py new file mode 100644 index 0000000000000000000000000000000000000000..7bce7e27ad923f6d2f192f7dc992b057550af659 --- /dev/null +++ b/configs/pixart/inference/1x512x512-rflow.py @@ -0,0 +1,39 @@ +num_frames = 1 +fps = 1 +image_size = (512, 512) + +# Define model +model = dict( + type="PixArt-XL/2", + space_scale=1.0, + time_scale=1.0, + no_temporal_pos_emb=True, + from_pretrained="PRETRAINED_MODEL", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=120, +) +scheduler = dict( + type="rflow", + num_sampling_steps=20, + cfg_scale=7.0, +) +dtype = "bf16" + +# prompt_path = "./assets/texts/t2i_samples.txt" +prompt = [ + "Pirate ship trapped in a cosmic maelstrom nebula.", + "A small cactus with a happy face in the Sahara desert.", + "A small cactus with a sad face in the Sahara desert.", +] + +# Others +batch_size = 2 +seed = 42 +save_dir = "./outputs/samples2/" diff --git a/configs/pixart/inference/1x512x512.py b/configs/pixart/inference/1x512x512.py index d172bc1d70d1b0c7ddd50d8ff92480adc50dd32e..dbc90df5f51bff532b3309cb9f7140b267a00945 100644 --- a/configs/pixart/inference/1x512x512.py +++ b/configs/pixart/inference/1x512x512.py @@ -16,18 +16,24 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, ) scheduler = dict( type="dpm-solver", - num_sampling_steps=35, + num_sampling_steps=20, cfg_scale=7.0, ) -dtype = "fp16" +dtype = "bf16" + +# prompt_path = "./assets/texts/t2i_samples.txt" +prompt = [ + "Pirate ship trapped in a cosmic maelstrom nebula.", + "A small cactus with a happy face in the Sahara desert.", + "A small cactus with a sad face in the Sahara desert.", +] # Others batch_size = 2 seed = 42 -prompt_path = "./assets/texts/t2i_samples.txt" -save_dir = "./outputs/samples/" +save_dir = "./samples/samples/" diff --git a/configs/pixart/train/16x256x256.py b/configs/pixart/train/16x256x256.py index b47731e2d5fcb1418c23b68442ee1cae54425726..66285bf9ecdbc814a8bbd22d14272a0e3045513a 100644 --- a/configs/pixart/train/16x256x256.py +++ b/configs/pixart/train/16x256x256.py @@ -1,16 +1,16 @@ -num_frames = 16 -frame_interval = 3 -image_size = (256, 256) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(256, 256), +) # Define acceleration +num_workers = 4 dtype = "bf16" -grad_checkpoint = False +grad_checkpoint = True plugin = "zero2" sp_size = 1 @@ -20,7 +20,7 @@ model = dict( space_scale=0.5, time_scale=1.0, from_pretrained="PixArt-XL-2-512x512.pth", - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( @@ -29,7 +29,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) diff 
--git a/configs/pixart/train/1x2048x2048.py b/configs/pixart/train/1x2048x2048.py new file mode 100644 index 0000000000000000000000000000000000000000..31f17849f5daed026133279efd9c6bae4268f1e4 --- /dev/null +++ b/configs/pixart/train/1x2048x2048.py @@ -0,0 +1,54 @@ +# Define dataset +dataset = dict( + type="VideoTextDataset", + data_path="/home/zhaowangbo/data/csv/image-v1_1_ext_noempty_rcp_clean_info.csv", + num_frames=1, + frame_interval=3, + image_size=(2048, 2048), +) + +# Define acceleration +num_workers = 4 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="PixArt-1B/2", + space_scale=4.0, + no_temporal_pos_emb=True, + from_pretrained="PixArt-1B-2.pth", + enable_flash_attn=True, + enable_layernorm_kernel=True, +) + +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", + subfolder="vae", +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 4 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/pixart/train/1x512x512-rflow.py b/configs/pixart/train/1x512x512-rflow.py new file mode 100644 index 0000000000000000000000000000000000000000..9551ce86e14b17f44827ee79632425b54c8bf153 --- /dev/null +++ b/configs/pixart/train/1x512x512-rflow.py @@ -0,0 +1,55 @@ +# Define dataset +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=1, + frame_interval=3, + image_size=(512, 512), +) + +# Define acceleration +num_workers = 4 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# Define model +model = dict( + type="PixArt-XL/2", + space_scale=1.0, + time_scale=1.0, + no_temporal_pos_emb=True, + # from_pretrained="PixArt-XL-2-512x512.pth", + from_pretrained="PRETRAINED_MODEL", + enable_flash_attn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="rflow", + # timestep_respacing="", +) + +# Others +seed = 42 +outputs = "outputs" +wandb = True + +epochs = 2 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 64 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/pixart/train/1x512x512.py b/configs/pixart/train/1x512x512.py index 619c9aafd03a68a36815b5bbc7d12d59c3ea40c6..0e7a1f75fba60061f756d073f2ab169dc6b8dec8 100644 --- a/configs/pixart/train/1x512x512.py +++ b/configs/pixart/train/1x512x512.py @@ -1,14 +1,14 @@ -num_frames = 1 -frame_interval = 1 -image_size = (512, 512) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = True -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=1, + frame_interval=3, + image_size=(512, 512), +) # Define acceleration +num_workers = 4 dtype = "bf16" grad_checkpoint = True plugin = "zero2" @@ -21,7 +21,7 @@ model = dict( time_scale=1.0, no_temporal_pos_emb=True, from_pretrained="PixArt-XL-2-512x512.pth", - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( @@ -30,7 +30,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", 
model_max_length=120, shardformer=True, ) diff --git a/configs/pixart/train/64x512x512.py b/configs/pixart/train/64x512x512.py index 628cf254fe3d379e4fe6661d62ddad6511003abc..dd59d1150e5d5ad3e51931c6c355d9085a930ce4 100644 --- a/configs/pixart/train/64x512x512.py +++ b/configs/pixart/train/64x512x512.py @@ -1,26 +1,27 @@ -num_frames = 64 -frame_interval = 2 -image_size = (512, 512) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=64, + frame_interval=3, + image_size=(256, 256), +) # Define acceleration +num_workers = 4 dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 + # Define model model = dict( type="PixArt-XL/2", space_scale=1.0, time_scale=2 / 3, from_pretrained=None, - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( @@ -30,7 +31,7 @@ vae = dict( ) text_encoder = dict( type="t5", - from_pretrained="./pretrained_models/t5_ckpts", + from_pretrained="DeepFloyd/t5-v1_1-xxl", model_max_length=120, shardformer=True, ) diff --git a/configs/vae/inference/image.py b/configs/vae/inference/image.py new file mode 100644 index 0000000000000000000000000000000000000000..2eebcb087384dd7e6637332f9b01455ac22b98ea --- /dev/null +++ b/configs/vae/inference/image.py @@ -0,0 +1,32 @@ +image_size = (256, 256) +num_frames = 1 + +dtype = "bf16" +batch_size = 1 +seed = 42 +save_dir = "samples/vae_video" +cal_stats = True +log_stats_every = 100 + +# Define dataset +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=num_frames, + image_size=image_size, +) +num_samples = 100 +num_workers = 4 + +# Define model +model = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", + micro_frame_size=None, + micro_batch_size=4, + cal_loss=True, +) + +# loss weights +perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 +kl_loss_weight = 1e-6 diff --git a/configs/vae/inference/video.py b/configs/vae/inference/video.py new file mode 100644 index 0000000000000000000000000000000000000000..e4211b831bb8b0b4b5e50d75ffe7d77bdefd1ba8 --- /dev/null +++ b/configs/vae/inference/video.py @@ -0,0 +1,32 @@ +image_size = (256, 256) +num_frames = 17 + +dtype = "bf16" +batch_size = 1 +seed = 42 +save_dir = "samples/vae_video" +cal_stats = True +log_stats_every = 100 + +# Define dataset +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=num_frames, + image_size=image_size, +) +num_samples = 100 +num_workers = 4 + +# Define model +model = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", + micro_frame_size=None, + micro_batch_size=4, + cal_loss=True, +) + +# loss weights +perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 +kl_loss_weight = 1e-6 diff --git a/configs/vae/train/stage1.py b/configs/vae/train/stage1.py new file mode 100644 index 0000000000000000000000000000000000000000..151d86db601d15f55e5f96f8d0e6c2a14bb06303 --- /dev/null +++ b/configs/vae/train/stage1.py @@ -0,0 +1,59 @@ +num_frames = 17 +image_size = (256, 256) + +# Define dataset +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=num_frames, + frame_interval=1, + image_size=image_size, +) + +# Define acceleration +num_workers = 16 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" + +# Define model +model = dict( + type="VideoAutoencoderPipeline", + freeze_vae_2d=True, + from_pretrained=None, + cal_loss=True, + vae_2d=dict( 
+ type="VideoAutoencoderKL", + from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", + subfolder="vae", + local_files_only=True, + ), + vae_temporal=dict( + type="VAE_Temporal_SD", + from_pretrained=None, + ), +) + +# loss weights +perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 +kl_loss_weight = 1e-6 + +mixed_strategy = "mixed_video_image" +mixed_image_ratio = 0.2 +use_real_rec_loss = False +use_z_rec_loss = True +use_image_identity_loss = True + +# Others +seed = 42 +outputs = "outputs/vae_stage1" +wandb = False + +epochs = 100 # NOTE: adjust accordingly w.r.t dataset size +log_every = 1 +ckpt_every = 1000 +load = None + +batch_size = 1 +lr = 1e-5 +grad_clip = 1.0 diff --git a/configs/vae/train/stage2.py b/configs/vae/train/stage2.py new file mode 100644 index 0000000000000000000000000000000000000000..d6961e094345ff8363ad0a5c5531353f5500a451 --- /dev/null +++ b/configs/vae/train/stage2.py @@ -0,0 +1,59 @@ +num_frames = 17 +image_size = (256, 256) + +# Define dataset +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=num_frames, + frame_interval=1, + image_size=image_size, +) + +# Define acceleration +num_workers = 16 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" + +# Define model +model = dict( + type="VideoAutoencoderPipeline", + freeze_vae_2d=False, + from_pretrained="outputs/vae_stage1", + cal_loss=True, + vae_2d=dict( + type="VideoAutoencoderKL", + from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", + subfolder="vae", + local_files_only=True, + ), + vae_temporal=dict( + type="VAE_Temporal_SD", + from_pretrained=None, + ), +) + +# loss weights +perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 +kl_loss_weight = 1e-6 + +mixed_strategy = "mixed_video_image" +mixed_image_ratio = 0.2 +use_real_rec_loss = False +use_z_rec_loss = True +use_image_identity_loss = False + +# Others +seed = 42 +outputs = "outputs/vae_stage2" +wandb = False + +epochs = 100 # NOTE: adjust accordingly w.r.t dataset size +log_every = 1 +ckpt_every = 1000 +load = None + +batch_size = 1 +lr = 1e-5 +grad_clip = 1.0 diff --git a/configs/vae/train/stage3.py b/configs/vae/train/stage3.py new file mode 100644 index 0000000000000000000000000000000000000000..464a3efa7605be1f3c9b0257711b52cb73f9c235 --- /dev/null +++ b/configs/vae/train/stage3.py @@ -0,0 +1,58 @@ +num_frames = 33 +image_size = (256, 256) + +# Define dataset +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=num_frames, + frame_interval=1, + image_size=image_size, +) + +# Define acceleration +num_workers = 16 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" + +# Define model +model = dict( + type="VideoAutoencoderPipeline", + freeze_vae_2d=False, + from_pretrained="outputs/vae_stage2", + cal_loss=True, + vae_2d=dict( + type="VideoAutoencoderKL", + from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", + subfolder="vae", + local_files_only=True, + ), + vae_temporal=dict( + type="VAE_Temporal_SD", + from_pretrained=None, + ), +) + +# loss weights +perceptual_loss_weight = 0.1 # use vgg is not None and more than 0 +kl_loss_weight = 1e-6 + +mixed_strategy = "mixed_video_random" +use_real_rec_loss = True +use_z_rec_loss = False +use_image_identity_loss = False + +# Others +seed = 42 +outputs = "outputs/vae_stage3" +wandb = False + +epochs = 100 # NOTE: adjust accordingly w.r.t dataset size +log_every = 1 +ckpt_every = 1000 +load = None + +batch_size = 1 +lr = 1e-5 +grad_clip = 1.0 diff --git a/requirements.txt 
b/requirements.txt index a9243fb4b897c3cc398fda901f58f4978a0eba28..5a643e5052be40b3b31914da040e780c31f946cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,23 +1,3 @@ -transformers==4.36.2 -torch==2.0.1 -torchvision==0.15.2 -torchaudio==2.0.2 -numpy==1.24.4 -scipy==1.10.1 -scikit-image==0.21.0 -opencv-python==4.7.0.72 -pillow==9.4.0 -diffusers==0.25.0 -transformers==4.36.2 -accelerate==0.26.1 -matplotlib==3.7.4 -tqdm==4.64.1 -config==0.5.1 -einops==0.7.0 -onnxruntime==1.16.2 -basicsr -av -fvcore -cloudpickle -omegaconf -pycocotools \ No newline at end of file +xformers +transformers +git+https://github.com/hpcaitech/Open-Sora.git
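Note on the configuration files added above: each one is a plain Python module, and every top-level assignment (`dataset`, `bucket_config`, `model`, `vae`, `text_encoder`, `scheduler`, the optimizer fields, ...) is picked up by the training and inference entry points as a configuration field. The following is a minimal, self-contained sketch of how such module-style configs can be loaded and overridden; it is not the repository's own config utility, and the `load_config` helper, the chosen file path, and the `lr` override are illustrative assumptions only.

# Minimal sketch: load one of the module-style config files added in this diff.
# This is NOT the project's own config loader; it only illustrates the idea that
# every top-level assignment in the file becomes a configuration entry.
import runpy
from typing import Any, Dict


def load_config(path: str, **overrides: Any) -> Dict[str, Any]:
    """Execute a Python config file and return its top-level names as a dict."""
    namespace = runpy.run_path(path)  # runs the file and returns its globals
    cfg = {k: v for k, v in namespace.items() if not k.startswith("_")}
    cfg.update(overrides)  # e.g. command-line style overrides
    return cfg


if __name__ == "__main__":
    # Hypothetical usage; the path matches one of the files added above.
    cfg = load_config("configs/opensora-v1-2/train/stage3.py", lr=5e-5)
    print(cfg["model"]["type"], cfg["dtype"], cfg["lr"])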
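The `bucket_config` tables in the training and evaluation configs map a resolution name to `{num_frames: (keep probability, batch size)}`; a probability of `None` (as in `eval_loss.py`) keeps every matching sample, and `(0.0, None)` disables a bucket. The sketch below shows one way such a table could be queried; it is an illustrative reading only, not the project's bucket sampler, and the simplified table and helper names are assumptions (the nested probability tuples used in some configs are omitted here).

# Illustrative query of a bucket_config table (NOT the project's bucket sampler).
import random
from typing import Optional, Tuple

# Simplified excerpt: resolution -> {num_frames: (keep probability, batch size)}.
bucket_config = {
    "240p": {1: (0.3, 297), 51: (0.4, 20), 102: (0.4, 10)},
    "480p": {1: (0.1, 89)},
}


def lookup(resolution: str, num_frames: int) -> Optional[Tuple[Optional[float], Optional[int]]]:
    """Return (keep_prob, batch_size) for the largest frame bucket that fits."""
    frames_table = bucket_config.get(resolution, {})
    candidates = [f for f in frames_table if f <= num_frames]
    if not candidates:
        return None
    return frames_table[max(candidates)]


def keep_sample(resolution: str, num_frames: int) -> bool:
    """Decide probabilistically whether a clip of this size enters a batch."""
    entry = lookup(resolution, num_frames)
    if entry is None or entry[1] is None:  # no bucket, or bucket disabled
        return False
    prob = entry[0]
    if prob is None:  # eval_loss.py style: always keep
        return True
    return random.random() < prob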
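The `mask_ratios` dictionaries assign a probability to each temporal-masking strategy during training, and the listed values deliberately sum to less than 1; by analogy with the explicit `"identity"` entry in `configs/opensora/train/16x256x256-mask.py`, the leftover probability presumably corresponds to training on the unmasked sample. The sampler below illustrates that reading; it is not the project's own mask-generation code, `sample_mask_type` is a hypothetical helper, and the `"intepolate"` spelling is kept exactly as it appears in the configs.

# Illustrative sampler over a mask_ratios table (NOT the project's mask code).
import random
from typing import Dict

mask_ratios = {  # values copied from configs/opensora-v1-2/train/stage1.py
    "random": 0.05,
    "intepolate": 0.005,
    "quarter_random": 0.005,
    "quarter_head": 0.005,
    "quarter_tail": 0.005,
    "quarter_head_tail": 0.005,
    "image_random": 0.025,
    "image_head": 0.05,
    "image_tail": 0.025,
    "image_head_tail": 0.025,
}


def sample_mask_type(ratios: Dict[str, float]) -> str:
    """Pick a strategy; unallocated probability falls back to 'identity' (no mask)."""
    r = random.random()
    cumulative = 0.0
    for name, prob in ratios.items():
        cumulative += prob
        if r < cumulative:
            return name
    return "identity"


if __name__ == "__main__":
    counts: Dict[str, int] = {}
    for _ in range(10_000):
        chosen = sample_mask_type(mask_ratios)
        counts[chosen] = counts.get(chosen, 0) + 1
    print(sorted(counts.items(), key=lambda kv: -kv[1])[:3])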