File size: 1,505 Bytes
9670e85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Clip sampling: 17 frames per clip at 256x256 resolution.
num_frames = 17
image_size = (256, 256)

# Define dataset
dataset = {
    "type": "VideoTextDataset",
    "data_path": None,  # presumably injected at launch time — TODO confirm
    "num_frames": num_frames,
    "frame_interval": 1,  # take consecutive frames
    "image_size": image_size,
}

# Define acceleration
num_workers = 16  # dataloader worker processes
dtype = "bf16"  # training precision (bfloat16)
grad_checkpoint = True  # gradient checkpointing: trade compute for activation memory
plugin = "zero2"  # presumably a ZeRO stage-2 booster/plugin name — confirm against trainer

# Define model: a pipeline combining a 2D (spatial) VAE and a temporal VAE.
model = {
    "type": "VideoAutoencoderPipeline",
    "freeze_vae_2d": False,  # 2D VAE is NOT frozen in this config
    "from_pretrained": None,  # no pipeline-level checkpoint to load
    "cal_loss": True,
    # Spatial autoencoder, loaded from a local pretrained SDXL-style VAE.
    "vae_2d": {
        "type": "VideoAutoencoderKL",
        "from_pretrained": "PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
        "subfolder": "vae",
        "local_files_only": True,  # never fetch weights from the network
    },
    # Temporal autoencoder, initialized from scratch.
    "vae_temporal": {
        "type": "VAE_Temporal_SD",
        "from_pretrained": None,
    },
}

# Discriminator used for the adversarial loss.
# NOTE(review): `from_pretrained` is a machine-specific absolute path under a
# user's home directory — parameterize or relocate before sharing this config.
discriminator = {
    "type": "NLayerDiscriminator",
    "from_pretrained": "/home/shenchenhui/opensoraplan-v1.0.0-discriminator.pt",
    "input_nc": 3,  # presumably RGB input channels — confirm
    "n_layers": 3,
    "use_actnorm": False,
}

# Discriminator hyper-parameters.  TODO: tune.
discriminator_factor = 1  # scale applied to the discriminator loss term
discriminator_start = -1  # step at which the discriminator kicks in; -1 presumably means "immediately" — TODO confirm
generator_factor = 0.5  # scale applied to the generator's adversarial loss term
generator_loss_type = "hinge"
discriminator_loss_type = "hinge"
lecam_loss_weight = None  # None presumably disables the LeCam regularizer — confirm in trainer
gradient_penalty_loss_weight = None  # None presumably disables gradient penalty — confirm in trainer

# Loss weights
perceptual_loss_weight = 0.1  # VGG perceptual loss; presumably active only when not None and > 0 — confirm in trainer
kl_loss_weight = 1e-6

mixed_image_ratio = 0.2  # fraction of image (vs. video) samples mixed into training — TODO confirm semantics
use_real_rec_loss = True  # reconstruction loss against the real input
use_z_rec_loss = False  # latent-space reconstruction loss, disabled here
use_image_identity_loss = False

# Others
seed = 42  # global RNG seed for reproducibility
outputs = "outputs"  # output directory for checkpoints/logs
wandb = False  # Weights & Biases logging disabled

epochs = 100
log_every = 1  # log every iteration
ckpt_every = 1000  # checkpoint interval (presumably in steps — confirm)
load = None  # path to resume from; None starts fresh

batch_size = 1
lr = 1e-5  # learning rate
grad_clip = 1.0  # gradient-norm clipping threshold