# Spaces: Runtime error
# (Page status text captured together with this file; it is not part of the configuration.)
# Inference configuration with two top-level sections:
#   args  - run-level options (checkpoint to load, prompt input, sampling length, output).
#   model - component tree; each `*_config` entry names a class via `target` and
#           passes constructor keyword arguments via `params`.
#
# NOTE(review): the nesting below was reconstructed from the `target`/`params`
# pattern after the original indentation was lost; confirm it against the code
# that loads this file before relying on it.

args:
  latent_channels: 16  # same value as z_channels / in_channels / out_channels under `model`
  mode: inference
  load: "CogVideoX-2b-sat/transformer"  # checkpoint location; presumably relative to the working directory - verify
  batch_size: 1
  input_type: txt
  input_file: test.txt
  sampling_num_frames: 13  # Must be 13, 11 or 9
  sampling_fps: 8
  fp16: True
  output_dir: outputs/
  force_inference: True

model:
  scale_factor: 1.15258426
  disable_first_stage_autocast: true
  log_keys:
    - txt

  # Denoiser wrapper plus its weighting, scaling and discretization helpers.
  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000
      quantize_c_noise: False

      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 3.0  # same value is used by the loss and sampler discretizations below

  # Diffusion transformer backbone.
  network_config:
    target: dit_video_concat.DiffusionTransformer
    params:
      time_embed_dim: 512
      elementwise_affine: True
      # NOTE(review): (num_frames - 1) / time_compressed_rate + 1 = 13, which equals
      # args.sampling_num_frames - presumably these must stay in sync; confirm.
      num_frames: 49
      time_compressed_rate: 4
      latent_width: 90
      latent_height: 60
      num_layers: 30
      patch_size: 2
      in_channels: 16
      out_channels: 16
      hidden_size: 1920
      adm_in_channels: 256
      num_attention_heads: 30

      transformer_args:
        vocab_size: 1
        max_sequence_length: 64
        layernorm_order: pre
        skip_init: false
        model_parallel_size: 1
        is_decoder: false

      # Mixins attached to the transformer.
      modules:
        pos_embed_config:
          target: dit_video_concat.Basic3DPositionEmbeddingMixin
          params:
            text_length: 226  # equals the text embedder's max_length below
            height_interpolation: 1.875
            width_interpolation: 1.875

        patch_embed_config:
          target: dit_video_concat.ImagePatchEmbeddingMixin
          params:
            text_hidden_size: 4096

        adaln_layer_config:
          target: dit_video_concat.AdaLNMixin
          params:
            qk_ln: True

        final_layer_config:
          target: dit_video_concat.FinalLayerMixin

  # Text conditioning: a single embedder that reads the `txt` input key.
  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        - is_trainable: false
          input_key: txt
          ucg_rate: 0.1
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            model_dir: "google/t5-v1_1-xxl"
            max_length: 226

  # First stage (video autoencoder) with its encoder and decoder.
  first_stage_config:
    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
    params:
      cp_size: 1
      ckpt_path: "CogVideoX-2b-sat/vae/3d-vae.pt"
      ignore_keys: [ 'loss' ]

      loss_config:
        target: torch.nn.Identity

      regularizer_config:
        target: vae_modules.regularizers.DiagonalGaussianRegularizer

      encoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
        params:
          double_z: true
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1, 2, 2, 4 ]
          attn_resolutions: [ ]
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: True

      # Same parameters as the encoder except gather_norm, which is false here.
      decoder_config:
        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
        params:
          double_z: True
          z_channels: 16
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [ 1, 2, 2, 4 ]
          attn_resolutions: [ ]
          num_res_blocks: 3
          dropout: 0.0
          gather_norm: false

  # Training loss definition; presumably unused when args.mode is `inference` - verify.
  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
    params:
      offset_noise_level: 0
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          uniform_sampling: True
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
            params:
              shift_scale: 3.0

  # Sampler used for generation, with its own discretization and guidance settings.
  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
    params:
      num_steps: 50
      verbose: True

      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
        params:
          shift_scale: 3.0

      guider_config:
        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
        params:
          scale: 6
          exp: 5
          num_steps: 50  # equals the sampler's num_steps above