{
  "vae": {
    "_class_name": "CausalVideoAutoencoder",
    "dims": 3,
    "in_channels": 3,
    "out_channels": 3,
    "latent_channels": 128,
    "encoder_blocks": [
      [
        "res_x",
        {
          "num_layers": 4
        }
      ],
      [
        "compress_space_res",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 6
        }
      ],
      [
        "compress_time_res",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 6
        }
      ],
      [
        "compress_all_res",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 2
        }
      ],
      [
        "compress_all_res",
        {
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 2
        }
      ]
    ],
    "decoder_blocks": [
      [
        "res_x",
        {
          "num_layers": 5,
          "inject_noise": false
        }
      ],
      [
        "compress_all",
        {
          "residual": true,
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 5,
          "inject_noise": false
        }
      ],
      [
        "compress_all",
        {
          "residual": true,
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 5,
          "inject_noise": false
        }
      ],
      [
        "compress_all",
        {
          "residual": true,
          "multiplier": 2
        }
      ],
      [
        "res_x",
        {
          "num_layers": 5,
          "inject_noise": false
        }
      ]
    ],
    "scaling_factor": 1.0,
    "norm_layer": "pixel_norm",
    "patch_size": 4,
    "latent_log_var": "uniform",
    "use_quant_conv": false,
    "causal_decoder": false,
    "timestep_conditioning": false,
    "normalize_latent_channels": false,
    "encoder_base_channels": 128,
    "decoder_base_channels": 128
  },
  "_class_name": "CausalVideoAutoencoder"
}