{
  "bottleneck": {
    "args": {
      "bottleneck_dim": 16,
      "norm": "none",
      "regularizer": {
        "args": {
          "codebook_loss_weight": 1.0,
          "codebook_size": 8192,
          "commitment_loss_weight": 0.25,
          "entropy_loss_temperature": 0.01,
          "entropy_loss_weight": 0.0,
          "l2_normalized": true,
          "stochastic": true,
          "stochastic_temperature": 0.03
        },
        "name": "vq"
      }
    },
    "name": "bottleneck"
  },
  "bottleneck_token_num": 1024,
  "decoder_depth": 12,
  "decoder_hidden_size": 768,
  "decoder_name": "none",
  "decoder_num_heads": 12,
  "decoder_patch_size": 8,
  "decoder_temporal_patch_size": 4,
  "encoder_depth": 12,
  "encoder_hidden_size": 768,
  "encoder_name": "none",
  "encoder_num_heads": 12,
  "encoder_query_gaussian_init": true,
  "frame_num": 16,
  "in_channels": 3,
  "input_size": 128,
  "latent_pe_scale_factor": 10000,
  "learned_decoder_latent_pe": false,
  "learned_decoder_patch_query_embed": false,
  "learned_encoder_latent_query_embed": true,
  "learned_encoder_patch_pe": false,
  "patch_size": 8,
  "prior_model": {
    "args": {
      "l2_normalized": true
    },
    "avg_loss_over_rounds": true,
    "latent_ce_temperature": 1.0,
    "mix_ss_max_ratio": 0.5,
    "mix_ss_peak_steps_ratio": 0.3,
    "n_rounds": 2,
    "name": "gptc-S",
    "no_dropout": false,
    "no_grad_before_last_round": false,
    "use_mix_ss": true
  },
  "query_init_std": 0.02,
  "temporal_patch_size": 4,
  "transformer_name": "transformer_encoder_parallel",
  "use_decoder_latent_token_type_embed": false,
  "use_decoder_patch_query_token_type_embed": true,
  "use_encoder_latent_query_token_type_embed": false,
  "use_encoder_patch_token_type_embed": false
}