{ "run_cfg": { "checkpoint": "", "output_dir": "./output/hdli/pretrain_mmctx", "gradient_accumulation_steps": 1, "clip_lr": 5e-07, "optim": "adamw", "learning_rate": 5e-05, "betas": [ 0.9, 0.98 ], "weight_decay": 0.01, "grad_norm": 2.0, "warmup_ratio": 0.1, "resume": false, "seed": 50, "fp16": true, "bf16": false, "zero_shot": false, "scheduler": "warmup_linear", "new_lr": 0, "new_params_name": [], "valid_freq": 10, "dataset_mix_type": "random", "remove_before_ckpt": true, "first_eval": true, "pretrain_dir": "./output/vast/pretrain_vast", "num_train_steps": 0, "save_best": false, "pin_mem": true, "vision_resolution": 224, "pretrain_concat_num": 1, "use_ddp": false, "mode": "training", "log_steps": 100, "default": "./config/mmctx/default_run_cfg.json" }, "model_cfg": { "model_type": "mmctx", "itm_ratio": 0.1, "frozen_vision": false, "frozen_audio": false, "checkpointing": true, "pool_video": false, "max_caption_len": 40, "max_omni_caption_len": 70, "max_subtitle_len": 70, "contra_dim": 512, "inherit_keys": [ "vision_encoder_type", "audio_encoder_type" ], "frame_embedding_type": "adaptive", "vision_resolution": 224, "vision_encoder_type": "evaclip01_giant", "audio_encoder_type": "beats", "audio_melbins": 224, "audio_target_length": 224, "beam_size": 3, "captioner_mode": false, "generate_nums": 1, "ret_bidirection_evaluation": false, "itm_rerank_num": 50, "evaluation_type": "evaluation_mm", "default": "./config/mmctx/default_model_cfg.json", "max_vision_sample_num": 2, "max_audio_sample_num": 2, "max_depth_sample_num": 1 }, "data_cfg": { "train": [ ], "val": [], "concatenated_nums": 1 }, "local_rank": 0 }