# Training configuration with three top-level sections:
#   model    - architecture, checkpoint, and LoRA / context settings
#   datasets - per-dataset batch size, processors, and sampling ratio
#   run      - task, optimizer schedule, and runtime / distributed settings

model:
  arch: mini_gpt4_llama_v2
  model_type: pretrain_vicuna
  freeze_vit: True
  freeze_qformer: True
  max_txt_len: 256
  low_resource: False
  image_size: 224
  # NOTE(review): end_sym is an empty string here. If an end-of-sequence
  # marker such as "</s>" was intended, restore it - confirm with the model code.
  end_sym: ""
  llama_model: "meta-llama/Llama-2-7b-chat-hf"
  ckpt: "checkpoints/image_llama2_checkpoint.pth"
  use_grad_checkpoint: True
  chat_template: True
  lora_r: 64
  lora_alpha: 16
  length: 50
  use_grad_checkpoint_llm: True
  max_context_len: 3600
  token_pooling: True

datasets:
  cmd_video:  # 15938 (presumably the number of samples - TODO confirm)
    batch_size: 4
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 100
  webvid:  # 42387 (presumably the number of samples - TODO confirm)
    batch_size: 4
    vis_processor:
      train:
        name: "blip2_image_train"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
    sample_ratio: 50

run:
  task: image_text_pretrain

  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  # Learning rates carry an explicit ".0" mantissa so that YAML 1.1 loaders
  # resolve them as floats instead of strings; the numeric values are unchanged.
  init_lr: 1.0e-4
  min_lr: 8.0e-5
  warmup_lr: 1.0e-6

  weight_decay: 0.05
  max_epoch: 50
  num_workers: 16
  warmup_steps: 1000
  iters_per_epoch: 1000

  seed: 42
  output_dir: "training_output/cmd_webvid_pretrain"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True