model:
  arch: video_instruction_llama
  model_type: pretrain_vicuna
  freeze_vit: True
  freeze_qformer: True

  # Q-Former
  num_query_token: 32

  # If you want to train models based on LLaMA-2-chat,
  # some ckpts can be downloaded from our provided Hugging Face repo,
  # i.e. https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Finetuned
  #llama_model: "/projectnb/ivc-ml/rxtan/llama-2-7b-chat-hf/"
  llama_model: "meta-llama/Llama-2-7b-chat-hf"
  imagebind_ckpt_path: "ckpt/imagebind_path/"

  # The ckpt of the vision branch after stage-1 pretraining;
  # you can use our pretrained ckpt from https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained/
  ckpt: 'ckpt/VL_LLaMA_2_7B_Finetuned.pth'

  # Only train the vision branch: with the video Q-Former frozen below,
  # the LLaMA projection layer is the only trainable module.
  equip_audio_branch: False  # whether to equip the audio branch
  frozen_llama_proj: False
  frozen_video_Qformer: True
  frozen_audio_Qformer: True

  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"

  max_txt_len: 320

  # for llama_2_chat:
  end_sym: "</s>"
  prompt_path: "prompts/alignment_image.txt"
  prompt_template: '[INST] <<SYS>>\n \n<</SYS>>\n\n{} [/INST] '

datasets:
  webvid:
    vis_processor:
      train:
        # eval-style processor: uniform frame sampling, no training-time augmentation
        name: "alpro_video_eval"
        n_frms: 8
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"

run:
  task: video_text_pretrain

  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 3e-5
  min_lr: 1e-5
  warmup_lr: 1e-6
  weight_decay: 0.05
  max_epoch: 3
  iters_per_epoch: 1000
  batch_size_train: 4
  batch_size_eval: 4
  num_workers: 4
  warmup_steps: 1000

  seed: 42
  output_dir: "output/videollama_stage2_finetune"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
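
# Usage sketch (an assumption, not part of the config itself): Video-LLaMA
# configs like this one are consumed by a LAVIS/MiniGPT-4-style train.py
# entry point, e.g.
#
#   torchrun --nproc_per_node=1 train.py --cfg-path path/to/this_config.yaml
#
# Set --nproc_per_node to your GPU count; with "env://" as dist_url, the
# distributed world size is taken from the launcher's environment variables
# at runtime rather than from the world_size value above.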