---
# Configuration with three top-level sections: `model`, `datasets`, `run`.
# NOTE(review): the block nesting below was reconstructed from a copy of this
# file whose line breaks and indentation had been lost; confirm the hierarchy
# against the schema the consuming code expects.

# Model settings.
model:
  arch: video_llama
  model_type: pretrain_vicuna
  freeze_vit: true
  freeze_qformer: true
  max_txt_len: 140
  # Quoted on purpose: an unquoted `#` sequence would be read as a comment.
  end_sym: "###"
  low_resource: false

  llama_model: "DAMO-NLP-SG/vicuna-7b"

  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"

  # Relative paths; presumably resolved against the working directory of the
  # process that loads this file -- TODO confirm.
  ckpt: 'ckpt/finetune-vicuna7b-v2.pth'
  q_former_model: 'ckpt/blip2_pretrained_flant5xxl.pth'

# Per-dataset processor settings.
datasets:
  webvid:
    vis_processor:
      train:
        name: "alpro_video_eval"
        n_frms: 8
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"

# Run settings.
run:
  task: video_text_pretrain