# Video-LLaMA / eval_configs / video_llama_eval.yaml
# 舟勀
# test
# 8eaeb01
# Model settings for the `video_llama` architecture. Every key in this section
# is nested under `model:` so the section parses as a single mapping.
model:
  arch: video_llama
  model_type: pretrain_vicuna
  # Both flags are enabled. Presumably they keep the visual encoder and the
  # Q-Former weights fixed -- TODO confirm against the model code.
  freeze_vit: true
  freeze_qformer: true
  # NOTE(review): looks like a cap on text length; the unit is not shown here.
  max_txt_len: 140
  # Quoted on purpose: an unquoted leading `#` would start a YAML comment.
  end_sym: "###"
  low_resource: false
  # Presumably a model identifier or path resolved by the loader -- verify.
  llama_model: "DAMO-NLP-SG/vicuna-7b"
  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"
  # Checkpoint paths are relative (`ckpt/...`); the directory they resolve
  # against is not defined in this file.
  ckpt: 'ckpt/finetune-vicuna7b-v2.pth'
  q_former_model: 'ckpt/blip2_pretrained_flant5xxl.pth'
# Dataset-side settings. Each dataset name (here only `webvid`) maps to a
# visual processor and a text processor, and each processor has its own
# `train` entry. `train` and `name` therefore occur more than once and must
# stay nested; at a single mapping level they would be duplicate keys.
datasets:
  webvid:
    vis_processor:
      # NOTE(review): the processors are configured under `train` although the
      # processor name ends in `_eval` -- confirm which entry the loader reads.
      train:
        name: "alpro_video_eval"
        # Presumably the number of frames sampled per video and the input
        # image size -- TODO confirm against the processor implementation.
        n_frms: 8
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"
# Run settings: selects the task by name. `task` is nested under `run:`.
run:
  task: video_text_pretrain