model:
  arch: video_instruction_llama
  model_type: pretrain_vicuna
  freeze_vit: True
  freeze_qformer: True

  # Q-Former
  num_query_token: 32

  # If you want to train models based on LLaMA-2-chat,
  # some ckpts can be downloaded from our provided Hugging Face repo,
  # i.e. https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Finetuned
  #llama_model: "/projectnb/ivc-ml/rxtan/llama-2-7b-chat-hf/"
  llama_model: "meta-llama/Llama-2-7b-chat-hf"

  imagebind_ckpt_path: "ckpt/imagebind_path/"

  # The ckpt of the vision branch after stage-1 pretraining;
  ckpt: 'ckpt/VL_LLaMA_2_7B_Finetuned.pth'  # you can use our pretrained ckpt from https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained/

  # only train vision branch
  equip_audio_branch: False  # whether to equip the audio branch
  frozen_llama_proj: False
  frozen_video_Qformer: True
  frozen_audio_Qformer: True
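  # Reading of the flags above (an assumption based on how the upstream
  # Video-LLaMA code gates requires_grad with these switches): with the ViT
  # and both Q-Formers frozen and the audio branch disabled, only the
  # llama_proj layer that maps video query tokens into the LLaMA embedding
  # space is left trainable.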

  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"

  max_txt_len: 320

  # for llama_2_chat:
  end_sym: "</s>"
  prompt_path: "prompts/alignment_image.txt"
  prompt_template: '[INST] <<SYS>>\n \n<</SYS>>\n\n{} [/INST] '
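  # Rendering sketch (assuming the template is filled via Python's str.format,
  # as in the upstream code; the query below is a made-up example):
  #   prompt_template.format("What is happening in the video?")
  #   -> '[INST] <<SYS>>\n \n<</SYS>>\n\nWhat is happening in the video? [/INST] '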

datasets:
  webvid:
    vis_processor:
      train:
        name: "alpro_video_eval"
        n_frms: 8
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"

run:
  task: video_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 3e-5
  min_lr: 1e-5
  warmup_lr: 1e-6

  weight_decay: 0.05
  max_epoch: 3
  iters_per_epoch: 1000
  batch_size_train: 4
  batch_size_eval: 4
  num_workers: 4
  warmup_steps: 1000
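  # Schedule sketch (assuming the LAVIS-style linear_warmup_cosine_lr
  # scheduler): the lr ramps linearly from warmup_lr (1e-6) to init_lr (3e-5)
  # over the first warmup_steps (1000) iterations, then cosine-decays toward
  # min_lr (1e-5). Total steps = max_epoch * iters_per_epoch = 3 * 1000 = 3000,
  # so warmup spans the whole first epoch.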

  seed: 42
  output_dir: "output/videollama_stage2_finetune"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
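
# Launch sketch (the config path below is a placeholder; point it at wherever
# this file is saved, and the command follows the Video-LLaMA README):
#   torchrun --nproc_per_node=1 train.py --cfg-path train_configs/videollama_stage2_finetune.yaml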