model:
  arch: video_instruction_llama
  model_type: pretrain_vicuna
  freeze_vit: True
  freeze_qformer: True

  # Q-Former
  num_query_token: 32

  # If you want to train models based on LLaMA-2-chat,
  # some ckpts can be downloaded from our provided Hugging Face repo,
  # i.e. https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Finetuned
  #llama_model: "/projectnb/ivc-ml/rxtan/llama-2-7b-chat-hf/"
  llama_model: "meta-llama/Llama-2-7b-chat-hf"

  imagebind_ckpt_path: "ckpt/imagebind_path/"

  # The ckpt of the vision branch after stage-1 pretraining;
  ckpt: 'ckpt/VL_LLaMA_2_7B_Finetuned.pth'  # you can use our pretrained ckpt from https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained/

  # only train vision branch
  equip_audio_branch: False  # whether to equip the audio branch
  frozen_llama_proj: False
  frozen_video_Qformer: True
  frozen_audio_Qformer: True
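  # Reading of the flags above (an assumption based on how the upstream
  # Video-LLaMA code gates requires_grad with these switches): with the ViT
  # and both Q-Formers frozen and the audio branch disabled, only the
  # llama_proj layer that maps video query tokens into the LLaMA embedding
  # space is left trainable.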

  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"

  max_txt_len: 320

  # for llama_2_chat:
  end_sym: "</s>"
  prompt_path: "prompts/alignment_image.txt"
  prompt_template: '[INST] <<SYS>>\n \n<</SYS>>\n\n{} [/INST] '
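  # Rendering sketch (assuming the template is filled via Python's str.format,
  # as in the upstream code; the query below is a made-up example):
  #   prompt_template.format("What is happening in the video?")
  #   -> '[INST] <<SYS>>\n \n<</SYS>>\n\nWhat is happening in the video? [/INST] '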

datasets:
  webvid:
    vis_processor:
      train:
        name: "alpro_video_eval"
        n_frms: 8
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"

run:
  task: video_text_pretrain
  # optimizer
  lr_sched: "linear_warmup_cosine_lr"
  init_lr: 3e-5
  min_lr: 1e-5
  warmup_lr: 1e-6

  weight_decay: 0.05
  max_epoch: 3
  iters_per_epoch: 1000
  batch_size_train: 4
  batch_size_eval: 4
  num_workers: 4
  warmup_steps: 1000
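  # Schedule sketch (assuming the LAVIS-style linear_warmup_cosine_lr
  # scheduler): the lr ramps linearly from warmup_lr (1e-6) to init_lr (3e-5)
  # over the first warmup_steps (1000) iterations, then cosine-decays toward
  # min_lr (1e-5). Total steps = max_epoch * iters_per_epoch = 3 * 1000 = 3000,
  # so warmup spans the whole first epoch.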

  seed: 42
  output_dir: "output/videollama_stage2_finetune"

  amp: True
  resume_ckpt_path: null

  evaluate: False
  train_splits: ["train"]

  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: True
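
# Launch sketch (the config path below is a placeholder; point it at wherever
# this file is saved, and the command follows the Video-LLaMA README):
#   torchrun --nproc_per_node=1 train.py --cfg-path train_configs/videollama_stage2_finetune.yaml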