Reself committed
Commit 17d6fb8
1 Parent(s): 1ac6868

Upload folder using huggingface_hub

llm/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_name_or_path": "lmsys/vicuna-7b-v1.5-16k",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 11008,
+   "max_position_embeddings": 4096,
+   "max_sequence_length": 16384,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 32,
+   "pad_token_id": 0,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": {
+     "factor": 4.0,
+     "type": "linear"
+   },
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.37.2",
+   "use_cache": true,
+   "vocab_size": 32000
+ }
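
The key detail in this config is the linear RoPE scaling: the base Llama window of 4096 positions is stretched by a factor of 4.0, giving the 16384-token context that `max_sequence_length` advertises. A minimal sketch of checking that relationship, assuming the repo is checked out locally so that `llm/` resolves as a path:

```python
from transformers import AutoConfig

# "llm" is the folder added in this commit; adjust the path as needed.
cfg = AutoConfig.from_pretrained("llm")

# Linear RoPE scaling stretches the pretraining window by the given factor:
# 4096 positions * 4.0 = 16384 tokens of usable context.
assert cfg.max_position_embeddings * cfg.rope_scaling["factor"] == 16384
```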
llm/generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "bos_token_id": 1,
+   "do_sample": true,
+   "eos_token_id": 2,
+   "max_length": 16384,
+   "pad_token_id": 0,
+   "temperature": 0.9,
+   "top_p": 0.6,
+   "transformers_version": "4.37.2"
+ }
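
With `do_sample` enabled, `generate()` picks these defaults up automatically from `generation_config.json`. A sketch, assuming the `llm/` folder from this commit is used as a local checkpoint:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("llm")
model = AutoModelForCausalLM.from_pretrained(
    "llm", torch_dtype=torch.float16, device_map="auto"
)

# generation_config.json is loaded automatically, so this samples with
# temperature=0.9 and nucleus sampling at top_p=0.6 by default.
inputs = tok("Hello, how are you?", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tok.decode(out[0], skip_special_tokens=True))
```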
llm/pytorch_model-00001-of-00007.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5f0930d41985d7ad41560deae0b420fde08ea1137eb5e8e7ae17ed8d959347d
+ size 1981888506
llm/pytorch_model-00002-of-00007.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a537aeaafb0642be98e293759ad6228d35e2a0dcd5256d4e545944e9cb7f14b9
+ size 1990294914
llm/pytorch_model-00003-of-00007.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f369b5b02edb0620b9a8b089aeb6bfe8b65d417c135c0636e1ebe45a3dbbf67
+ size 1990294978
llm/pytorch_model-00004-of-00007.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:491c8f011be678033613b5a757d25478ae9356d3d0f489711d16ce13be0c0124
+ size 1990294978
llm/pytorch_model-00005-of-00007.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be5d1457665213cbe6d5ba068150530f73043e15e002680bba5caff93fd618b3
+ size 1933654814
llm/pytorch_model-00006-of-00007.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3fbb83efac542fc219cebdfd00f1cb6a51e03928d87a7d140e7abd0ed57f711b
+ size 1933671874
llm/pytorch_model-00007-of-00007.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e015ce0ac5f9db55a5c21fc17a0b7502b8caa47d2f2d70f571335a5d04e3adaa
+ size 1656835708
llm/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,298 @@
+ {
+   "metadata": {
+     "total_size": 13476831232
+   },
+   "weight_map": {
+     "lm_head.weight": "pytorch_model-00007-of-00007.bin",
+     "model.embed_tokens.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.10.input_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.10.mlp.down_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.10.mlp.up_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.11.input_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.11.mlp.down_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.11.mlp.up_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.12.input_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.12.mlp.down_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.12.mlp.up_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.13.input_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.13.mlp.down_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.13.mlp.up_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.14.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.14.mlp.down_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.14.mlp.up_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.15.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.15.mlp.down_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.15.mlp.up_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.16.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.16.mlp.down_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.16.mlp.up_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.17.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.17.mlp.down_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.17.mlp.up_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.18.input_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.18.mlp.down_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.18.mlp.up_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00004-of-00007.bin",
+     "model.layers.19.input_layernorm.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.19.mlp.down_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.19.mlp.up_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.20.input_layernorm.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.20.mlp.down_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.20.mlp.up_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.21.input_layernorm.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.21.mlp.down_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.21.mlp.up_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.22.input_layernorm.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.22.mlp.down_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.22.mlp.up_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.23.input_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.23.mlp.down_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.23.mlp.up_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00005-of-00007.bin",
+     "model.layers.24.input_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.24.mlp.down_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.24.mlp.up_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.25.input_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.25.mlp.down_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.25.mlp.up_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.26.input_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.26.mlp.down_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.26.mlp.up_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.27.input_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.27.mlp.down_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.27.mlp.up_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.28.input_layernorm.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.28.mlp.down_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.28.mlp.up_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00006-of-00007.bin",
+     "model.layers.29.input_layernorm.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.29.mlp.down_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.29.mlp.up_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.30.input_layernorm.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.30.mlp.down_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.30.mlp.up_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.31.input_layernorm.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.31.mlp.down_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.31.mlp.up_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00007-of-00007.bin",
+     "model.layers.4.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.4.mlp.down_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.4.mlp.up_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00007.bin",
+     "model.layers.5.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.5.mlp.down_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.5.mlp.up_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.6.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.6.mlp.down_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.6.mlp.up_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.7.mlp.down_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.7.mlp.up_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.8.mlp.up_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.9.input_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.9.mlp.down_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.9.mlp.up_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00007.bin",
+     "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00003-of-00007.bin",
+     "model.norm.weight": "pytorch_model-00007-of-00007.bin"
+   }
+ }
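
This index file is what lets `from_pretrained` load the sharded checkpoint lazily: each tensor name maps to the shard that holds it, so only the files actually needed get opened. Reading it directly is a cheap sanity check:

```python
import json

with open("llm/pytorch_model.bin.index.json") as f:
    index = json.load(f)

# Total parameter bytes across all seven shards (~12.6 GiB in fp16).
print(index["metadata"]["total_size"])  # 13476831232

# Which shard holds a given tensor.
print(index["weight_map"]["model.norm.weight"])  # pytorch_model-00007-of-00007.bin
```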
llm/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
llm/tokenizer.json ADDED
The diff for this file is too large to render.
llm/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
llm/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 16384,
+   "pad_token": "<unk>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
llm_adapter/README.md ADDED
@@ -0,0 +1,204 @@
+ ---
+ library_name: peft
+ base_model: lmsys/vicuna-7b-v1.5-16k
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+
+ ### Framework versions
+
+ - PEFT 0.7.1
llm_adapter/adapter_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "lmsys/vicuna-7b-v1.5-16k",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 256,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 512,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "k_proj",
+     "q_proj",
+     "up_proj",
+     "v_proj",
+     "down_proj",
+     "gate_proj",
+     "o_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
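
This is an unusually heavy LoRA: rank 512 with `lora_alpha=256` (a scaling of alpha/r = 0.5) on all seven Llama projection matrices, which is why the adapter file below weighs in at ~2.5 GB. A minimal loading sketch, assuming the `llm/` and `llm_adapter/` folders from this commit sit in the working directory; the full pipeline also wires in the vision tower and projector, which this omits:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("llm", torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, "llm_adapter")

# Optionally fold the LoRA deltas back into the base weights for inference.
model = model.merge_and_unload()
```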
llm_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2d6c537e3423a9fe2e02094b3b3996a86c25aa59b033231b682ad2da630c4e1f
+ size 2558587064
projector/config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "architectures": [
+     "ProjectorModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_projector.ProjectorConfig",
+     "AutoModel": "modeling_projector.ProjectorModel"
+   },
+   "bias": true,
+   "depth": 2,
+   "hidden_act": "gelu",
+   "llm_hidden_size": 4096,
+   "model_type": "projector",
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.2",
+   "visual_hidden_size": 1280
+ }
projector/configuration_projector.py ADDED
@@ -0,0 +1,23 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ from transformers import PretrainedConfig
+
+
+ class ProjectorConfig(PretrainedConfig):
+     model_type = "projector"
+     _auto_class = "AutoConfig"
+
+     def __init__(
+         self,
+         visual_hidden_size=4096,
+         llm_hidden_size=4096,
+         depth=2,
+         hidden_act="gelu",
+         bias=True,
+         **kwargs,
+     ):
+         self.visual_hidden_size = visual_hidden_size
+         self.llm_hidden_size = llm_hidden_size
+         self.depth = depth
+         self.hidden_act = hidden_act
+         self.bias = bias
+         super().__init__(**kwargs)
projector/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2b805fdce8968993d54e23535ff93f113833a35d6c3b629e82ce7fcbf2943c5
+ size 88113520
projector/modeling_projector.py ADDED
@@ -0,0 +1,41 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ import torch
+ import torch.nn as nn
+ from transformers import PreTrainedModel
+ from transformers.activations import ACT2FN
+
+ from .configuration_projector import ProjectorConfig
+
+
+ class ProjectorModel(PreTrainedModel):
+     _auto_class = "AutoModel"
+     config_class = ProjectorConfig
+     base_model_prefix = "model"
+     supports_gradient_checkpointing = True
+
+     def __init__(self, config: ProjectorConfig) -> None:
+         super().__init__(config)
+         self.gradient_checkpointing = False
+
+         modules = [nn.Linear(config.visual_hidden_size, config.llm_hidden_size, bias=config.bias)]
+         for _ in range(1, config.depth):
+             modules.append(ACT2FN[config.hidden_act])
+             modules.append(nn.Linear(config.llm_hidden_size, config.llm_hidden_size, bias=config.bias))
+         self.model = nn.Sequential(*modules)
+
+     def enable_input_require_grads(self):
+         def make_inputs_require_grad(module, input, output):
+             output.requires_grad_(True)
+
+         self.model.register_forward_hook(make_inputs_require_grad)
+
+     def _set_gradient_checkpointing(self, module, value=False):
+         if isinstance(module, ProjectorModel):
+             module.gradient_checkpointing = value
+
+     def forward(self, x):
+         if self.gradient_checkpointing and self.training:
+             layer_outputs = torch.utils.checkpoint.checkpoint(self.model, x)
+         else:
+             layer_outputs = self.model(x)
+         return layer_outputs
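
Because `projector/config.json` carries an `auto_map`, transformers can instantiate this class straight from the folder with `trust_remote_code=True`. A usage sketch with shapes taken from this repo's configs (1280-d ViT-H features in, 4096-d Vicuna embeddings out; a 378×378 image at patch size 14 yields (378/14)² = 729 patch tokens):

```python
import torch
from transformers import AutoModel

# trust_remote_code pulls in the two .py files above via auto_map.
projector = AutoModel.from_pretrained("projector", trust_remote_code=True)

visual_feats = torch.randn(1, 729, 1280)   # dummy ViT-H patch features
llm_tokens = projector(visual_feats)
print(llm_tokens.shape)                    # torch.Size([1, 729, 4096])
```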
visual_encoder_adapter/README.md ADDED
@@ -0,0 +1,204 @@
+ ---
+ library_name: peft
+ base_model: apple/DFN5B-CLIP-ViT-H-14-378
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+
+ ### Framework versions
+
+ - PEFT 0.7.1
visual_encoder_adapter/adapter_config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": {
+     "base_model_class": "PikaVidEncoder",
+     "parent_library": "xtuner.model.video_encoder"
+   },
+   "base_model_name_or_path": "apple/DFN5B-CLIP-ViT-H-14-378",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "k_proj",
+     "fc1",
+     "fc2",
+     "q_proj",
+     "v_proj",
+     "out_proj"
+   ],
+   "task_type": null
+ }
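
Unlike the LLM adapter, this one targets the CLIP attention and MLP layers (`q/k/v/out_proj`, `fc1`, `fc2`) at a much lighter rank 64 with `lora_alpha=16` (scaling 0.25 versus 0.5 above). The `auto_mapping` shows it was trained around xtuner's `PikaVidEncoder` wrapper, so attaching it to a bare `CLIPVisionModel` as sketched here is an assumption; it lines up only because the target module names match:

```python
from peft import PeftModel
from transformers import CLIPVisionModel

# Sketch only: the original pipeline wraps the encoder in xtuner's
# PikaVidEncoder (see auto_mapping above), but a plain CLIPVisionModel
# exposes the same q/k/v/out_proj, fc1 and fc2 modules.
vision = CLIPVisionModel.from_pretrained("vit")
vision = PeftModel.from_pretrained(vision, "visual_encoder_adapter")
```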
visual_encoder_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07d6cc5058d81b85ce7c06affe42ba6c3f7724178cbdef6efb773c4e92003c51
+ size 188800496
vit/config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "_name_or_path": "apple/DFN5B-CLIP-ViT-H-14-378",
+   "architectures": [
+     "CLIPVisionModel"
+   ],
+   "attention_dropout": 0.0,
+   "hidden_act": "quick_gelu",
+   "hidden_size": 1280,
+   "image_size": 378,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 5120,
+   "layer_norm_eps": 1e-05,
+   "model_type": "clip_vision_model",
+   "num_attention_heads": 16,
+   "num_channels": 3,
+   "num_hidden_layers": 32,
+   "patch_size": 14,
+   "projection_dim": 512,
+   "torch_dtype": "float32",
+   "transformers_version": "4.37.2"
+ }
vit/preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "crop_size": {
+     "height": 378,
+     "width": 378
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 378
+   }
+ }
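
These are the stock CLIP normalization statistics with the resolution raised to 378: resize so the shortest edge is 378 (bicubic, `resample=3`), center-crop to 378×378, rescale pixel values by 1/255, then normalize. A sketch of the resulting pipeline; `frame.jpg` is a hypothetical input:

```python
from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained("vit")

image = Image.open("frame.jpg")  # hypothetical video frame
pixel_values = processor(images=image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 378, 378])
```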
vit/pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b57623d22174038e0d60e66f4b0c64b834e88d7d3159f10ea2ae02213b67cb7
+ size 1994332295
vit/pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2d2af6f3129761250a4b0be44b53fbd4aef509cd897c6c8484c723cc69acd82f
+ size 531341514
vit/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,526 @@
+ {
+   "metadata": {
+     "total_size": 2525486080
+   },
+   "weight_map": {
+     "vision_model.embeddings.class_embedding": "pytorch_model-00001-of-00002.bin",
+     "vision_model.embeddings.patch_embedding.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.embeddings.position_embedding.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
+     "vision_model.encoder.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
103
+ "vision_model.encoder.layers.13.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
104
+ "vision_model.encoder.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
105
+ "vision_model.encoder.layers.14.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
106
+ "vision_model.encoder.layers.14.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
107
+ "vision_model.encoder.layers.14.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
108
+ "vision_model.encoder.layers.14.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
109
+ "vision_model.encoder.layers.14.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
110
+ "vision_model.encoder.layers.14.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
111
+ "vision_model.encoder.layers.14.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
112
+ "vision_model.encoder.layers.14.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
113
+ "vision_model.encoder.layers.14.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
114
+ "vision_model.encoder.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
115
+ "vision_model.encoder.layers.14.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
116
+ "vision_model.encoder.layers.14.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
117
+ "vision_model.encoder.layers.14.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
118
+ "vision_model.encoder.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
119
+ "vision_model.encoder.layers.14.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
120
+ "vision_model.encoder.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
121
+ "vision_model.encoder.layers.15.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
122
+ "vision_model.encoder.layers.15.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
123
+ "vision_model.encoder.layers.15.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
124
+ "vision_model.encoder.layers.15.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
125
+ "vision_model.encoder.layers.15.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
126
+ "vision_model.encoder.layers.15.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
127
+ "vision_model.encoder.layers.15.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
128
+ "vision_model.encoder.layers.15.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
129
+ "vision_model.encoder.layers.15.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
130
+ "vision_model.encoder.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
131
+ "vision_model.encoder.layers.15.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
132
+ "vision_model.encoder.layers.15.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
133
+ "vision_model.encoder.layers.15.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
134
+ "vision_model.encoder.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
135
+ "vision_model.encoder.layers.15.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
136
+ "vision_model.encoder.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
137
+ "vision_model.encoder.layers.16.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
138
+ "vision_model.encoder.layers.16.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
139
+ "vision_model.encoder.layers.16.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
140
+ "vision_model.encoder.layers.16.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
141
+ "vision_model.encoder.layers.16.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
142
+ "vision_model.encoder.layers.16.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
143
+ "vision_model.encoder.layers.16.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
144
+ "vision_model.encoder.layers.16.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
145
+ "vision_model.encoder.layers.16.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
146
+ "vision_model.encoder.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
147
+ "vision_model.encoder.layers.16.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
148
+ "vision_model.encoder.layers.16.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
149
+ "vision_model.encoder.layers.16.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
150
+ "vision_model.encoder.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
151
+ "vision_model.encoder.layers.16.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
152
+ "vision_model.encoder.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
153
+ "vision_model.encoder.layers.17.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
154
+ "vision_model.encoder.layers.17.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
155
+ "vision_model.encoder.layers.17.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
156
+ "vision_model.encoder.layers.17.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
157
+ "vision_model.encoder.layers.17.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
158
+ "vision_model.encoder.layers.17.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
159
+ "vision_model.encoder.layers.17.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
160
+ "vision_model.encoder.layers.17.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
161
+ "vision_model.encoder.layers.17.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
162
+ "vision_model.encoder.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
163
+ "vision_model.encoder.layers.17.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
164
+ "vision_model.encoder.layers.17.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
165
+ "vision_model.encoder.layers.17.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
166
+ "vision_model.encoder.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
167
+ "vision_model.encoder.layers.17.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
168
+ "vision_model.encoder.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
169
+ "vision_model.encoder.layers.18.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
170
+ "vision_model.encoder.layers.18.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
171
+ "vision_model.encoder.layers.18.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
172
+ "vision_model.encoder.layers.18.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
173
+ "vision_model.encoder.layers.18.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
174
+ "vision_model.encoder.layers.18.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
175
+ "vision_model.encoder.layers.18.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
176
+ "vision_model.encoder.layers.18.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
177
+ "vision_model.encoder.layers.18.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
178
+ "vision_model.encoder.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
179
+ "vision_model.encoder.layers.18.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
180
+ "vision_model.encoder.layers.18.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
181
+ "vision_model.encoder.layers.18.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
182
+ "vision_model.encoder.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
183
+ "vision_model.encoder.layers.18.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
184
+ "vision_model.encoder.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
185
+ "vision_model.encoder.layers.19.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
186
+ "vision_model.encoder.layers.19.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
187
+ "vision_model.encoder.layers.19.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
188
+ "vision_model.encoder.layers.19.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
189
+ "vision_model.encoder.layers.19.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
190
+ "vision_model.encoder.layers.19.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
191
+ "vision_model.encoder.layers.19.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
192
+ "vision_model.encoder.layers.19.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
193
+ "vision_model.encoder.layers.19.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
194
+ "vision_model.encoder.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
195
+ "vision_model.encoder.layers.19.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
196
+ "vision_model.encoder.layers.19.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
197
+ "vision_model.encoder.layers.19.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
198
+ "vision_model.encoder.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
199
+ "vision_model.encoder.layers.19.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
200
+ "vision_model.encoder.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
201
+ "vision_model.encoder.layers.2.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
202
+ "vision_model.encoder.layers.2.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
203
+ "vision_model.encoder.layers.2.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
204
+ "vision_model.encoder.layers.2.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
205
+ "vision_model.encoder.layers.2.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
206
+ "vision_model.encoder.layers.2.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
207
+ "vision_model.encoder.layers.2.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
208
+ "vision_model.encoder.layers.2.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
209
+ "vision_model.encoder.layers.2.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
210
+ "vision_model.encoder.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
211
+ "vision_model.encoder.layers.2.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
212
+ "vision_model.encoder.layers.2.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
213
+ "vision_model.encoder.layers.2.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
214
+ "vision_model.encoder.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
215
+ "vision_model.encoder.layers.2.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
216
+ "vision_model.encoder.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
217
+ "vision_model.encoder.layers.20.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
218
+ "vision_model.encoder.layers.20.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
219
+ "vision_model.encoder.layers.20.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
220
+ "vision_model.encoder.layers.20.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
221
+ "vision_model.encoder.layers.20.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
222
+ "vision_model.encoder.layers.20.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
223
+ "vision_model.encoder.layers.20.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
224
+ "vision_model.encoder.layers.20.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
225
+ "vision_model.encoder.layers.20.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
226
+ "vision_model.encoder.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
227
+ "vision_model.encoder.layers.20.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
228
+ "vision_model.encoder.layers.20.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
229
+ "vision_model.encoder.layers.20.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
230
+ "vision_model.encoder.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
231
+ "vision_model.encoder.layers.20.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
232
+ "vision_model.encoder.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
233
+ "vision_model.encoder.layers.21.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
234
+ "vision_model.encoder.layers.21.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
235
+ "vision_model.encoder.layers.21.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
236
+ "vision_model.encoder.layers.21.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
237
+ "vision_model.encoder.layers.21.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
238
+ "vision_model.encoder.layers.21.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
239
+ "vision_model.encoder.layers.21.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
240
+ "vision_model.encoder.layers.21.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
241
+ "vision_model.encoder.layers.21.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
242
+ "vision_model.encoder.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
243
+ "vision_model.encoder.layers.21.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
244
+ "vision_model.encoder.layers.21.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "vision_model.encoder.layers.21.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
246
+ "vision_model.encoder.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
247
+ "vision_model.encoder.layers.21.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
248
+ "vision_model.encoder.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
249
+ "vision_model.encoder.layers.22.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
250
+ "vision_model.encoder.layers.22.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
251
+ "vision_model.encoder.layers.22.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
252
+ "vision_model.encoder.layers.22.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
253
+ "vision_model.encoder.layers.22.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
254
+ "vision_model.encoder.layers.22.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
255
+ "vision_model.encoder.layers.22.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
256
+ "vision_model.encoder.layers.22.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
257
+ "vision_model.encoder.layers.22.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
258
+ "vision_model.encoder.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
259
+ "vision_model.encoder.layers.22.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
260
+ "vision_model.encoder.layers.22.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
261
+ "vision_model.encoder.layers.22.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
262
+ "vision_model.encoder.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
263
+ "vision_model.encoder.layers.22.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
264
+ "vision_model.encoder.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
265
+ "vision_model.encoder.layers.23.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
266
+ "vision_model.encoder.layers.23.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
267
+ "vision_model.encoder.layers.23.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
268
+ "vision_model.encoder.layers.23.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
269
+ "vision_model.encoder.layers.23.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
270
+ "vision_model.encoder.layers.23.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
271
+ "vision_model.encoder.layers.23.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
272
+ "vision_model.encoder.layers.23.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
273
+ "vision_model.encoder.layers.23.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
274
+ "vision_model.encoder.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
275
+ "vision_model.encoder.layers.23.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
276
+ "vision_model.encoder.layers.23.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
277
+ "vision_model.encoder.layers.23.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
278
+ "vision_model.encoder.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
279
+ "vision_model.encoder.layers.23.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
280
+ "vision_model.encoder.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
281
+ "vision_model.encoder.layers.24.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
282
+ "vision_model.encoder.layers.24.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
283
+ "vision_model.encoder.layers.24.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
284
+ "vision_model.encoder.layers.24.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
285
+ "vision_model.encoder.layers.24.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
286
+ "vision_model.encoder.layers.24.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
287
+ "vision_model.encoder.layers.24.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
288
+ "vision_model.encoder.layers.24.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
289
+ "vision_model.encoder.layers.24.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
290
+ "vision_model.encoder.layers.24.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
291
+ "vision_model.encoder.layers.24.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
292
+ "vision_model.encoder.layers.24.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
293
+ "vision_model.encoder.layers.24.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
294
+ "vision_model.encoder.layers.24.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
295
+ "vision_model.encoder.layers.24.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
296
+ "vision_model.encoder.layers.24.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
297
+ "vision_model.encoder.layers.25.layer_norm1.bias": "pytorch_model-00002-of-00002.bin",
298
+ "vision_model.encoder.layers.25.layer_norm1.weight": "pytorch_model-00002-of-00002.bin",
299
+ "vision_model.encoder.layers.25.layer_norm2.bias": "pytorch_model-00002-of-00002.bin",
300
+ "vision_model.encoder.layers.25.layer_norm2.weight": "pytorch_model-00002-of-00002.bin",
301
+ "vision_model.encoder.layers.25.mlp.fc1.bias": "pytorch_model-00002-of-00002.bin",
302
+ "vision_model.encoder.layers.25.mlp.fc1.weight": "pytorch_model-00002-of-00002.bin",
303
+ "vision_model.encoder.layers.25.mlp.fc2.bias": "pytorch_model-00002-of-00002.bin",
304
+ "vision_model.encoder.layers.25.mlp.fc2.weight": "pytorch_model-00002-of-00002.bin",
305
+ "vision_model.encoder.layers.25.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
306
+ "vision_model.encoder.layers.25.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
307
+ "vision_model.encoder.layers.25.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
308
+ "vision_model.encoder.layers.25.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
309
+ "vision_model.encoder.layers.25.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
310
+ "vision_model.encoder.layers.25.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
311
+ "vision_model.encoder.layers.25.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
312
+ "vision_model.encoder.layers.25.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
313
+ "vision_model.encoder.layers.26.layer_norm1.bias": "pytorch_model-00002-of-00002.bin",
314
+ "vision_model.encoder.layers.26.layer_norm1.weight": "pytorch_model-00002-of-00002.bin",
315
+ "vision_model.encoder.layers.26.layer_norm2.bias": "pytorch_model-00002-of-00002.bin",
316
+ "vision_model.encoder.layers.26.layer_norm2.weight": "pytorch_model-00002-of-00002.bin",
317
+ "vision_model.encoder.layers.26.mlp.fc1.bias": "pytorch_model-00002-of-00002.bin",
318
+ "vision_model.encoder.layers.26.mlp.fc1.weight": "pytorch_model-00002-of-00002.bin",
319
+ "vision_model.encoder.layers.26.mlp.fc2.bias": "pytorch_model-00002-of-00002.bin",
320
+ "vision_model.encoder.layers.26.mlp.fc2.weight": "pytorch_model-00002-of-00002.bin",
321
+ "vision_model.encoder.layers.26.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
322
+ "vision_model.encoder.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
323
+ "vision_model.encoder.layers.26.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
324
+ "vision_model.encoder.layers.26.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
325
+ "vision_model.encoder.layers.26.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
326
+ "vision_model.encoder.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
327
+ "vision_model.encoder.layers.26.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
328
+ "vision_model.encoder.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
329
+ "vision_model.encoder.layers.27.layer_norm1.bias": "pytorch_model-00002-of-00002.bin",
330
+ "vision_model.encoder.layers.27.layer_norm1.weight": "pytorch_model-00002-of-00002.bin",
331
+ "vision_model.encoder.layers.27.layer_norm2.bias": "pytorch_model-00002-of-00002.bin",
332
+ "vision_model.encoder.layers.27.layer_norm2.weight": "pytorch_model-00002-of-00002.bin",
333
+ "vision_model.encoder.layers.27.mlp.fc1.bias": "pytorch_model-00002-of-00002.bin",
334
+ "vision_model.encoder.layers.27.mlp.fc1.weight": "pytorch_model-00002-of-00002.bin",
335
+ "vision_model.encoder.layers.27.mlp.fc2.bias": "pytorch_model-00002-of-00002.bin",
336
+ "vision_model.encoder.layers.27.mlp.fc2.weight": "pytorch_model-00002-of-00002.bin",
337
+ "vision_model.encoder.layers.27.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
338
+ "vision_model.encoder.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
339
+ "vision_model.encoder.layers.27.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
340
+ "vision_model.encoder.layers.27.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
341
+ "vision_model.encoder.layers.27.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
342
+ "vision_model.encoder.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
343
+ "vision_model.encoder.layers.27.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
344
+ "vision_model.encoder.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
345
+ "vision_model.encoder.layers.28.layer_norm1.bias": "pytorch_model-00002-of-00002.bin",
346
+ "vision_model.encoder.layers.28.layer_norm1.weight": "pytorch_model-00002-of-00002.bin",
347
+ "vision_model.encoder.layers.28.layer_norm2.bias": "pytorch_model-00002-of-00002.bin",
348
+ "vision_model.encoder.layers.28.layer_norm2.weight": "pytorch_model-00002-of-00002.bin",
349
+ "vision_model.encoder.layers.28.mlp.fc1.bias": "pytorch_model-00002-of-00002.bin",
350
+ "vision_model.encoder.layers.28.mlp.fc1.weight": "pytorch_model-00002-of-00002.bin",
351
+ "vision_model.encoder.layers.28.mlp.fc2.bias": "pytorch_model-00002-of-00002.bin",
352
+ "vision_model.encoder.layers.28.mlp.fc2.weight": "pytorch_model-00002-of-00002.bin",
353
+ "vision_model.encoder.layers.28.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
354
+ "vision_model.encoder.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
355
+ "vision_model.encoder.layers.28.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
356
+ "vision_model.encoder.layers.28.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
357
+ "vision_model.encoder.layers.28.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
358
+ "vision_model.encoder.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
359
+ "vision_model.encoder.layers.28.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
360
+ "vision_model.encoder.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
361
+ "vision_model.encoder.layers.29.layer_norm1.bias": "pytorch_model-00002-of-00002.bin",
362
+ "vision_model.encoder.layers.29.layer_norm1.weight": "pytorch_model-00002-of-00002.bin",
363
+ "vision_model.encoder.layers.29.layer_norm2.bias": "pytorch_model-00002-of-00002.bin",
364
+ "vision_model.encoder.layers.29.layer_norm2.weight": "pytorch_model-00002-of-00002.bin",
365
+ "vision_model.encoder.layers.29.mlp.fc1.bias": "pytorch_model-00002-of-00002.bin",
366
+ "vision_model.encoder.layers.29.mlp.fc1.weight": "pytorch_model-00002-of-00002.bin",
367
+ "vision_model.encoder.layers.29.mlp.fc2.bias": "pytorch_model-00002-of-00002.bin",
368
+ "vision_model.encoder.layers.29.mlp.fc2.weight": "pytorch_model-00002-of-00002.bin",
369
+ "vision_model.encoder.layers.29.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
370
+ "vision_model.encoder.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
371
+ "vision_model.encoder.layers.29.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
372
+ "vision_model.encoder.layers.29.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
373
+ "vision_model.encoder.layers.29.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
374
+ "vision_model.encoder.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
375
+ "vision_model.encoder.layers.29.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
376
+ "vision_model.encoder.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
377
+ "vision_model.encoder.layers.3.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
378
+ "vision_model.encoder.layers.3.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
379
+ "vision_model.encoder.layers.3.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
380
+ "vision_model.encoder.layers.3.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
381
+ "vision_model.encoder.layers.3.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
382
+ "vision_model.encoder.layers.3.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
383
+ "vision_model.encoder.layers.3.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
384
+ "vision_model.encoder.layers.3.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
385
+ "vision_model.encoder.layers.3.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
386
+ "vision_model.encoder.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
387
+ "vision_model.encoder.layers.3.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
388
+ "vision_model.encoder.layers.3.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
389
+ "vision_model.encoder.layers.3.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
390
+ "vision_model.encoder.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
391
+ "vision_model.encoder.layers.3.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
392
+ "vision_model.encoder.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
393
+ "vision_model.encoder.layers.30.layer_norm1.bias": "pytorch_model-00002-of-00002.bin",
394
+ "vision_model.encoder.layers.30.layer_norm1.weight": "pytorch_model-00002-of-00002.bin",
395
+ "vision_model.encoder.layers.30.layer_norm2.bias": "pytorch_model-00002-of-00002.bin",
396
+ "vision_model.encoder.layers.30.layer_norm2.weight": "pytorch_model-00002-of-00002.bin",
397
+ "vision_model.encoder.layers.30.mlp.fc1.bias": "pytorch_model-00002-of-00002.bin",
398
+ "vision_model.encoder.layers.30.mlp.fc1.weight": "pytorch_model-00002-of-00002.bin",
399
+ "vision_model.encoder.layers.30.mlp.fc2.bias": "pytorch_model-00002-of-00002.bin",
400
+ "vision_model.encoder.layers.30.mlp.fc2.weight": "pytorch_model-00002-of-00002.bin",
401
+ "vision_model.encoder.layers.30.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
402
+ "vision_model.encoder.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
403
+ "vision_model.encoder.layers.30.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
404
+ "vision_model.encoder.layers.30.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
405
+ "vision_model.encoder.layers.30.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
406
+ "vision_model.encoder.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
407
+ "vision_model.encoder.layers.30.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
408
+ "vision_model.encoder.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
409
+ "vision_model.encoder.layers.31.layer_norm1.bias": "pytorch_model-00002-of-00002.bin",
410
+ "vision_model.encoder.layers.31.layer_norm1.weight": "pytorch_model-00002-of-00002.bin",
411
+ "vision_model.encoder.layers.31.layer_norm2.bias": "pytorch_model-00002-of-00002.bin",
412
+ "vision_model.encoder.layers.31.layer_norm2.weight": "pytorch_model-00002-of-00002.bin",
413
+ "vision_model.encoder.layers.31.mlp.fc1.bias": "pytorch_model-00002-of-00002.bin",
414
+ "vision_model.encoder.layers.31.mlp.fc1.weight": "pytorch_model-00002-of-00002.bin",
415
+ "vision_model.encoder.layers.31.mlp.fc2.bias": "pytorch_model-00002-of-00002.bin",
416
+ "vision_model.encoder.layers.31.mlp.fc2.weight": "pytorch_model-00002-of-00002.bin",
417
+ "vision_model.encoder.layers.31.self_attn.k_proj.bias": "pytorch_model-00002-of-00002.bin",
418
+ "vision_model.encoder.layers.31.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
419
+ "vision_model.encoder.layers.31.self_attn.out_proj.bias": "pytorch_model-00002-of-00002.bin",
420
+ "vision_model.encoder.layers.31.self_attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
421
+ "vision_model.encoder.layers.31.self_attn.q_proj.bias": "pytorch_model-00002-of-00002.bin",
422
+ "vision_model.encoder.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
423
+ "vision_model.encoder.layers.31.self_attn.v_proj.bias": "pytorch_model-00002-of-00002.bin",
424
+ "vision_model.encoder.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
425
+ "vision_model.encoder.layers.4.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
426
+ "vision_model.encoder.layers.4.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
427
+ "vision_model.encoder.layers.4.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
428
+ "vision_model.encoder.layers.4.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
429
+ "vision_model.encoder.layers.4.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
430
+ "vision_model.encoder.layers.4.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
431
+ "vision_model.encoder.layers.4.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
432
+ "vision_model.encoder.layers.4.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
433
+ "vision_model.encoder.layers.4.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
434
+ "vision_model.encoder.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
435
+ "vision_model.encoder.layers.4.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
436
+ "vision_model.encoder.layers.4.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
437
+ "vision_model.encoder.layers.4.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
438
+ "vision_model.encoder.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
439
+ "vision_model.encoder.layers.4.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
440
+ "vision_model.encoder.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
441
+ "vision_model.encoder.layers.5.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
442
+ "vision_model.encoder.layers.5.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
443
+ "vision_model.encoder.layers.5.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
444
+ "vision_model.encoder.layers.5.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
445
+ "vision_model.encoder.layers.5.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
446
+ "vision_model.encoder.layers.5.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
447
+ "vision_model.encoder.layers.5.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
448
+ "vision_model.encoder.layers.5.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
449
+ "vision_model.encoder.layers.5.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
450
+ "vision_model.encoder.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
451
+ "vision_model.encoder.layers.5.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
452
+ "vision_model.encoder.layers.5.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
453
+ "vision_model.encoder.layers.5.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
454
+ "vision_model.encoder.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
455
+ "vision_model.encoder.layers.5.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
456
+ "vision_model.encoder.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
457
+ "vision_model.encoder.layers.6.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
458
+ "vision_model.encoder.layers.6.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
459
+ "vision_model.encoder.layers.6.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
460
+ "vision_model.encoder.layers.6.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
461
+ "vision_model.encoder.layers.6.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
462
+ "vision_model.encoder.layers.6.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
463
+ "vision_model.encoder.layers.6.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
464
+ "vision_model.encoder.layers.6.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
465
+ "vision_model.encoder.layers.6.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
466
+ "vision_model.encoder.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
467
+ "vision_model.encoder.layers.6.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
468
+ "vision_model.encoder.layers.6.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
469
+ "vision_model.encoder.layers.6.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
470
+ "vision_model.encoder.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
471
+ "vision_model.encoder.layers.6.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
472
+ "vision_model.encoder.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
473
+ "vision_model.encoder.layers.7.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
474
+ "vision_model.encoder.layers.7.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
475
+ "vision_model.encoder.layers.7.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
476
+ "vision_model.encoder.layers.7.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
477
+ "vision_model.encoder.layers.7.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
478
+ "vision_model.encoder.layers.7.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
479
+ "vision_model.encoder.layers.7.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
480
+ "vision_model.encoder.layers.7.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
481
+ "vision_model.encoder.layers.7.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
482
+ "vision_model.encoder.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
483
+ "vision_model.encoder.layers.7.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
484
+ "vision_model.encoder.layers.7.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
485
+ "vision_model.encoder.layers.7.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
486
+ "vision_model.encoder.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
487
+ "vision_model.encoder.layers.7.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
488
+ "vision_model.encoder.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
489
+ "vision_model.encoder.layers.8.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
490
+ "vision_model.encoder.layers.8.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
491
+ "vision_model.encoder.layers.8.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
492
+ "vision_model.encoder.layers.8.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
493
+ "vision_model.encoder.layers.8.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
494
+ "vision_model.encoder.layers.8.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
495
+ "vision_model.encoder.layers.8.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
496
+ "vision_model.encoder.layers.8.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
497
+ "vision_model.encoder.layers.8.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
498
+ "vision_model.encoder.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
499
+ "vision_model.encoder.layers.8.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
500
+ "vision_model.encoder.layers.8.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
501
+ "vision_model.encoder.layers.8.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
502
+ "vision_model.encoder.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
503
+ "vision_model.encoder.layers.8.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
504
+ "vision_model.encoder.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
505
+ "vision_model.encoder.layers.9.layer_norm1.bias": "pytorch_model-00001-of-00002.bin",
506
+ "vision_model.encoder.layers.9.layer_norm1.weight": "pytorch_model-00001-of-00002.bin",
507
+ "vision_model.encoder.layers.9.layer_norm2.bias": "pytorch_model-00001-of-00002.bin",
508
+ "vision_model.encoder.layers.9.layer_norm2.weight": "pytorch_model-00001-of-00002.bin",
509
+ "vision_model.encoder.layers.9.mlp.fc1.bias": "pytorch_model-00001-of-00002.bin",
510
+ "vision_model.encoder.layers.9.mlp.fc1.weight": "pytorch_model-00001-of-00002.bin",
511
+ "vision_model.encoder.layers.9.mlp.fc2.bias": "pytorch_model-00001-of-00002.bin",
512
+ "vision_model.encoder.layers.9.mlp.fc2.weight": "pytorch_model-00001-of-00002.bin",
513
+ "vision_model.encoder.layers.9.self_attn.k_proj.bias": "pytorch_model-00001-of-00002.bin",
514
+ "vision_model.encoder.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
515
+ "vision_model.encoder.layers.9.self_attn.out_proj.bias": "pytorch_model-00001-of-00002.bin",
516
+ "vision_model.encoder.layers.9.self_attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
517
+ "vision_model.encoder.layers.9.self_attn.q_proj.bias": "pytorch_model-00001-of-00002.bin",
518
+ "vision_model.encoder.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
519
+ "vision_model.encoder.layers.9.self_attn.v_proj.bias": "pytorch_model-00001-of-00002.bin",
520
+ "vision_model.encoder.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
521
+ "vision_model.post_layernorm.bias": "pytorch_model-00002-of-00002.bin",
522
+ "vision_model.post_layernorm.weight": "pytorch_model-00002-of-00002.bin",
523
+ "vision_model.pre_layrnorm.bias": "pytorch_model-00001-of-00002.bin",
524
+ "vision_model.pre_layrnorm.weight": "pytorch_model-00001-of-00002.bin"
525
+ }
526
+ }
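Note: this index file only maps each tensor name to the shard that stores it; `transformers`' `from_pretrained` consumes it automatically when loading a sharded checkpoint. As a minimal sketch of inspecting it by hand (assuming only the Python standard library and that the script runs in the folder this file was uploaded to):

import json

# Load the shard index uploaded above.
with open('pytorch_model.bin.index.json') as f:
    index = json.load(f)
weight_map = index['weight_map']

# Look up which shard holds a given tensor.
print(weight_map['vision_model.post_layernorm.weight'])
# -> pytorch_model-00002-of-00002.bin

# List the distinct shard files the index references.
print(sorted(set(weight_map.values())))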
xtuner_config.py ADDED
@@ -0,0 +1,256 @@
+ import torch
+ from mmengine.dataset import DefaultSampler
+ from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
+                             LoggerHook, ParamSchedulerHook)
+
+ from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                           BitsAndBytesConfig,
+                           CLIPImageProcessor, CLIPVisionModel)
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
+ from peft import LoraConfig
+ from math import sqrt
+ from torch.optim import AdamW
+ from xtuner.dataset import VideoDataset, PikaDataset, ConcatDataset, ShareGPTVideoDataset
+ from xtuner.dataset.collate_fns import default_collate_fn
+ from xtuner.dataset.map_fns import llava_video_map_fn, llava_map_fn, pika_map_fn, template_map_fn_factory
+ from xtuner.dataset.samplers import LengthGroupedSampler
+ from xtuner.engine import DatasetInfoHook, EvaluateChatHook
+ from xtuner.model import PikaModel, PikaVidEncoder
+ from xtuner.utils import PROMPT_TEMPLATE
+
+
+ #######################################################################
+ #                          PART 1  Settings                           #
+ #######################################################################
+ # Model
+ llm_name_or_path = 'lmsys/vicuna-7b-v1.5-16k'
+ visual_encoder_name_or_path = 'apple/DFN5B-CLIP-ViT-H-14-378'
+ # Specify the pretrained .pth checkpoint from an earlier stage
+ # pretrained_pth = './work_dirs/7b_16k_s4_cont/iter_2000.pth'
+ pretrained_pth = 'work_dirs/7b_16k_s5/iter_800.pth'
+ prompt_template = PROMPT_TEMPLATE.vicuna
+
+ size = 378
+ # Set to None to sample all video frames
+ n_sample_frames = 32
+ visual_token_merge_ratio = 0.1
+ accumulative_counts = 32
+ lr = 1e-4
+ batch_size = 1  # per-device batch size must be 1 to support mixed image/video training
+
+ max_length = 4096
+ dataloader_num_workers = 0
+ max_epochs = 1
+ optim_type = AdamW
+ betas = (0.9, 0.999)
+ weight_decay = 0.1
+ max_norm = 1  # grad clip
+ warmup_ratio = 0.03
+
+ # Save
+ save_steps = 200
+ save_total_limit = 2  # Maximum checkpoints to keep (-1 means unlimited)
+
+ #######################################################################
+ #            PART 2  Model & Tokenizer & Image Processor              #
+ #######################################################################
+ tokenizer = dict(
+     type=AutoTokenizer.from_pretrained,
+     pretrained_model_name_or_path=llm_name_or_path,
+     trust_remote_code=True,
+     padding_side='right')
+
+ image_processor = dict(
+     type=CLIPImageProcessor.from_pretrained,
+     pretrained_model_name_or_path='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k',
+     trust_remote_code=True,
+     size=size,
+     crop_size=size)
+
+ model = dict(
+     type=PikaModel,
+     freeze_llm=True,
+     freeze_visual_encoder=True,
+     pretrained_pth=pretrained_pth,
+     llm=dict(
+         type=AutoModelForCausalLM.from_pretrained,
+         pretrained_model_name_or_path=llm_name_or_path,
+         trust_remote_code=True,
+         torch_dtype=torch.float16,
+         # 4-bit NF4 quantization (QLoRA-style) for the frozen LLM
+         quantization_config=dict(
+             type=BitsAndBytesConfig,
+             load_in_4bit=True,
+             load_in_8bit=False,
+             llm_int8_threshold=6.0,
+             llm_int8_has_fp16_weight=False,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type='nf4')),
+     llm_lora=dict(
+         type=LoraConfig,
+         r=512,
+         lora_alpha=256,
+         lora_dropout=0.05,
+         bias='none',
+         task_type='CAUSAL_LM'),
+     visual_encoder=dict(
+         # type=CLIPVisionModel.from_pretrained,
+         type=PikaVidEncoder.from_pretrained,
+         pretrained_model_name_or_path=visual_encoder_name_or_path,
+         visual_token_merge_ratio=visual_token_merge_ratio),
+     visual_encoder_lora=dict(
+         type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none'),
+ )
+
+
+ #######################################################################
+ #                     PART 3  Dataset & Dataloader                    #
+ #######################################################################
+ allava_image_caption_dataset = dict(
+     type=PikaDataset,
+     data_path='./data/image_finetune/ALLaVA-Caption-LAION-4V',
+     image_folder='./data/image_data',
+     tokenizer=tokenizer,
+     image_processor=image_processor,
+     dataset_map_fn=llava_map_fn,
+     template_map_fn=dict(
+         type=template_map_fn_factory, template=prompt_template),
+     max_length=max_length,
+     pad_image_to_square=False,
+     keep_aspect_ratio=True)
+
+ sharegpt4v_video_caption_dataset = dict(
+     type=ShareGPTVideoDataset,
+     data_path='./data/video_finetune/sharegptvideo_caption_full_frame',
+     image_folder='./data/video_data/sharegptvideo_900k',
+     tokenizer=tokenizer,
+     image_processor=image_processor,
+     dataset_map_fn=llava_video_map_fn,
+     template_map_fn=dict(
+         type=template_map_fn_factory, template=prompt_template),
+     max_length=max_length,
+     pad_image_to_square=False,
+     frame_number=n_sample_frames,
+     keep_aspect_ratio=True)
+
+ # Mix video and image datasets
+ train_dataset = dict(
+     type=ConcatDataset,
+     datasets=[
+         allava_image_caption_dataset,
+         sharegpt4v_video_caption_dataset,
+     ])
+
+ train_dataloader = dict(
+     batch_size=batch_size,
+     num_workers=dataloader_num_workers,
+     dataset=train_dataset,
+     # sampler=dict(
+     #     type=LengthGroupedSampler,
+     #     length_property='modality_length',
+     #     per_device_batch_size=batch_size * accumulative_counts),
+     sampler=dict(type=DefaultSampler, shuffle=True),
+     collate_fn=dict(type=default_collate_fn))
+
+ #######################################################################
+ #                    PART 4  Scheduler & Optimizer                    #
+ #######################################################################
+ # optimizer
+ optim_wrapper = dict(
+     type=AmpOptimWrapper,
+     optimizer=dict(
+         type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
+     clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
+     accumulative_counts=accumulative_counts,
+     loss_scale='dynamic',
+     dtype='float16')
+
+ # learning policy
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
+ param_scheduler = [
+     dict(
+         type=LinearLR,
+         start_factor=1e-5,
+         by_epoch=True,
+         begin=0,
+         end=warmup_ratio * max_epochs,
+         convert_to_iter_based=True),
+     dict(
+         type=CosineAnnealingLR,
+         eta_min=0.0,
+         by_epoch=True,
+         begin=warmup_ratio * max_epochs,
+         T_max=max_epochs,
+         convert_to_iter_based=True)
+ ]
+
+ # train, val, test setting
+ train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1)
+
+ #######################################################################
+ #                           PART 5  Runtime                           #
+ #######################################################################
+ # Evaluate generation performance during training
+ evaluation_freq = 500
+ SYSTEM = ''
+ evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg'
+ # The two prompts are the same request in Chinese and English
+ evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture']
+
+
+ # Log sample dialogues periodically during training (optional)
+ custom_hooks = [
+     dict(type=DatasetInfoHook, tokenizer=tokenizer),
+     dict(
+         type=EvaluateChatHook,
+         tokenizer=tokenizer,
+         image_processor=image_processor,
+         every_n_iters=evaluation_freq,
+         evaluation_inputs=evaluation_inputs,
+         evaluation_images=evaluation_images,
+         system=SYSTEM,
+         prompt_template=prompt_template)
+ ]
+
+ # configure default hooks
+ default_hooks = dict(
+     # record the time of every iteration.
+     timer=dict(type=IterTimerHook),
+     # print log every 10 iterations.
+     logger=dict(type=LoggerHook, interval=10),
+     # enable the parameter scheduler.
+     param_scheduler=dict(type=ParamSchedulerHook),
+     # save a checkpoint every `save_steps` iterations.
+     # checkpoint=dict(type=CheckpointHook, interval=1),
+     checkpoint=dict(
+         type=CheckpointHook,
+         by_epoch=False,
+         interval=save_steps,
+         max_keep_ckpts=save_total_limit),
+     # set sampler seed in distributed environment.
+     sampler_seed=dict(type=DistSamplerSeedHook),
+ )
+
+ # configure environment
+ env_cfg = dict(
+     # whether to enable cudnn benchmark
+     cudnn_benchmark=False,
+     # set multi-process parameters
+     mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+     # set distributed parameters
+     dist_cfg=dict(backend='nccl'),
+ )
+
+ # set visualizer
+ visualizer = None
+
+ # set log level
+ log_level = 'INFO'
+
+ # load from which checkpoint
+ load_from = None
+
+ # whether to resume training from the loaded checkpoint
+ resume = False
+
+ # Defaults to a random seed with `deterministic` disabled
+ randomness = dict(seed=None, deterministic=False)
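Since `xtuner_config.py` is an MMEngine-style Python config, it can be loaded and inspected outside of training. A minimal sketch (assuming `mmengine` and the `xtuner` package referenced above are importable; the `xtuner train` command is XTuner's standard launcher):

from mmengine.config import Config

# Parse the config exactly as the trainer would.
cfg = Config.fromfile('xtuner_config.py')
print(cfg.llm_name_or_path)        # lmsys/vicuna-7b-v1.5-16k
print(cfg.model['llm_lora']['r'])  # 512 (LoRA rank on the LLM)

# Training is then typically launched via XTuner's CLI, e.g.:
#   NPROC_PER_NODE=8 xtuner train xtuner_config.py --deepspeed deepspeed_zero2
# (the --deepspeed flag is optional)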