U4R

yxc97 committed
Commit
db3c061
1 Parent(s): 95e6b4a

Upload folder using huggingface_hub

Files changed (39)
  1. auxiliary_decoder/README.md +21 -0
  2. auxiliary_decoder/adapter_config.json +21 -0
  3. auxiliary_decoder/adapter_model.bin +3 -0
  4. auxiliary_decoder/base/README.md +48 -0
  5. auxiliary_decoder/base/config.json +27 -0
  6. auxiliary_decoder/base/generation_config.json +10 -0
  7. auxiliary_decoder/base/pytorch_model-00001-of-00003.bin +3 -0
  8. auxiliary_decoder/base/pytorch_model-00002-of-00003.bin +3 -0
  9. auxiliary_decoder/base/pytorch_model-00003-of-00003.bin +3 -0
  10. auxiliary_decoder/base/pytorch_model.bin.index.json +410 -0
  11. auxiliary_decoder/base/special_tokens_map.json +24 -0
  12. auxiliary_decoder/base/tokenizer.model +3 -0
  13. auxiliary_decoder/base/tokenizer_config.json +35 -0
  14. auxiliary_decoder/optimizer.pt +3 -0
  15. auxiliary_decoder/rng_state_0.pth +3 -0
  16. auxiliary_decoder/rng_state_1.pth +3 -0
  17. auxiliary_decoder/rng_state_2.pth +3 -0
  18. auxiliary_decoder/rng_state_3.pth +3 -0
  19. auxiliary_decoder/rng_state_4.pth +3 -0
  20. auxiliary_decoder/rng_state_5.pth +3 -0
  21. auxiliary_decoder/rng_state_6.pth +3 -0
  22. auxiliary_decoder/rng_state_7.pth +3 -0
  23. auxiliary_decoder/scheduler.pt +3 -0
  24. auxiliary_decoder/trainer_state.json +787 -0
  25. auxiliary_decoder/training_args.bin +3 -0
  26. base_decoder/config.json +180 -0
  27. base_decoder/generation_config.json +8 -0
  28. base_decoder/preprocessor_config.json +12 -0
  29. base_decoder/pytorch_model.bin +3 -0
  30. base_decoder/special_tokens_map.json +107 -0
  31. base_decoder/state_dict.pth +3 -0
  32. base_decoder/title_type/config.json +181 -0
  33. base_decoder/title_type/generation_config.json +8 -0
  34. base_decoder/title_type/pytorch_model.bin +3 -0
  35. base_decoder/title_type/state_dict.pth +3 -0
  36. base_decoder/tokenizer.json +0 -0
  37. base_decoder/tokenizer_config.json +113 -0
  38. instruction_adapter/mlp_classifier.pth +3 -0
  39. instruction_adapter/vectorizer.pkl +3 -0
auxiliary_decoder/README.md ADDED
@@ -0,0 +1,21 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
+ - load_in_8bit: True
+ - load_in_4bit: False
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: fp4
+ - bnb_4bit_use_double_quant: False
+ - bnb_4bit_compute_dtype: float32
+ ### Framework versions
+
+
+ - PEFT 0.4.0
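Note: the README above only lists the 8-bit `bitsandbytes` settings used during training. A minimal sketch of how such a config is typically expressed when loading the base model with `transformers`; the local path is an assumption based on this repository's layout, not something the commit specifies.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Mirror the quantization settings listed in the README above.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    load_in_4bit=False,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_quant_type="fp4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float32,
)

# "auxiliary_decoder/base" is assumed to be the local copy of the base model.
base_model = AutoModelForCausalLM.from_pretrained(
    "auxiliary_decoder/base",
    quantization_config=bnb_config,
    device_map="auto",
)
```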
auxiliary_decoder/adapter_config.json ADDED
@@ -0,0 +1,21 @@
+ {
+ "auto_mapping": null,
+ "base_model_name_or_path": "/cpfs01/shared/ADLab/hug_ckpts/vicuna-13b-v1.5",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 32,
+ "lora_dropout": 0.05,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+ }
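This adapter_config.json describes a LoRA adapter (r=16, lora_alpha=32, targeting q_proj and v_proj) trained on top of vicuna-13b-v1.5. A hedged sketch of attaching the adapter to the base model with `peft`; the local directory names are assumptions based on this repository's folder layout.

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Paths assume this repository's layout; adjust to wherever the files live.
BASE_DIR = "auxiliary_decoder/base"   # full vicuna-13b-v1.5 weights
ADAPTER_DIR = "auxiliary_decoder"     # adapter_config.json + adapter_model.bin

base_model = AutoModelForCausalLM.from_pretrained(BASE_DIR, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(BASE_DIR)

# Wrap the frozen base model with the LoRA adapter described above.
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model.eval()
```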
auxiliary_decoder/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406
+ size 443
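The three lines above are a Git LFS pointer rather than the binary itself: they record the SHA-256 (oid) and byte size of the real blob, which `git lfs`/`huggingface_hub` fetch separately. A small standard-library sketch, with a hypothetical helper name, for checking that a downloaded file matches its pointer.

```python
import hashlib
from pathlib import Path

def matches_lfs_pointer(blob_path: str, oid_hex: str, expected_size: int) -> bool:
    """Check a downloaded blob against the oid/size recorded in a Git LFS pointer."""
    path = Path(blob_path)
    if path.stat().st_size != expected_size:
        return False
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == oid_hex

# Values copied from the pointer above.
print(matches_lfs_pointer(
    "auxiliary_decoder/adapter_model.bin",
    "e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406",
    443,
))
```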
auxiliary_decoder/base/README.md ADDED
@@ -0,0 +1,48 @@
+ ---
+ inference: false
+ license: llama2
+ ---
+
+ # Vicuna Model Card
+
+ ## Model Details
+
+ Vicuna is a chat assistant trained by fine-tuning Llama 2 on user-shared conversations collected from ShareGPT.
+
+ - **Developed by:** [LMSYS](https://lmsys.org/)
+ - **Model type:** An auto-regressive language model based on the transformer architecture
+ - **License:** Llama 2 Community License Agreement
+ - **Finetuned from model:** [Llama 2](https://arxiv.org/abs/2307.09288)
+
+ ### Model Sources
+
+ - **Repository:** https://github.com/lm-sys/FastChat
+ - **Blog:** https://lmsys.org/blog/2023-03-30-vicuna/
+ - **Paper:** https://arxiv.org/abs/2306.05685
+ - **Demo:** https://chat.lmsys.org/
+
+ ## Uses
+
+ The primary use of Vicuna is research on large language models and chatbots.
+ The primary intended users of the model are researchers and hobbyists in natural language processing, machine learning, and artificial intelligence.
+
+ ## How to Get Started with the Model
+
+ - Command line interface: https://github.com/lm-sys/FastChat#vicuna-weights
+ - APIs (OpenAI API, Huggingface API): https://github.com/lm-sys/FastChat/tree/main#api
+
+ ## Training Details
+
+ Vicuna v1.5 is fine-tuned from Llama 2 with supervised instruction fine-tuning.
+ The training data is around 125K conversations collected from ShareGPT.com.
+ See more details in the "Training Details of Vicuna Models" section in the appendix of this [paper](https://arxiv.org/pdf/2306.05685.pdf).
+
+ ## Evaluation
+
+ ![Evaluation Results](https://github.com/lm-sys/lm-sys.github.io/blob/main/public/images/webdata/vicuna_v1.5_eval.png?raw=true)
+
+ Vicuna is evaluated with standard benchmarks, human preference, and LLM-as-a-judge. See more details in this [paper](https://arxiv.org/pdf/2306.05685.pdf) and [leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard).
+
+ ## Difference between different versions of Vicuna
+
+ See [vicuna_weights_version.md](https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md)
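The model card points to FastChat for CLI and API usage. As an alternative minimal sketch, the copy of vicuna-13b-v1.5 stored in `auxiliary_decoder/base` can be loaded directly with `transformers`; the prompt below is a simplified single-turn approximation of the Vicuna conversation template, not the exact format used in training.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_DIR = "auxiliary_decoder/base"  # local vicuna-13b-v1.5 copy in this repo

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR, torch_dtype=torch.float16, device_map="auto"
)

# Simplified single-turn prompt in the Vicuna v1.5 style.
prompt = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "USER: What is LoRA fine-tuning? ASSISTANT:"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```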
auxiliary_decoder/base/config.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "_name_or_path": "vicuna-13b-v1.5",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 5120,
+ "initializer_range": 0.02,
+ "intermediate_size": 13824,
+ "max_length": 4096,
+ "max_position_embeddings": 4096,
+ "model_type": "llama",
+ "num_attention_heads": 40,
+ "num_hidden_layers": 40,
+ "num_key_value_heads": 40,
+ "pad_token_id": 0,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.31.0",
+ "use_cache": true,
+ "vocab_size": 32000
+ }
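This config pins the 13B Llama geometry (40 layers, hidden size 5120, 40 attention heads, 4096-token context). A quick sketch for sanity-checking a local copy against those values with `AutoConfig`; the path is again an assumption about where the files sit.

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("auxiliary_decoder/base")

# These should match the config.json shown above.
assert config.model_type == "llama"
assert config.num_hidden_layers == 40
assert config.hidden_size == 5120
assert config.num_attention_heads == 40

# Per-head dimension implied by the config: 5120 / 40 = 128.
print("head_dim =", config.hidden_size // config.num_attention_heads)
```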
auxiliary_decoder/base/generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "max_length": 4096,
+ "pad_token_id": 0,
+ "temperature": 0.9,
+ "top_p": 0.6,
+ "transformers_version": "4.31.0"
+ }
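generation_config.json stores the default sampling settings (temperature 0.9, top_p 0.6) that `generate()` picks up automatically; they can also be loaded or overridden explicitly. A short sketch, assuming the same local path as above, noting that the sampling parameters only take effect when `do_sample=True`.

```python
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("auxiliary_decoder/base")
print(gen_config.temperature, gen_config.top_p)  # 0.9, 0.6 per the file above

# Defaults can be overridden per call without editing the file, e.g.:
# model.generate(**inputs, generation_config=gen_config,
#                do_sample=True, temperature=0.7, max_new_tokens=256)
```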
auxiliary_decoder/base/pytorch_model-00001-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9b54ba4bf8f87c6129f1e35c63d1ed248971d827ee5559f1a3bb48b39e43187
+ size 9948728430
auxiliary_decoder/base/pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ade40ae3180cbe5553278edb8e61ff4f2059f62be12b06f253b373838c199221
+ size 9904165024
auxiliary_decoder/base/pytorch_model-00003-of-00003.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4bd8cf2ab7e7bf22aaaf9b35ec5d41602d83f5a97bb82e17662ffe0a20ab215
+ size 6178983625
auxiliary_decoder/base/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,410 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 26031738880
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00003-of-00003.bin",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00003.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
9
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
10
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
11
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
12
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
13
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
14
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
15
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
16
+ "model.layers.0.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
17
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
18
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
19
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
20
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
21
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
22
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
23
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
24
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
25
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
26
+ "model.layers.1.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
27
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
28
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
29
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
30
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
31
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
32
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
33
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
34
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
35
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
36
+ "model.layers.10.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
37
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
38
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
39
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
40
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
41
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
42
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
43
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
44
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
45
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
46
+ "model.layers.11.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
47
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
48
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
49
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
50
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
51
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
52
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
53
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
54
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
55
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
56
+ "model.layers.12.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
57
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
58
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
59
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
60
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
61
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
62
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
63
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
64
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
65
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
66
+ "model.layers.13.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
67
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
68
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
69
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
70
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
71
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
72
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
73
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
74
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
75
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
76
+ "model.layers.14.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
77
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
78
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
79
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
80
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
81
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
82
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
83
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
84
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
85
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
86
+ "model.layers.15.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
87
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
88
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
89
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
90
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
91
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
92
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
93
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
94
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
95
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
96
+ "model.layers.16.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
97
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
98
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
99
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
100
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
101
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
102
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
103
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
104
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
105
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
106
+ "model.layers.17.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
107
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
108
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
109
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
110
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
111
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
112
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
113
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
114
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
115
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
116
+ "model.layers.18.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
117
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
118
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
119
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
120
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
121
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
122
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
123
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
124
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
125
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
126
+ "model.layers.19.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
127
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
128
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
129
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
130
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
131
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
132
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
133
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
134
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
135
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
136
+ "model.layers.2.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
137
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
138
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
139
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
140
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
141
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
142
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
143
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
144
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
145
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
146
+ "model.layers.20.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
147
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
148
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
149
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
150
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
151
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
152
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
153
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
154
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
155
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
156
+ "model.layers.21.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
157
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
158
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
159
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
160
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
161
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
162
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
163
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
164
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
165
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
166
+ "model.layers.22.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
167
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
168
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
169
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
170
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
171
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
172
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
173
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
174
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
175
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
176
+ "model.layers.23.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
177
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
178
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
179
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
180
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
181
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
182
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
183
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
184
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
185
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
186
+ "model.layers.24.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
187
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
188
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
189
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
190
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
191
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
192
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
193
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
194
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
195
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
196
+ "model.layers.25.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
197
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
198
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
199
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
200
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
201
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
202
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
203
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
204
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
205
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
206
+ "model.layers.26.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
207
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
208
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
209
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
210
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
211
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
212
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
213
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
214
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
215
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
216
+ "model.layers.27.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
217
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
218
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
219
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
220
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
221
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
222
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
223
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
224
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
225
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
226
+ "model.layers.28.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
227
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
228
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00003.bin",
229
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00003.bin",
230
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
231
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
232
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00003.bin",
233
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
234
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
235
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
236
+ "model.layers.29.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
237
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
238
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
239
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
240
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
241
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
242
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
243
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
244
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
245
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
246
+ "model.layers.3.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
247
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
248
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
249
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
250
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00003.bin",
251
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00003.bin",
252
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
253
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00003.bin",
254
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00003.bin",
255
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00003.bin",
256
+ "model.layers.30.self_attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00003.bin",
257
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00003.bin",
258
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
259
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
260
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
261
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
262
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
263
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
264
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
265
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
266
+ "model.layers.31.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
267
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
268
+ "model.layers.32.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
269
+ "model.layers.32.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
270
+ "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
271
+ "model.layers.32.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
272
+ "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
273
+ "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
274
+ "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
275
+ "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
276
+ "model.layers.32.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
277
+ "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
278
+ "model.layers.33.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
279
+ "model.layers.33.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
280
+ "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
281
+ "model.layers.33.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
282
+ "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
283
+ "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
284
+ "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
285
+ "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
286
+ "model.layers.33.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
287
+ "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
288
+ "model.layers.34.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
289
+ "model.layers.34.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
290
+ "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
291
+ "model.layers.34.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
292
+ "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
293
+ "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
294
+ "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
295
+ "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
296
+ "model.layers.34.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
297
+ "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
298
+ "model.layers.35.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
299
+ "model.layers.35.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
300
+ "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
301
+ "model.layers.35.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
302
+ "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
303
+ "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
304
+ "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
305
+ "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
306
+ "model.layers.35.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
307
+ "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
308
+ "model.layers.36.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
309
+ "model.layers.36.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
310
+ "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
311
+ "model.layers.36.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
312
+ "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
313
+ "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
314
+ "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
315
+ "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
316
+ "model.layers.36.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
317
+ "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
318
+ "model.layers.37.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
319
+ "model.layers.37.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
320
+ "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
321
+ "model.layers.37.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
322
+ "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
323
+ "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
324
+ "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
325
+ "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
326
+ "model.layers.37.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
327
+ "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
328
+ "model.layers.38.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
329
+ "model.layers.38.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
330
+ "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
331
+ "model.layers.38.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
332
+ "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
333
+ "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
334
+ "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
335
+ "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
336
+ "model.layers.38.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
337
+ "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
338
+ "model.layers.39.input_layernorm.weight": "pytorch_model-00003-of-00003.bin",
339
+ "model.layers.39.mlp.down_proj.weight": "pytorch_model-00003-of-00003.bin",
340
+ "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00003-of-00003.bin",
341
+ "model.layers.39.mlp.up_proj.weight": "pytorch_model-00003-of-00003.bin",
342
+ "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00003-of-00003.bin",
343
+ "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00003-of-00003.bin",
344
+ "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00003-of-00003.bin",
345
+ "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00003-of-00003.bin",
346
+ "model.layers.39.self_attn.rotary_emb.inv_freq": "pytorch_model-00003-of-00003.bin",
347
+ "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00003-of-00003.bin",
348
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
349
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
350
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
351
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
352
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
353
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
354
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
355
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
356
+ "model.layers.4.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
357
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
358
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
359
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
360
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
361
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
362
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
363
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
364
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
365
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
366
+ "model.layers.5.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
367
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
368
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
369
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
370
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
371
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
372
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
373
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
374
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
375
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
376
+ "model.layers.6.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
377
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
378
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
379
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
380
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
381
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
382
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
383
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
384
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
385
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
386
+ "model.layers.7.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
387
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
388
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
389
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
390
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
391
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
392
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
393
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
394
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
395
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
396
+ "model.layers.8.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
397
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
398
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00003.bin",
399
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00003.bin",
400
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00003.bin",
401
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00003.bin",
402
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00003.bin",
403
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00003.bin",
404
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00003.bin",
405
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00003.bin",
406
+ "model.layers.9.self_attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00003.bin",
407
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00003.bin",
408
+ "model.norm.weight": "pytorch_model-00003-of-00003.bin"
409
+ }
410
+ }
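The index file above maps every parameter name to one of the three shards and records the total checkpoint size (26031738880 bytes). A hedged sketch of using the `weight_map` to locate and load a single tensor without materializing the full checkpoint; paths are assumptions based on this folder.

```python
import json
import torch

INDEX_PATH = "auxiliary_decoder/base/pytorch_model.bin.index.json"

with open(INDEX_PATH) as f:
    index = json.load(f)

print("total_size (bytes):", index["metadata"]["total_size"])

# Find which shard holds a particular tensor, then load only that shard.
name = "model.layers.0.self_attn.q_proj.weight"
shard_file = index["weight_map"][name]  # e.g. "pytorch_model-00001-of-00003.bin"

shard = torch.load(f"auxiliary_decoder/base/{shard_file}", map_location="cpu")
print(name, tuple(shard[name].shape))
```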
auxiliary_decoder/base/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
auxiliary_decoder/base/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
auxiliary_decoder/base/tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": false,
+ "model_max_length": 4096,
+ "pad_token": null,
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": {
+ "__type": "AddedToken",
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
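The tokenizer files define a LlamaTokenizer with `<s>`/`</s>`/`<unk>` special tokens and model_max_length 4096; tokenizer_config.json leaves pad_token null while special_tokens_map.json maps padding to `<unk>`. A small sketch of loading it and padding a batch; reusing `<unk>` as the pad token is a common convention here, not something this commit dictates.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("auxiliary_decoder/base", use_fast=False)

# tokenizer_config.json leaves pad_token null; fall back to <unk> as in
# special_tokens_map.json so that batch padding works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.unk_token

batch = tokenizer(
    ["Summarize this function.", "Generate a docstring for the snippet below."],
    padding=True,
    truncation=True,
    max_length=tokenizer.model_max_length,
    return_tensors="pt",
)
print(batch["input_ids"].shape, batch["attention_mask"].shape)
```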
auxiliary_decoder/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84145a48123e11adfc821a71917d5d0e110486174b8584ed84e18a49d2c3fe76
+ size 104992197
auxiliary_decoder/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a304a1223d00604b753deb6594e0950ceef0a4d2fdda71164b4d52d543170ddc
+ size 21687
auxiliary_decoder/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a979ec0de41a50788cbb8512d05947cc7dc57d6beb224ad5b7c67f0b38f2a28d
+ size 21687
auxiliary_decoder/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72a3e1c9a9cf8d0873690d25bf8986faa7c538b5914f903f98ac122a3b980047
+ size 21687
auxiliary_decoder/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c22a2769ddb1302b35a5ad5f4db694dd213e8350fb26e94db60c5d16c81482bb
+ size 21687
auxiliary_decoder/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c051f0d88a8b76383aa528d149af2d6eec75df71533fa31051fa1606718ea687
+ size 21687
auxiliary_decoder/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14033fa8f06f18a243e0d1957095424ea2db37f82a8c1c539b0a2400b5fbe652
+ size 21687
auxiliary_decoder/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5acae21a2b65dacd523c73c632f2708b3f171bde456132dabc0df193601ba965
+ size 21687
auxiliary_decoder/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5dfe66299eee82c5b064b5385a5ad783bf5691b8ef49d616f8808eaf4d0349bb
+ size 21687
auxiliary_decoder/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c767e531f5cc92dcf2b7622466a92b93376b191b8847a14de57d9649808db98
+ size 627
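optimizer.pt, scheduler.pt and the eight rng_state_*.pth files are the training state that `transformers.Trainer` saves alongside a checkpoint (one RNG file per process in an 8-GPU run); passing the checkpoint directory to `trainer.train(resume_from_checkpoint=...)` is what consumes them so a run continues from the saved global step. A small local sketch that only inspects two of these files; the key layout described in the comments reflects how Trainer normally serializes them and is an assumption, not something verified against this commit.

```python
import torch

# LR scheduler state_dict saved by Trainer (e.g. last_epoch, _last_lr).
scheduler_state = torch.load("auxiliary_decoder/scheduler.pt", map_location="cpu")
print(scheduler_state)

# Per-process RNG snapshot for rank 0; Trainer typically stores the Python,
# NumPy, CPU and CUDA generator states so a resumed run stays reproducible.
rng_state_0 = torch.load("auxiliary_decoder/rng_state_0.pth", map_location="cpu")
print(sorted(rng_state_0))
```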
auxiliary_decoder/trainer_state.json ADDED
@@ -0,0 +1,787 @@
1
+ {
2
+ "best_metric": 0.3790667653083801,
3
+ "best_model_checkpoint": "exp/vicuna-13b-lora-sft-code_qa_desc_summ_triplet_r_16_alpha_32_8GPUs-0116/checkpoint-1200",
4
+ "epoch": 4.375569735642662,
5
+ "eval_steps": 200,
6
+ "global_step": 1200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 2.9999999999999997e-05,
14
+ "loss": 1.374,
15
+ "step": 10
16
+ },
17
+ {
18
+ "epoch": 0.07,
19
+ "learning_rate": 5.9999999999999995e-05,
20
+ "loss": 1.416,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.11,
25
+ "learning_rate": 8.999999999999999e-05,
26
+ "loss": 1.0841,
27
+ "step": 30
28
+ },
29
+ {
30
+ "epoch": 0.15,
31
+ "learning_rate": 0.00011999999999999999,
32
+ "loss": 0.6951,
33
+ "step": 40
34
+ },
35
+ {
36
+ "epoch": 0.18,
37
+ "learning_rate": 0.00015,
38
+ "loss": 0.6106,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.22,
43
+ "learning_rate": 0.00017999999999999998,
44
+ "loss": 0.5182,
45
+ "step": 60
46
+ },
47
+ {
48
+ "epoch": 0.26,
49
+ "learning_rate": 0.00020999999999999998,
50
+ "loss": 0.5251,
51
+ "step": 70
52
+ },
53
+ {
54
+ "epoch": 0.29,
55
+ "learning_rate": 0.00023999999999999998,
56
+ "loss": 0.4988,
57
+ "step": 80
58
+ },
59
+ {
60
+ "epoch": 0.33,
61
+ "learning_rate": 0.00027,
62
+ "loss": 0.4838,
63
+ "step": 90
64
+ },
65
+ {
66
+ "epoch": 0.36,
67
+ "learning_rate": 0.0003,
68
+ "loss": 0.4929,
69
+ "step": 100
70
+ },
71
+ {
72
+ "epoch": 0.4,
73
+ "learning_rate": 0.0002976377952755905,
74
+ "loss": 0.4463,
75
+ "step": 110
76
+ },
77
+ {
78
+ "epoch": 0.44,
79
+ "learning_rate": 0.0002952755905511811,
80
+ "loss": 0.4402,
81
+ "step": 120
82
+ },
83
+ {
84
+ "epoch": 0.47,
85
+ "learning_rate": 0.00029291338582677163,
86
+ "loss": 0.4414,
87
+ "step": 130
88
+ },
89
+ {
90
+ "epoch": 0.51,
91
+ "learning_rate": 0.00029055118110236217,
92
+ "loss": 0.4278,
93
+ "step": 140
94
+ },
95
+ {
96
+ "epoch": 0.55,
97
+ "learning_rate": 0.0002881889763779527,
98
+ "loss": 0.4346,
99
+ "step": 150
100
+ },
101
+ {
102
+ "epoch": 0.58,
103
+ "learning_rate": 0.0002858267716535433,
104
+ "loss": 0.4163,
105
+ "step": 160
106
+ },
107
+ {
108
+ "epoch": 0.62,
109
+ "learning_rate": 0.00028346456692913383,
110
+ "loss": 0.4221,
111
+ "step": 170
112
+ },
113
+ {
114
+ "epoch": 0.66,
115
+ "learning_rate": 0.00028110236220472436,
116
+ "loss": 0.4257,
117
+ "step": 180
118
+ },
119
+ {
120
+ "epoch": 0.69,
121
+ "learning_rate": 0.00027874015748031495,
122
+ "loss": 0.4166,
123
+ "step": 190
124
+ },
125
+ {
126
+ "epoch": 0.73,
127
+ "learning_rate": 0.0002763779527559055,
128
+ "loss": 0.4244,
129
+ "step": 200
130
+ },
131
+ {
132
+ "epoch": 0.73,
133
+ "eval_loss": 0.4145399034023285,
134
+ "eval_runtime": 30.3529,
135
+ "eval_samples_per_second": 65.892,
136
+ "eval_steps_per_second": 1.054,
137
+ "step": 200
138
+ },
139
+ {
140
+ "epoch": 0.77,
141
+ "learning_rate": 0.0002740157480314961,
142
+ "loss": 0.407,
143
+ "step": 210
144
+ },
145
+ {
146
+ "epoch": 0.8,
147
+ "learning_rate": 0.00027165354330708656,
148
+ "loss": 0.4103,
149
+ "step": 220
150
+ },
151
+ {
152
+ "epoch": 0.84,
153
+ "learning_rate": 0.00026929133858267715,
154
+ "loss": 0.4116,
155
+ "step": 230
156
+ },
157
+ {
158
+ "epoch": 0.88,
159
+ "learning_rate": 0.0002669291338582677,
160
+ "loss": 0.4027,
161
+ "step": 240
162
+ },
163
+ {
164
+ "epoch": 0.91,
165
+ "learning_rate": 0.0002645669291338582,
166
+ "loss": 0.4217,
167
+ "step": 250
168
+ },
169
+ {
170
+ "epoch": 0.95,
171
+ "learning_rate": 0.0002622047244094488,
172
+ "loss": 0.3988,
173
+ "step": 260
174
+ },
175
+ {
176
+ "epoch": 0.98,
177
+ "learning_rate": 0.00025984251968503934,
178
+ "loss": 0.4146,
179
+ "step": 270
180
+ },
181
+ {
182
+ "epoch": 1.02,
183
+ "learning_rate": 0.00025748031496062993,
184
+ "loss": 0.4155,
185
+ "step": 280
186
+ },
187
+ {
188
+ "epoch": 1.06,
189
+ "learning_rate": 0.00025511811023622047,
190
+ "loss": 0.4022,
191
+ "step": 290
192
+ },
193
+ {
194
+ "epoch": 1.09,
195
+ "learning_rate": 0.000252755905511811,
196
+ "loss": 0.4044,
197
+ "step": 300
198
+ },
199
+ {
200
+ "epoch": 1.13,
201
+ "learning_rate": 0.00025039370078740154,
202
+ "loss": 0.4001,
203
+ "step": 310
204
+ },
205
+ {
206
+ "epoch": 1.17,
207
+ "learning_rate": 0.00024803149606299207,
208
+ "loss": 0.3979,
209
+ "step": 320
210
+ },
211
+ {
212
+ "epoch": 1.2,
213
+ "learning_rate": 0.00024566929133858266,
214
+ "loss": 0.405,
215
+ "step": 330
216
+ },
217
+ {
218
+ "epoch": 1.24,
219
+ "learning_rate": 0.0002433070866141732,
220
+ "loss": 0.3959,
221
+ "step": 340
222
+ },
223
+ {
224
+ "epoch": 1.28,
225
+ "learning_rate": 0.00024094488188976376,
226
+ "loss": 0.4061,
227
+ "step": 350
228
+ },
229
+ {
230
+ "epoch": 1.31,
231
+ "learning_rate": 0.00023858267716535432,
232
+ "loss": 0.3914,
233
+ "step": 360
234
+ },
235
+ {
236
+ "epoch": 1.35,
237
+ "learning_rate": 0.00023622047244094488,
238
+ "loss": 0.3978,
239
+ "step": 370
240
+ },
241
+ {
242
+ "epoch": 1.39,
243
+ "learning_rate": 0.0002338582677165354,
244
+ "loss": 0.4008,
245
+ "step": 380
246
+ },
247
+ {
248
+ "epoch": 1.42,
249
+ "learning_rate": 0.00023149606299212595,
250
+ "loss": 0.3908,
251
+ "step": 390
252
+ },
253
+ {
254
+ "epoch": 1.46,
255
+ "learning_rate": 0.00022913385826771652,
256
+ "loss": 0.4006,
257
+ "step": 400
258
+ },
259
+ {
260
+ "epoch": 1.46,
261
+ "eval_loss": 0.398809552192688,
262
+ "eval_runtime": 30.3867,
263
+ "eval_samples_per_second": 65.818,
264
+ "eval_steps_per_second": 1.053,
265
+ "step": 400
266
+ },
267
+ {
268
+ "epoch": 1.49,
269
+ "learning_rate": 0.00022677165354330705,
270
+ "loss": 0.3892,
271
+ "step": 410
272
+ },
273
+ {
274
+ "epoch": 1.53,
275
+ "learning_rate": 0.00022440944881889761,
276
+ "loss": 0.3955,
277
+ "step": 420
278
+ },
279
+ {
280
+ "epoch": 1.57,
281
+ "learning_rate": 0.00022204724409448818,
282
+ "loss": 0.3891,
283
+ "step": 430
284
+ },
285
+ {
286
+ "epoch": 1.6,
287
+ "learning_rate": 0.00021968503937007874,
288
+ "loss": 0.3867,
289
+ "step": 440
290
+ },
291
+ {
292
+ "epoch": 1.64,
293
+ "learning_rate": 0.00021732283464566927,
294
+ "loss": 0.3917,
295
+ "step": 450
296
+ },
297
+ {
298
+ "epoch": 1.68,
299
+ "learning_rate": 0.0002149606299212598,
300
+ "loss": 0.3808,
301
+ "step": 460
302
+ },
303
+ {
304
+ "epoch": 1.71,
305
+ "learning_rate": 0.00021259842519685037,
306
+ "loss": 0.3868,
307
+ "step": 470
308
+ },
309
+ {
310
+ "epoch": 1.75,
311
+ "learning_rate": 0.0002102362204724409,
312
+ "loss": 0.395,
313
+ "step": 480
314
+ },
315
+ {
316
+ "epoch": 1.79,
317
+ "learning_rate": 0.00020787401574803147,
318
+ "loss": 0.3739,
319
+ "step": 490
320
+ },
321
+ {
322
+ "epoch": 1.82,
323
+ "learning_rate": 0.00020551181102362203,
324
+ "loss": 0.3859,
325
+ "step": 500
326
+ },
327
+ {
328
+ "epoch": 1.86,
329
+ "learning_rate": 0.0002031496062992126,
330
+ "loss": 0.3882,
331
+ "step": 510
332
+ },
333
+ {
334
+ "epoch": 1.9,
335
+ "learning_rate": 0.00020078740157480313,
336
+ "loss": 0.3816,
337
+ "step": 520
338
+ },
339
+ {
340
+ "epoch": 1.93,
341
+ "learning_rate": 0.0001984251968503937,
342
+ "loss": 0.3943,
343
+ "step": 530
344
+ },
345
+ {
346
+ "epoch": 1.97,
347
+ "learning_rate": 0.00019606299212598423,
348
+ "loss": 0.3828,
349
+ "step": 540
350
+ },
351
+ {
352
+ "epoch": 2.01,
353
+ "learning_rate": 0.0001937007874015748,
354
+ "loss": 0.3927,
355
+ "step": 550
356
+ },
357
+ {
358
+ "epoch": 2.04,
359
+ "learning_rate": 0.00019133858267716532,
360
+ "loss": 0.3916,
361
+ "step": 560
362
+ },
363
+ {
364
+ "epoch": 2.08,
365
+ "learning_rate": 0.00018897637795275589,
366
+ "loss": 0.3769,
367
+ "step": 570
368
+ },
369
+ {
370
+ "epoch": 2.11,
371
+ "learning_rate": 0.00018661417322834645,
372
+ "loss": 0.3858,
373
+ "step": 580
374
+ },
375
+ {
376
+ "epoch": 2.15,
377
+ "learning_rate": 0.000184251968503937,
378
+ "loss": 0.3803,
379
+ "step": 590
380
+ },
381
+ {
382
+ "epoch": 2.19,
383
+ "learning_rate": 0.00018188976377952755,
384
+ "loss": 0.383,
385
+ "step": 600
386
+ },
387
+ {
388
+ "epoch": 2.19,
389
+ "eval_loss": 0.39037391543388367,
390
+ "eval_runtime": 30.346,
391
+ "eval_samples_per_second": 65.907,
392
+ "eval_steps_per_second": 1.055,
393
+ "step": 600
394
+ },
395
+ {
396
+ "epoch": 2.22,
397
+ "learning_rate": 0.0001795275590551181,
398
+ "loss": 0.3887,
399
+ "step": 610
400
+ },
401
+ {
402
+ "epoch": 2.26,
403
+ "learning_rate": 0.00017716535433070864,
404
+ "loss": 0.3715,
405
+ "step": 620
406
+ },
407
+ {
408
+ "epoch": 2.3,
409
+ "learning_rate": 0.00017480314960629918,
410
+ "loss": 0.3852,
411
+ "step": 630
412
+ },
413
+ {
414
+ "epoch": 2.33,
415
+ "learning_rate": 0.00017244094488188974,
416
+ "loss": 0.3744,
417
+ "step": 640
418
+ },
419
+ {
420
+ "epoch": 2.37,
421
+ "learning_rate": 0.0001700787401574803,
422
+ "loss": 0.3903,
423
+ "step": 650
424
+ },
425
+ {
426
+ "epoch": 2.41,
427
+ "learning_rate": 0.00016771653543307086,
428
+ "loss": 0.3829,
429
+ "step": 660
430
+ },
431
+ {
432
+ "epoch": 2.44,
433
+ "learning_rate": 0.0001653543307086614,
434
+ "loss": 0.3684,
435
+ "step": 670
436
+ },
437
+ {
438
+ "epoch": 2.48,
439
+ "learning_rate": 0.00016299212598425196,
440
+ "loss": 0.3819,
441
+ "step": 680
442
+ },
443
+ {
444
+ "epoch": 2.52,
445
+ "learning_rate": 0.00016062992125984252,
446
+ "loss": 0.3743,
447
+ "step": 690
448
+ },
449
+ {
450
+ "epoch": 2.55,
451
+ "learning_rate": 0.00015826771653543303,
452
+ "loss": 0.3763,
453
+ "step": 700
454
+ },
455
+ {
456
+ "epoch": 2.59,
457
+ "learning_rate": 0.0001559055118110236,
458
+ "loss": 0.3805,
459
+ "step": 710
460
+ },
461
+ {
462
+ "epoch": 2.63,
463
+ "learning_rate": 0.00015354330708661416,
464
+ "loss": 0.3734,
465
+ "step": 720
466
+ },
467
+ {
468
+ "epoch": 2.66,
469
+ "learning_rate": 0.00015118110236220472,
470
+ "loss": 0.3808,
471
+ "step": 730
472
+ },
473
+ {
474
+ "epoch": 2.7,
475
+ "learning_rate": 0.00014881889763779525,
476
+ "loss": 0.3675,
477
+ "step": 740
478
+ },
479
+ {
480
+ "epoch": 2.73,
481
+ "learning_rate": 0.00014645669291338582,
482
+ "loss": 0.3671,
483
+ "step": 750
484
+ },
485
+ {
486
+ "epoch": 2.77,
487
+ "learning_rate": 0.00014409448818897635,
488
+ "loss": 0.3686,
489
+ "step": 760
490
+ },
491
+ {
492
+ "epoch": 2.81,
493
+ "learning_rate": 0.00014173228346456691,
494
+ "loss": 0.3631,
495
+ "step": 770
496
+ },
497
+ {
498
+ "epoch": 2.84,
499
+ "learning_rate": 0.00013937007874015748,
500
+ "loss": 0.369,
501
+ "step": 780
502
+ },
503
+ {
504
+ "epoch": 2.88,
505
+ "learning_rate": 0.00013700787401574804,
506
+ "loss": 0.3604,
507
+ "step": 790
508
+ },
509
+ {
510
+ "epoch": 2.92,
511
+ "learning_rate": 0.00013464566929133857,
512
+ "loss": 0.3657,
513
+ "step": 800
514
+ },
515
+ {
516
+ "epoch": 2.92,
517
+ "eval_loss": 0.3851480185985565,
518
+ "eval_runtime": 30.443,
519
+ "eval_samples_per_second": 65.697,
520
+ "eval_steps_per_second": 1.051,
521
+ "step": 800
522
+ },
523
+ {
524
+ "epoch": 2.95,
525
+ "learning_rate": 0.0001322834645669291,
526
+ "loss": 0.3618,
527
+ "step": 810
528
+ },
529
+ {
530
+ "epoch": 2.99,
531
+ "learning_rate": 0.00012992125984251967,
532
+ "loss": 0.3723,
533
+ "step": 820
534
+ },
535
+ {
536
+ "epoch": 3.03,
537
+ "learning_rate": 0.00012755905511811023,
538
+ "loss": 0.3665,
539
+ "step": 830
540
+ },
541
+ {
542
+ "epoch": 3.06,
543
+ "learning_rate": 0.00012519685039370077,
544
+ "loss": 0.3738,
545
+ "step": 840
546
+ },
547
+ {
548
+ "epoch": 3.1,
549
+ "learning_rate": 0.00012283464566929133,
550
+ "loss": 0.3669,
551
+ "step": 850
552
+ },
553
+ {
554
+ "epoch": 3.14,
555
+ "learning_rate": 0.00012047244094488188,
556
+ "loss": 0.3683,
557
+ "step": 860
558
+ },
559
+ {
560
+ "epoch": 3.17,
561
+ "learning_rate": 0.00011811023622047244,
562
+ "loss": 0.3531,
563
+ "step": 870
564
+ },
565
+ {
566
+ "epoch": 3.21,
567
+ "learning_rate": 0.00011574803149606298,
568
+ "loss": 0.368,
569
+ "step": 880
570
+ },
571
+ {
572
+ "epoch": 3.25,
573
+ "learning_rate": 0.00011338582677165353,
574
+ "loss": 0.367,
575
+ "step": 890
576
+ },
577
+ {
578
+ "epoch": 3.28,
579
+ "learning_rate": 0.00011102362204724409,
580
+ "loss": 0.3628,
581
+ "step": 900
582
+ },
583
+ {
584
+ "epoch": 3.32,
585
+ "learning_rate": 0.00010866141732283464,
586
+ "loss": 0.3715,
587
+ "step": 910
588
+ },
589
+ {
590
+ "epoch": 3.35,
591
+ "learning_rate": 0.00010629921259842519,
592
+ "loss": 0.3553,
593
+ "step": 920
594
+ },
595
+ {
596
+ "epoch": 3.39,
597
+ "learning_rate": 0.00010393700787401573,
598
+ "loss": 0.3806,
599
+ "step": 930
600
+ },
601
+ {
602
+ "epoch": 3.43,
603
+ "learning_rate": 0.0001015748031496063,
604
+ "loss": 0.3651,
605
+ "step": 940
606
+ },
607
+ {
608
+ "epoch": 3.46,
609
+ "learning_rate": 9.921259842519685e-05,
610
+ "loss": 0.3537,
611
+ "step": 950
612
+ },
613
+ {
614
+ "epoch": 3.5,
615
+ "learning_rate": 9.68503937007874e-05,
616
+ "loss": 0.3694,
617
+ "step": 960
618
+ },
619
+ {
620
+ "epoch": 3.54,
621
+ "learning_rate": 9.448818897637794e-05,
622
+ "loss": 0.3569,
623
+ "step": 970
624
+ },
625
+ {
626
+ "epoch": 3.57,
627
+ "learning_rate": 9.21259842519685e-05,
628
+ "loss": 0.3589,
629
+ "step": 980
630
+ },
631
+ {
632
+ "epoch": 3.61,
633
+ "learning_rate": 8.976377952755905e-05,
634
+ "loss": 0.3685,
635
+ "step": 990
636
+ },
637
+ {
638
+ "epoch": 3.65,
639
+ "learning_rate": 8.740157480314959e-05,
640
+ "loss": 0.3709,
641
+ "step": 1000
642
+ },
643
+ {
644
+ "epoch": 3.65,
645
+ "eval_loss": 0.3818259835243225,
646
+ "eval_runtime": 30.3549,
647
+ "eval_samples_per_second": 65.887,
648
+ "eval_steps_per_second": 1.054,
649
+ "step": 1000
650
+ },
651
+ {
652
+ "epoch": 3.68,
653
+ "learning_rate": 8.503937007874015e-05,
654
+ "loss": 0.3608,
655
+ "step": 1010
656
+ },
657
+ {
658
+ "epoch": 3.72,
659
+ "learning_rate": 8.26771653543307e-05,
660
+ "loss": 0.3591,
661
+ "step": 1020
662
+ },
663
+ {
664
+ "epoch": 3.76,
665
+ "learning_rate": 8.031496062992126e-05,
666
+ "loss": 0.3599,
667
+ "step": 1030
668
+ },
669
+ {
670
+ "epoch": 3.79,
671
+ "learning_rate": 7.79527559055118e-05,
672
+ "loss": 0.3601,
673
+ "step": 1040
674
+ },
675
+ {
676
+ "epoch": 3.83,
677
+ "learning_rate": 7.559055118110236e-05,
678
+ "loss": 0.3546,
679
+ "step": 1050
680
+ },
681
+ {
682
+ "epoch": 3.87,
683
+ "learning_rate": 7.322834645669291e-05,
684
+ "loss": 0.3629,
685
+ "step": 1060
686
+ },
687
+ {
688
+ "epoch": 3.9,
689
+ "learning_rate": 7.086614173228346e-05,
690
+ "loss": 0.3567,
691
+ "step": 1070
692
+ },
693
+ {
694
+ "epoch": 3.94,
695
+ "learning_rate": 6.850393700787402e-05,
696
+ "loss": 0.3514,
697
+ "step": 1080
698
+ },
699
+ {
700
+ "epoch": 3.97,
701
+ "learning_rate": 6.614173228346455e-05,
702
+ "loss": 0.356,
703
+ "step": 1090
704
+ },
705
+ {
706
+ "epoch": 4.01,
707
+ "learning_rate": 6.377952755905512e-05,
708
+ "loss": 0.3606,
709
+ "step": 1100
710
+ },
711
+ {
712
+ "epoch": 4.05,
713
+ "learning_rate": 6.141732283464567e-05,
714
+ "loss": 0.3586,
715
+ "step": 1110
716
+ },
717
+ {
718
+ "epoch": 4.08,
719
+ "learning_rate": 5.905511811023622e-05,
720
+ "loss": 0.3528,
721
+ "step": 1120
722
+ },
723
+ {
724
+ "epoch": 4.12,
725
+ "learning_rate": 5.669291338582676e-05,
726
+ "loss": 0.3472,
727
+ "step": 1130
728
+ },
729
+ {
730
+ "epoch": 4.16,
731
+ "learning_rate": 5.433070866141732e-05,
732
+ "loss": 0.3612,
733
+ "step": 1140
734
+ },
735
+ {
736
+ "epoch": 4.19,
737
+ "learning_rate": 5.196850393700787e-05,
738
+ "loss": 0.3482,
739
+ "step": 1150
740
+ },
741
+ {
742
+ "epoch": 4.23,
743
+ "learning_rate": 4.960629921259842e-05,
744
+ "loss": 0.3538,
745
+ "step": 1160
746
+ },
747
+ {
748
+ "epoch": 4.27,
749
+ "learning_rate": 4.724409448818897e-05,
750
+ "loss": 0.3588,
751
+ "step": 1170
752
+ },
753
+ {
754
+ "epoch": 4.3,
755
+ "learning_rate": 4.488188976377953e-05,
756
+ "loss": 0.357,
757
+ "step": 1180
758
+ },
759
+ {
760
+ "epoch": 4.34,
761
+ "learning_rate": 4.2519685039370076e-05,
762
+ "loss": 0.3615,
763
+ "step": 1190
764
+ },
765
+ {
766
+ "epoch": 4.38,
767
+ "learning_rate": 4.015748031496063e-05,
768
+ "loss": 0.3485,
769
+ "step": 1200
770
+ },
771
+ {
772
+ "epoch": 4.38,
773
+ "eval_loss": 0.3790667653083801,
774
+ "eval_runtime": 30.2759,
775
+ "eval_samples_per_second": 66.059,
776
+ "eval_steps_per_second": 1.057,
777
+ "step": 1200
778
+ }
779
+ ],
780
+ "logging_steps": 10,
781
+ "max_steps": 1370,
782
+ "num_train_epochs": 5,
783
+ "save_steps": 200,
784
+ "total_flos": 4.3556106687039406e+18,
785
+ "trial_name": null,
786
+ "trial_params": null
787
+ }
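The block above is the tail of the Hugging Face `Trainer` log history for the auxiliary decoder: eval loss falls from about 0.399 at step 400 to about 0.379 at step 1200 while the learning rate is stepped down linearly toward the 1370-step limit. A minimal sketch for inspecting that log after downloading the repository; the relative path assumes you are in the repo root:

```python
import json

# Load the Trainer state shipped with the auxiliary decoder checkpoint.
with open("auxiliary_decoder/trainer_state.json") as f:
    state = json.load(f)

# log_history mixes per-10-step training losses with periodic eval entries.
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"max_steps={state['max_steps']}, num_train_epochs={state['num_train_epochs']}")
for e in eval_logs:
    print(f"step {e['step']:>4}  epoch {e['epoch']:.2f}  eval_loss {e['eval_loss']:.4f}")
```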
auxiliary_decoder/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f18572aa1929c77ecc29794b956d0ce03304be88b751f7d297b80e07cc13a70d
+ size 4155
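`training_args.bin` is stored through Git LFS, so only the pointer appears in the diff. Once the real file is fetched, it is conventionally a pickled `transformers.TrainingArguments`; a hedged sketch for reading it, assuming a compatible `transformers` version is installed:

```python
import torch

# training_args.bin is a pickled TrainingArguments object saved by the Trainer.
# Unpickling imports transformers classes, so a mismatched library version can fail;
# on recent PyTorch releases you may also need weights_only=False.
args = torch.load("auxiliary_decoder/training_args.bin", map_location="cpu")
print(type(args).__name__)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```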
base_decoder/config.json ADDED
@@ -0,0 +1,180 @@
+ {
+ "_commit_hash": null,
+ "_name_or_path": "/cpfs01/user/zhangbo/code/StructChart/output/cfgs/merge_chartqa_simdata/structchart_large_sft_2_merge_v2_updated_few_2/default/ckpt/checkpoint_epoch_2/",
+ "architectures": [
+ "Pix2StructForConditionalGeneration"
+ ],
+ "decoder_start_token_id": 0,
+ "eos_token_id": 1,
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "is_encoder_decoder": true,
+ "is_vqa": false,
+ "model_type": "pix2struct",
+ "pad_token_id": 0,
+ "text_config": {
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "d_ff": 3968,
+ "d_kv": 64,
+ "decoder_start_token_id": 0,
+ "dense_act_fn": "gelu_new",
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dropout_rate": 0.1,
+ "early_stopping": false,
+ "encoder_hidden_size": 1536,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": 1,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "hidden_size": 1536,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "is_decoder": true,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "layer_norm_epsilon": 1e-06,
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "pix2struct_text_model",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_heads": 24,
+ "num_layers": 18,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": 0,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "relative_attention_max_distance": 128,
+ "relative_attention_num_buckets": 32,
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": false,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "transformers_version": "4.29.1",
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "use_cache": false,
+ "vocab_size": 50244
+ },
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": null,
+ "vision_config": {
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "attention_dropout": 0.0,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "d_ff": 3968,
+ "d_kv": 64,
+ "decoder_start_token_id": null,
+ "dense_act_fn": "gelu_new",
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dropout_rate": 0.0,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "hidden_size": 1536,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "layer_norm_bias": false,
+ "layer_norm_eps": 1e-06,
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "pix2struct_vision_model",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 24,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_channels": 3,
+ "num_hidden_layers": 18,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "patch_embed_hidden_size": 768,
+ "patch_size": 16,
+ "prefix": null,
+ "problem_type": null,
+ "projection_dim": 768,
+ "pruned_heads": {},
+ "relative_attention_max_distance": 128,
+ "relative_attention_num_buckets": 32,
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "seq_len": 4096,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "transformers_version": "4.29.1",
+ "typical_p": 1.0,
+ "use_bfloat16": false
+ }
+ }
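The config above describes the base decoder as a Pix2Struct encoder-decoder of roughly "large" size (hidden size 1536, 18 layers in both the vision encoder and the text decoder, 50244-token vocabulary). A minimal loading sketch, assuming the `base_decoder/` folder from this commit is used as-is and that `pytorch_model.bin` holds weights matching this config:

```python
import torch
from transformers import Pix2StructForConditionalGeneration

# Load the chart-to-structure base decoder described by base_decoder/config.json.
model = Pix2StructForConditionalGeneration.from_pretrained(
    "./base_decoder", torch_dtype=torch.float32
)
model.eval()
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters")
```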
base_decoder/generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "_from_model_config": true,
+ "decoder_start_token_id": 0,
+ "eos_token_id": 1,
+ "pad_token_id": 0,
+ "transformers_version": "4.29.1",
+ "use_cache": false
+ }
base_decoder/preprocessor_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "image_processor_type": "Pix2StructImageProcessor",
+ "is_vqa": true,
+ "max_patches": 2048,
+ "patch_size": {
+ "height": 16,
+ "width": 16
+ },
+ "processor_class": "Pix2StructProcessor"
+ }
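The image processor splits each chart into at most 2048 patches of 16x16 pixels, and because `is_vqa` is true the instruction text is rendered onto the image as a header rather than tokenized. A hedged usage sketch; `chart.png` and the instruction string are placeholders, not files from this repository:

```python
from PIL import Image
from transformers import Pix2StructProcessor

processor = Pix2StructProcessor.from_pretrained("./base_decoder")

# With is_vqa=True the text is drawn as a header above the image, and the combined
# image is flattened into up to max_patches=2048 patches plus an attention mask.
inputs = processor(
    images=Image.open("chart.png"),
    text="Generate the underlying data table of the chart.",
    return_tensors="pt",
)
print(inputs["flattened_patches"].shape, inputs["attention_mask"].shape)
```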
base_decoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bad8d6f0fbb159629c47ea99ecd3ff6f2bc8f23ea5a10d8338b7abb77612043e
+ size 5344577573
base_decoder/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "additional_special_tokens": [
+ "<extra_id_0>",
+ "<extra_id_1>",
+ "<extra_id_2>",
+ "<extra_id_3>",
+ "<extra_id_4>",
+ "<extra_id_5>",
+ "<extra_id_6>",
+ "<extra_id_7>",
+ "<extra_id_8>",
+ "<extra_id_9>",
+ "<extra_id_10>",
+ "<extra_id_11>",
+ "<extra_id_12>",
+ "<extra_id_13>",
+ "<extra_id_14>",
+ "<extra_id_15>",
+ "<extra_id_16>",
+ "<extra_id_17>",
+ "<extra_id_18>",
+ "<extra_id_19>",
+ "<extra_id_20>",
+ "<extra_id_21>",
+ "<extra_id_22>",
+ "<extra_id_23>",
+ "<extra_id_24>",
+ "<extra_id_25>",
+ "<extra_id_26>",
+ "<extra_id_27>",
+ "<extra_id_28>",
+ "<extra_id_29>",
+ "<extra_id_30>",
+ "<extra_id_31>",
+ "<extra_id_32>",
+ "<extra_id_33>",
+ "<extra_id_34>",
+ "<extra_id_35>",
+ "<extra_id_36>",
+ "<extra_id_37>",
+ "<extra_id_38>",
+ "<extra_id_39>",
+ "<extra_id_40>",
+ "<extra_id_41>",
+ "<extra_id_42>",
+ "<extra_id_43>",
+ "<extra_id_44>",
+ "<extra_id_45>",
+ "<extra_id_46>",
+ "<extra_id_47>",
+ "<extra_id_48>",
+ "<extra_id_49>",
+ "<extra_id_50>",
+ "<extra_id_51>",
+ "<extra_id_52>",
+ "<extra_id_53>",
+ "<extra_id_54>",
+ "<extra_id_55>",
+ "<extra_id_56>",
+ "<extra_id_57>",
+ "<extra_id_58>",
+ "<extra_id_59>",
+ "<extra_id_60>",
+ "<extra_id_61>",
+ "<extra_id_62>",
+ "<extra_id_63>",
+ "<extra_id_64>",
+ "<extra_id_65>",
+ "<extra_id_66>",
+ "<extra_id_67>",
+ "<extra_id_68>",
+ "<extra_id_69>",
+ "<extra_id_70>",
+ "<extra_id_71>",
+ "<extra_id_72>",
+ "<extra_id_73>",
+ "<extra_id_74>",
+ "<extra_id_75>",
+ "<extra_id_76>",
+ "<extra_id_77>",
+ "<extra_id_78>",
+ "<extra_id_79>",
+ "<extra_id_80>",
+ "<extra_id_81>",
+ "<extra_id_82>",
+ "<extra_id_83>",
+ "<extra_id_84>",
+ "<extra_id_85>",
+ "<extra_id_86>",
+ "<extra_id_87>",
+ "<extra_id_88>",
+ "<extra_id_89>",
+ "<extra_id_90>",
+ "<extra_id_91>",
+ "<extra_id_92>",
+ "<extra_id_93>",
+ "<extra_id_94>",
+ "<extra_id_95>",
+ "<extra_id_96>",
+ "<extra_id_97>",
+ "<extra_id_98>",
+ "<extra_id_99>"
+ ],
+ "eos_token": "</s>",
+ "pad_token": "<pad>",
+ "unk_token": "<unk>"
+ }
base_decoder/state_dict.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:656e9fe5d7a7b443752b08bb1e5bdd9e5b00a1c45bc1af33fb39ae6b218ec495
+ size 10689225585
base_decoder/title_type/config.json ADDED
@@ -0,0 +1,181 @@
+ {
+ "_commit_hash": null,
+ "_name_or_path": "/cpfs01/shared/ADLab/hug_ckpts/pix2struct-base",
+ "architectures": [
+ "Pix2StructForConditionalGeneration"
+ ],
+ "decoder_start_token_id": 0,
+ "eos_token_id": 1,
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "is_encoder_decoder": true,
+ "is_vqa": false,
+ "model_type": "pix2struct",
+ "pad_token_id": 0,
+ "text_config": {
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "d_ff": 2048,
+ "d_kv": 64,
+ "decoder_start_token_id": 0,
+ "dense_act_fn": "gelu_new",
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dropout_rate": 0.2,
+ "early_stopping": false,
+ "encoder_hidden_size": 768,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": 1,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "hidden_size": 768,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "is_decoder": true,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "layer_norm_epsilon": 1e-06,
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "pix2struct_text_model",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_heads": 12,
+ "num_layers": 12,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": 0,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "relative_attention_max_distance": 128,
+ "relative_attention_num_buckets": 32,
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": false,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "transformers_version": "4.31.0",
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "use_cache": false,
+ "vocab_size": 50244
+ },
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": null,
+ "vision_config": {
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "attention_dropout": 0.2,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "d_ff": 2048,
+ "d_kv": 64,
+ "decoder_start_token_id": null,
+ "dense_act_fn": "gelu_new",
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dropout_rate": 0.2,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "hidden_dropout_prob": 0.2,
+ "hidden_size": 768,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "initializer_factor": 1.0,
+ "initializer_range": 0.02,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "layer_norm_bias": false,
+ "layer_norm_eps": 1e-06,
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "pix2struct_vision_model",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 12,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_channels": 3,
+ "num_hidden_layers": 12,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "patch_embed_hidden_size": 768,
+ "patch_size": 16,
+ "prefix": null,
+ "problem_type": null,
+ "projection_dim": 768,
+ "pruned_heads": {},
+ "relative_attention_max_distance": 128,
+ "relative_attention_num_buckets": 32,
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "seq_len": 4096,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "transformers_version": "4.31.0",
+ "typical_p": 1.0,
+ "use_bfloat16": false
+ }
+ }
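`title_type/` carries a second, much smaller Pix2Struct checkpoint (hidden size 768, 12 layers, initialized from `pix2struct-base`, with heavier 0.2 dropout). The folder name suggests it predicts the chart title and chart type, but that role is inferred from the path rather than stated in this commit. A loading sketch under that assumption:

```python
from transformers import Pix2StructForConditionalGeneration

# Auxiliary pix2struct-base-sized checkpoint stored under base_decoder/title_type/.
title_type_model = Pix2StructForConditionalGeneration.from_pretrained(
    "./base_decoder/title_type"
)
cfg = title_type_model.config.text_config
print(cfg.num_layers, cfg.num_heads, cfg.hidden_size)  # 12, 12, 768 per the config above
```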
base_decoder/title_type/generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "_from_model_config": true,
+ "decoder_start_token_id": 0,
+ "eos_token_id": 1,
+ "pad_token_id": 0,
+ "transformers_version": "4.31.0",
+ "use_cache": false
+ }
base_decoder/title_type/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:711784f67909211daed7eb2ff752012cca710c783d9bfd895d694c9b3350504a
+ size 1129242049
base_decoder/title_type/state_dict.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:062b5f84cb97000f82e8029bf2f023c1d14db5ecd5b35ee751b59e442bfa4963
+ size 2258526302
base_decoder/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
base_decoder/tokenizer_config.json ADDED
@@ -0,0 +1,113 @@
+ {
+ "additional_special_tokens": [
+ "<extra_id_0>",
+ "<extra_id_1>",
+ "<extra_id_2>",
+ "<extra_id_3>",
+ "<extra_id_4>",
+ "<extra_id_5>",
+ "<extra_id_6>",
+ "<extra_id_7>",
+ "<extra_id_8>",
+ "<extra_id_9>",
+ "<extra_id_10>",
+ "<extra_id_11>",
+ "<extra_id_12>",
+ "<extra_id_13>",
+ "<extra_id_14>",
+ "<extra_id_15>",
+ "<extra_id_16>",
+ "<extra_id_17>",
+ "<extra_id_18>",
+ "<extra_id_19>",
+ "<extra_id_20>",
+ "<extra_id_21>",
+ "<extra_id_22>",
+ "<extra_id_23>",
+ "<extra_id_24>",
+ "<extra_id_25>",
+ "<extra_id_26>",
+ "<extra_id_27>",
+ "<extra_id_28>",
+ "<extra_id_29>",
+ "<extra_id_30>",
+ "<extra_id_31>",
+ "<extra_id_32>",
+ "<extra_id_33>",
+ "<extra_id_34>",
+ "<extra_id_35>",
+ "<extra_id_36>",
+ "<extra_id_37>",
+ "<extra_id_38>",
+ "<extra_id_39>",
+ "<extra_id_40>",
+ "<extra_id_41>",
+ "<extra_id_42>",
+ "<extra_id_43>",
+ "<extra_id_44>",
+ "<extra_id_45>",
+ "<extra_id_46>",
+ "<extra_id_47>",
+ "<extra_id_48>",
+ "<extra_id_49>",
+ "<extra_id_50>",
+ "<extra_id_51>",
+ "<extra_id_52>",
+ "<extra_id_53>",
+ "<extra_id_54>",
+ "<extra_id_55>",
+ "<extra_id_56>",
+ "<extra_id_57>",
+ "<extra_id_58>",
+ "<extra_id_59>",
+ "<extra_id_60>",
+ "<extra_id_61>",
+ "<extra_id_62>",
+ "<extra_id_63>",
+ "<extra_id_64>",
+ "<extra_id_65>",
+ "<extra_id_66>",
+ "<extra_id_67>",
+ "<extra_id_68>",
+ "<extra_id_69>",
+ "<extra_id_70>",
+ "<extra_id_71>",
+ "<extra_id_72>",
+ "<extra_id_73>",
+ "<extra_id_74>",
+ "<extra_id_75>",
+ "<extra_id_76>",
+ "<extra_id_77>",
+ "<extra_id_78>",
+ "<extra_id_79>",
+ "<extra_id_80>",
+ "<extra_id_81>",
+ "<extra_id_82>",
+ "<extra_id_83>",
+ "<extra_id_84>",
+ "<extra_id_85>",
+ "<extra_id_86>",
+ "<extra_id_87>",
+ "<extra_id_88>",
+ "<extra_id_89>",
+ "<extra_id_90>",
+ "<extra_id_91>",
+ "<extra_id_92>",
+ "<extra_id_93>",
+ "<extra_id_94>",
+ "<extra_id_95>",
+ "<extra_id_96>",
+ "<extra_id_97>",
+ "<extra_id_98>",
+ "<extra_id_99>"
+ ],
+ "eos_token": "</s>",
+ "extra_ids": 100,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "processor_class": "Pix2StructProcessor",
+ "sp_model_kwargs": {},
+ "special_tokens_map_file": "/cpfs01/user/zhangbo/code/hug_ckpts/pix2struct-chartqa-base/special_tokens_map.json",
+ "tokenizer_class": "T5Tokenizer",
+ "unk_token": "<unk>"
+ }
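The tokenizer is a T5-style SentencePiece tokenizer with 100 `<extra_id_*>` sentinel tokens, matching the `special_tokens_map.json` shown earlier, and it is shared by both Pix2Struct checkpoints under `base_decoder/`. A minimal sketch; the sample sentence is illustrative only:

```python
from transformers import AutoTokenizer

# tokenizer_config.json declares a T5Tokenizer; AutoTokenizer resolves it from
# base_decoder/tokenizer.json (shown above as too large to render in the diff).
tokenizer = AutoTokenizer.from_pretrained("./base_decoder")
enc = tokenizer("Monthly sales of product A", return_tensors="pt")
print(enc.input_ids.shape, tokenizer.decode(enc.input_ids[0]))
```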
instruction_adapter/mlp_classifier.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d97331e07f2ff53fbef467e859744e479a5388bf4cff79d3761ad28597f41cd
+ size 3536585
instruction_adapter/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91f29d95ca32b437a3accdb8d5ff49ef5e8e81e00cc3d1c3edab69726ac7e8e1
+ size 50248
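The `instruction_adapter/` files are opaque binaries in this diff. A heavily hedged sketch of how they might be opened, assuming `vectorizer.pkl` is a pickled text vectorizer (for example a scikit-learn TF-IDF vectorizer) and `mlp_classifier.pth` a PyTorch checkpoint for a small MLP that routes an instruction to the appropriate decoder; neither class is documented in this commit:

```python
import pickle
import torch

# Assumed layout: a pickled feature extractor plus a torch-saved MLP classifier.
with open("instruction_adapter/vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

mlp = torch.load("instruction_adapter/mlp_classifier.pth", map_location="cpu")
print(type(vectorizer).__name__, type(mlp).__name__)
```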