chrlu committed on
Commit 792bab4
1 Parent(s): 7188c8a

Model save
README.md CHANGED
@@ -1,17 +1,11 @@
 ---
 license: other
-base_model: Columbia-NLP/gemma-2b-zephyr-sft
+base_model: HuggingFaceH4/zephyr-7b-gemma-sft-v0.1
 tags:
-- alignment-handbook
-- trl
-- dpo
-- generated_from_trainer
 - trl
 - dpo
 - alignment-handbook
 - generated_from_trainer
-datasets:
-- argilla/dpo-mix-7k
 model-index:
 - name: zephyr-7b-gemma-dpo
   results: []
@@ -22,17 +16,17 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zephyr-7b-gemma-dpo
 
-This model is a fine-tuned version of [Columbia-NLP/gemma-2b-zephyr-sft](https://huggingface.co/Columbia-NLP/gemma-2b-zephyr-sft) on the argilla/dpo-mix-7k dataset.
+This model is a fine-tuned version of [HuggingFaceH4/zephyr-7b-gemma-sft-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-sft-v0.1) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.5964
-- Rewards/chosen: 0.3541
-- Rewards/rejected: 0.1118
-- Rewards/accuracies: 0.6562
-- Rewards/margins: 0.2424
-- Logps/rejected: -373.3665
-- Logps/chosen: -371.9911
-- Logits/rejected: -12.6656
-- Logits/chosen: -13.2953
+- Loss: 0.4673
+- Rewards/chosen: -4.1293
+- Rewards/rejected: -5.7148
+- Rewards/accuracies: 0.6979
+- Rewards/margins: 1.5855
+- Logps/rejected: -476.3664
+- Logps/chosen: -446.2537
+- Logits/rejected: 91.1323
+- Logits/chosen: 96.7315
 
 ## Model description
 
@@ -69,7 +63,7 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
 |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
-| 0.5732 | 1.8957 | 100 | 0.5972 | 0.3534 | 0.1096 | 0.6771 | 0.2437 | -373.4094 | -372.0067 | -12.6654 | -13.2926 |
+| 0.1585 | 1.8957 | 100 | 0.4673 | -4.1293 | -5.7148 | 0.6979 | 1.5855 | -476.3664 | -446.2537 | 91.1323 | 96.7315 |
 
 
 ### Framework versions
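
As a usage note for the updated card: a minimal sketch for loading this DPO-tuned checkpoint with `transformers` follows. The repo id is an assumption (the actual Hub path of this commit may differ), and the chat-template call relies on the tokenizer shipped with the SFT base.

```python
# A minimal sketch: load the DPO-tuned checkpoint and generate one reply.
# The repo id is hypothetical; substitute the actual Hub path of this model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "chrlu/zephyr-7b-gemma-dpo"  # assumption, not confirmed by this commit
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{"role": "user", "content": "Explain DPO in one sentence."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```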
all_results.json CHANGED
@@ -14,9 +14,9 @@
   "eval_samples_per_second": 51.65,
   "eval_steps_per_second": 1.653,
   "total_flos": 0.0,
-  "train_loss": 0.627926590350958,
-  "train_runtime": 756.4701,
+  "train_loss": 0.39153398688022906,
+  "train_runtime": 2311.0387,
   "train_samples": 6750,
-  "train_samples_per_second": 17.846,
-  "train_steps_per_second": 0.137
+  "train_samples_per_second": 5.842,
+  "train_steps_per_second": 0.045
 }
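
The new run is roughly three times slower in wall-clock terms (2311 s vs. 756 s) at about a third of the per-second throughput, consistent with a much larger base model. A quick sketch, assuming the HF Trainer's usual formula of configured epochs × samples ÷ runtime with 2 epochs configured, reproduces the logged rates:

```python
# Sanity-check the throughput figures reported in all_results.json.
# Assumes 2 configured training epochs (training stops at epoch ~1.97
# because the last incomplete batch is dropped), the HF Trainer convention.
train_samples = 6750
num_train_epochs = 2          # assumption from the recipe, not logged here
train_runtime = 2311.0387     # seconds
total_steps = 104             # final step in trainer_state.json

print(round(train_samples * num_train_epochs / train_runtime, 3))  # 5.842
print(round(total_steps / train_runtime, 3))                       # 0.045
```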
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "Columbia-NLP/gemma-2b-zephyr-sft",
+  "_name_or_path": "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1",
   "architectures": [
     "GemmaForCausalLM"
   ],
@@ -10,20 +10,20 @@
   "head_dim": 256,
   "hidden_act": "gelu",
   "hidden_activation": null,
-  "hidden_size": 2048,
+  "hidden_size": 3072,
   "initializer_range": 0.02,
-  "intermediate_size": 16384,
+  "intermediate_size": 24576,
   "max_position_embeddings": 8192,
   "model_type": "gemma",
-  "num_attention_heads": 8,
-  "num_hidden_layers": 18,
-  "num_key_value_heads": 1,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 16,
   "pad_token_id": 0,
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 10000.0,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.40.1",
-  "use_cache": true,
+  "use_cache": false,
   "vocab_size": 256000
 }
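
The new dimensions are those of the 7B-class Gemma rather than the 2B variant (wider hidden and MLP sizes, 28 layers, full multi-head KV instead of multi-query attention). A sketch of the implied parameter count, assuming tied input/output embeddings as in stock Gemma, reproduces the sharded index's `total_size` exactly at two bytes per bfloat16 parameter:

```python
# Recompute the parameter count implied by the new config.json values.
# Assumes tied input/output embeddings (no separate lm_head), as in Gemma.
hidden, intermediate, layers = 3072, 24576, 28
heads, kv_heads, head_dim = 16, 16, 256
vocab = 256000

attn = hidden * heads * head_dim * 2       # q_proj + o_proj
attn += hidden * kv_heads * head_dim * 2   # k_proj + v_proj
mlp = 3 * hidden * intermediate            # gate_proj, up_proj, down_proj
norms = 2 * hidden                         # input + post-attention RMSNorm

params = vocab * hidden                    # embed_tokens (shared with output)
params += layers * (attn + mlp + norms)
params += hidden                           # final model.norm

print(params)      # 8537680896 parameters (~8.5B)
print(params * 2)  # 17075361792 bytes in bfloat16 == the index's total_size
```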
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7c5a7396241c9049d5aa615a0081a16f922ac27a595fad308da7be35b297a132
+oid sha256:b26faf7dcff7b7ca7bbf4ebc9d54968ab56cd1bbe5b3f4059d0ec34b7b1ccdd2
 size 4995496656
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd5dd371acf66dbd26a13036c2b42efee4bf540edd178d9cb3a225eccd87d21c
+oid sha256:98a41992ae6f80c80eaa24e7d8dbecab5d07c2802028c109568fe70565b4c6d8
 size 4982953168
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:836f574fd62d28f4d282b81d3712708535d3eadc45ddd0509c932611129327c5
+oid sha256:5d1d7ab5de3f2e26234060bf0c99e343d3a84489614f455b267bd22f059fc862
 size 4982953200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85fb13399e2932fccc4212b3907e81ae00bcdf3dbbb785a234bceaa501d29091
+oid sha256:07f5983e107d05b629942a14afa7af7fe7e3836b05bc872e472789542c0f95b6
 size 2113988336
model.safetensors.index.json CHANGED
@@ -1,171 +1,261 @@
 {
   "metadata": {
-    "total_size": 5012344832
+    "total_size": 17075361792
   },
   "weight_map": {
-    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
-    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.17.input_layernorm.weight": "model-00002-of-00002.safetensors",
-    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
-    "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
-    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
-    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
-    "model.norm.weight": "model-00002-of-00002.safetensors"
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors"
   }
 }
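
A small sketch for checking a download of this commit against the index: the sum of the four shard files must be at least `metadata.total_size`, since `total_size` counts tensor bytes only while each safetensors file carries a small JSON header on top.

```python
# Verify the sharded checkpoint against model.safetensors.index.json.
# metadata.total_size counts tensor bytes only; each shard adds a header.
import json
import os

with open("model.safetensors.index.json") as f:
    index = json.load(f)

shards = sorted(set(index["weight_map"].values()))
assert len(shards) == 4  # model-00001 ... model-00004

bytes_on_disk = sum(os.path.getsize(s) for s in shards)
print(bytes_on_disk, index["metadata"]["total_size"])
assert bytes_on_disk >= index["metadata"]["total_size"]
```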
runs/Apr27_16-12-58_660111d13776/events.out.tfevents.1714230903.660111d13776.64079.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b97fd433bfa6de3447258f58730d6d4afae52966b1f8d0359a70e2097f3673e6
+size 9663
runs/Apr27_19-16-30_660111d13776/events.out.tfevents.1714241901.660111d13776.66670.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:945218278ec2aa200d3ae01a36d4e1429890a97741bbdfd88e455df07f081f0c
+size 13446
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
   "epoch": 1.971563981042654,
   "total_flos": 0.0,
-  "train_loss": 0.627926590350958,
-  "train_runtime": 756.4701,
+  "train_loss": 0.39153398688022906,
+  "train_runtime": 2311.0387,
   "train_samples": 6750,
-  "train_samples_per_second": 17.846,
-  "train_steps_per_second": 0.137
+  "train_samples_per_second": 5.842,
+  "train_steps_per_second": 0.045
 }
trainer_state.json CHANGED
@@ -10,12 +10,12 @@
   "log_history": [
     {
       "epoch": 0.018957345971563982,
-      "grad_norm": 15.786988646394411,
+      "grad_norm": 132.15360444004384,
       "learning_rate": 4.545454545454545e-08,
-      "logits/chosen": -13.905267715454102,
-      "logits/rejected": -14.118387222290039,
-      "logps/chosen": -350.8895263671875,
-      "logps/rejected": -446.6286926269531,
+      "logits/chosen": 117.53560638427734,
+      "logits/rejected": 126.8960952758789,
+      "logps/chosen": -335.40118408203125,
+      "logps/rejected": -439.16552734375,
       "loss": 0.6931,
       "rewards/accuracies": 0.0,
       "rewards/chosen": 0.0,
@@ -25,178 +25,178 @@
     },
     {
       "epoch": 0.1895734597156398,
-      "grad_norm": 15.908099576913655,
+      "grad_norm": 132.3674027987073,
       "learning_rate": 4.545454545454545e-07,
-      "logits/chosen": -14.040081024169922,
-      "logits/rejected": -14.157392501831055,
-      "logps/chosen": -416.2701416015625,
-      "logps/rejected": -449.4697265625,
-      "loss": 0.693,
-      "rewards/accuracies": 0.5138888955116272,
-      "rewards/chosen": 0.0006088384543545544,
-      "rewards/margins": 0.008209776133298874,
-      "rewards/rejected": -0.007600938435643911,
+      "logits/chosen": 135.01699829101562,
+      "logits/rejected": 138.37664794921875,
+      "logps/chosen": -396.05718994140625,
+      "logps/rejected": -439.1203918457031,
+      "loss": 0.7127,
+      "rewards/accuracies": 0.4583333432674408,
+      "rewards/chosen": -0.0030322629027068615,
+      "rewards/margins": -0.013390823267400265,
+      "rewards/rejected": 0.010358559899032116,
       "step": 10
     },
     {
       "epoch": 0.3791469194312796,
-      "grad_norm": 14.229474825008781,
+      "grad_norm": 131.21733523095625,
       "learning_rate": 4.885348141000122e-07,
-      "logits/chosen": -13.39338207244873,
-      "logits/rejected": -13.542058944702148,
-      "logps/chosen": -392.9753723144531,
-      "logps/rejected": -427.68096923828125,
-      "loss": 0.6892,
-      "rewards/accuracies": 0.5062500238418579,
-      "rewards/chosen": 0.010071685537695885,
-      "rewards/margins": 0.003802267834544182,
-      "rewards/rejected": 0.006269416771829128,
+      "logits/chosen": 121.60444641113281,
+      "logits/rejected": 125.29842376708984,
+      "logps/chosen": -370.2664489746094,
+      "logps/rejected": -422.78851318359375,
+      "loss": 0.6459,
+      "rewards/accuracies": 0.612500011920929,
+      "rewards/chosen": 0.10727670043706894,
+      "rewards/margins": 0.247134730219841,
+      "rewards/rejected": -0.13985800743103027,
       "step": 20
     },
     {
      "epoch": 0.5687203791469194,
-      "grad_norm": 15.853985724357454,
+      "grad_norm": 117.90232463642135,
       "learning_rate": 4.5025027361734613e-07,
-      "logits/chosen": -14.269427299499512,
-      "logits/rejected": -13.808093070983887,
-      "logps/chosen": -412.9443359375,
-      "logps/rejected": -428.38494873046875,
-      "loss": 0.674,
-      "rewards/accuracies": 0.612500011920929,
-      "rewards/chosen": 0.04771440848708153,
-      "rewards/margins": 0.035354893654584885,
-      "rewards/rejected": 0.012359511107206345,
+      "logits/chosen": 142.974853515625,
+      "logits/rejected": 136.52386474609375,
+      "logps/chosen": -424.7781677246094,
+      "logps/rejected": -469.64813232421875,
+      "loss": 0.5746,
+      "rewards/accuracies": 0.6937500238418579,
+      "rewards/chosen": -1.6156466007232666,
+      "rewards/margins": 0.8666501045227051,
+      "rewards/rejected": -2.4822967052459717,
       "step": 30
     },
     {
       "epoch": 0.7582938388625592,
-      "grad_norm": 14.687978809678542,
+      "grad_norm": 104.91283452119073,
       "learning_rate": 3.893311157806091e-07,
-      "logits/chosen": -13.886492729187012,
-      "logits/rejected": -13.28197956085205,
-      "logps/chosen": -374.98211669921875,
-      "logps/rejected": -366.5968322753906,
-      "loss": 0.657,
-      "rewards/accuracies": 0.65625,
-      "rewards/chosen": 0.13442906737327576,
-      "rewards/margins": 0.07902240008115768,
-      "rewards/rejected": 0.05540664866566658,
+      "logits/chosen": 126.9936752319336,
+      "logits/rejected": 115.53365325927734,
+      "logps/chosen": -399.81353759765625,
+      "logps/rejected": -426.99853515625,
+      "loss": 0.5456,
+      "rewards/accuracies": 0.737500011920929,
+      "rewards/chosen": -2.2809689044952393,
+      "rewards/margins": 1.1751956939697266,
+      "rewards/rejected": -3.456164598464966,
       "step": 40
     },
     {
       "epoch": 0.9478672985781991,
-      "grad_norm": 15.872142673244408,
+      "grad_norm": 123.57780236639618,
       "learning_rate": 3.126631330646801e-07,
-      "logits/chosen": -14.917936325073242,
-      "logits/rejected": -14.90648078918457,
-      "logps/chosen": -429.6836853027344,
-      "logps/rejected": -480.3504943847656,
-      "loss": 0.6344,
-      "rewards/accuracies": 0.6875,
-      "rewards/chosen": 0.24091288447380066,
-      "rewards/margins": 0.1330389827489853,
-      "rewards/rejected": 0.10787389427423477,
+      "logits/chosen": 142.1190643310547,
+      "logits/rejected": 146.2515411376953,
+      "logps/chosen": -456.97979736328125,
+      "logps/rejected": -540.1392822265625,
+      "loss": 0.489,
+      "rewards/accuracies": 0.7875000238418579,
+      "rewards/chosen": -1.891798734664917,
+      "rewards/margins": 1.2988468408584595,
+      "rewards/rejected": -3.190645456314087,
      "step": 50
     },
     {
       "epoch": 1.1374407582938388,
-      "grad_norm": 14.061428605486398,
+      "grad_norm": 67.1680971334559,
       "learning_rate": 2.2891223348923882e-07,
-      "logits/chosen": -14.622962951660156,
-      "logits/rejected": -14.403157234191895,
-      "logps/chosen": -415.7464904785156,
-      "logps/rejected": -441.731201171875,
-      "loss": 0.6063,
-      "rewards/accuracies": 0.7437499761581421,
-      "rewards/chosen": 0.3395090103149414,
-      "rewards/margins": 0.22218124568462372,
-      "rewards/rejected": 0.11732780933380127,
+      "logits/chosen": 133.56114196777344,
+      "logits/rejected": 137.20738220214844,
+      "logps/chosen": -449.55303955078125,
+      "logps/rejected": -534.8367919921875,
+      "loss": 0.3117,
+      "rewards/accuracies": 0.887499988079071,
+      "rewards/chosen": -2.5773684978485107,
+      "rewards/margins": 2.346193790435791,
+      "rewards/rejected": -4.923562049865723,
       "step": 60
     },
     {
       "epoch": 1.3270142180094786,
-      "grad_norm": 12.963152293888875,
+      "grad_norm": 48.54475300946312,
       "learning_rate": 1.4754491880085317e-07,
-      "logits/chosen": -14.022384643554688,
-      "logits/rejected": -13.828951835632324,
-      "logps/chosen": -382.23468017578125,
-      "logps/rejected": -418.2818908691406,
-      "loss": 0.6011,
-      "rewards/accuracies": 0.706250011920929,
-      "rewards/chosen": 0.3396778702735901,
-      "rewards/margins": 0.22157195210456848,
-      "rewards/rejected": 0.118105947971344,
+      "logits/chosen": 125.71492004394531,
+      "logits/rejected": 127.68719482421875,
+      "logps/chosen": -426.90228271484375,
+      "logps/rejected": -528.0679321289062,
+      "loss": 0.195,
+      "rewards/accuracies": 0.9437500238418579,
+      "rewards/chosen": -2.9879212379455566,
+      "rewards/margins": 2.7397806644439697,
+      "rewards/rejected": -5.727701663970947,
       "step": 70
     },
     {
       "epoch": 1.5165876777251186,
-      "grad_norm": 12.394681314131397,
+      "grad_norm": 55.371866892062,
       "learning_rate": 7.775827023107834e-08,
-      "logits/chosen": -13.705121040344238,
-      "logits/rejected": -14.205709457397461,
-      "logps/chosen": -367.263427734375,
-      "logps/rejected": -423.30841064453125,
-      "loss": 0.5788,
-      "rewards/accuracies": 0.706250011920929,
-      "rewards/chosen": 0.36119210720062256,
-      "rewards/margins": 0.3365553319454193,
-      "rewards/rejected": 0.024636749178171158,
+      "logits/chosen": 111.7248306274414,
+      "logits/rejected": 128.3420867919922,
+      "logps/chosen": -427.53106689453125,
+      "logps/rejected": -546.7640991210938,
+      "loss": 0.1651,
+      "rewards/accuracies": 0.949999988079071,
+      "rewards/chosen": -3.693999767303467,
+      "rewards/margins": 2.9823195934295654,
+      "rewards/rejected": -6.676319122314453,
       "step": 80
     },
     {
       "epoch": 1.7061611374407581,
-      "grad_norm": 14.456589635016153,
+      "grad_norm": 40.99464664899818,
       "learning_rate": 2.7440387297912122e-08,
-      "logits/chosen": -13.98394775390625,
-      "logits/rejected": -14.161648750305176,
-      "logps/chosen": -399.45458984375,
-      "logps/rejected": -447.48828125,
-      "loss": 0.5766,
-      "rewards/accuracies": 0.78125,
-      "rewards/chosen": 0.3995341658592224,
-      "rewards/margins": 0.34082064032554626,
-      "rewards/rejected": 0.05871356278657913,
+      "logits/chosen": 110.8941879272461,
+      "logits/rejected": 123.70848083496094,
+      "logps/chosen": -457.2183532714844,
+      "logps/rejected": -575.8634033203125,
+      "loss": 0.1557,
+      "rewards/accuracies": 0.9750000238418579,
+      "rewards/chosen": -3.5393664836883545,
+      "rewards/margins": 3.369715929031372,
+      "rewards/rejected": -6.909082889556885,
       "step": 90
     },
     {
       "epoch": 1.8957345971563981,
-      "grad_norm": 13.44211674398592,
+      "grad_norm": 45.385328063823785,
       "learning_rate": 2.27878296044029e-09,
-      "logits/chosen": -14.160197257995605,
-      "logits/rejected": -14.141824722290039,
-      "logps/chosen": -392.3072509765625,
-      "logps/rejected": -421.604248046875,
-      "loss": 0.5732,
-      "rewards/accuracies": 0.7250000238418579,
-      "rewards/chosen": 0.4029002785682678,
-      "rewards/margins": 0.27652695775032043,
-      "rewards/rejected": 0.1263733208179474,
+      "logits/chosen": 117.1551284790039,
+      "logits/rejected": 117.0487060546875,
+      "logps/chosen": -446.9934997558594,
+      "logps/rejected": -541.2728881835938,
+      "loss": 0.1585,
+      "rewards/accuracies": 0.925000011920929,
+      "rewards/chosen": -3.250919818878174,
+      "rewards/margins": 2.9745240211486816,
+      "rewards/rejected": -6.2254438400268555,
       "step": 100
     },
     {
       "epoch": 1.8957345971563981,
-      "eval_logits/chosen": -13.292621612548828,
-      "eval_logits/rejected": -12.66539478302002,
-      "eval_logps/chosen": -372.0066833496094,
-      "eval_logps/rejected": -373.4093933105469,
-      "eval_loss": 0.5971602201461792,
-      "eval_rewards/accuracies": 0.6770833134651184,
-      "eval_rewards/chosen": 0.3533553183078766,
-      "eval_rewards/margins": 0.24372106790542603,
-      "eval_rewards/rejected": 0.10963428020477295,
-      "eval_runtime": 20.0916,
-      "eval_samples_per_second": 37.329,
-      "eval_steps_per_second": 1.195,
+      "eval_logits/chosen": 96.73149871826172,
+      "eval_logits/rejected": 91.1323013305664,
+      "eval_logps/chosen": -446.253662109375,
+      "eval_logps/rejected": -476.3663635253906,
+      "eval_loss": 0.46732592582702637,
+      "eval_rewards/accuracies": 0.6979166865348816,
+      "eval_rewards/chosen": -4.1292724609375,
+      "eval_rewards/margins": 1.5854991674423218,
+      "eval_rewards/rejected": -5.714771270751953,
+      "eval_runtime": 120.4793,
+      "eval_samples_per_second": 6.225,
+      "eval_steps_per_second": 0.199,
       "step": 100
     },
     {
       "epoch": 1.971563981042654,
       "step": 104,
       "total_flos": 0.0,
-      "train_loss": 0.627926590350958,
-      "train_runtime": 756.4701,
-      "train_samples_per_second": 17.846,
-      "train_steps_per_second": 0.137
+      "train_loss": 0.39153398688022906,
+      "train_runtime": 2311.0387,
+      "train_samples_per_second": 5.842,
+      "train_steps_per_second": 0.045
     }
   ],
   "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c83a4cd67c56c86e8779774ef2b3c0c2d20d775dd7b0aa4eba03778d916c3903
+oid sha256:79ee78a4306adfc04ffa07fc0ca8acbb9d3417b9d7c9f4adaf815a8d83ea6a24
 size 6264