pszemraj committed on
Commit cba3201
1 Parent(s): 39bb379
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "do_sample": true,
+ "eos_token_id": 2,
+ "transformers_version": "4.40.2"
+ }
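
A note on how this file is consumed: transformers reads generation_config.json automatically when the model is loaded, and the values can also be inspected on their own. A minimal sketch, assuming the transformers library is installed; the repo id below is a placeholder, not something this commit defines:

from transformers import GenerationConfig

# "your-org/your-model" is hypothetical; a local directory holding the
# generation_config.json above would work the same way.
gen_cfg = GenerationConfig.from_pretrained("your-org/your-model")
print(gen_cfg.do_sample)     # True, per the file above
print(gen_cfg.bos_token_id)  # 1
print(gen_cfg.eos_token_id)  # 2
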
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:46c5e3e1c7f14c651b8954733db0fe00d1853e8a5bd0eccae1b0073ec7067bf4
+ size 4949453792
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:259fe40447303fc97bb4d606b03ee2f84df297340acc35ec9d0de47df6cb070a
+ size 4999819336
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b698cf45add049b21c866559ed166c0aff5ef78d4582a2cc8b37d4242b239cde
+ size 1929457496
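
The three .safetensors entries above are Git LFS pointer files: the repository itself stores only the object id (a sha256 digest) and the byte size, while the actual shard lives in LFS storage. A minimal sketch of verifying a downloaded shard against its pointer, assuming the shard sits in the working directory:

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream in 1 MiB chunks so a multi-GB shard never has to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

# The oid recorded in the first pointer above.
expected = "46c5e3e1c7f14c651b8954733db0fe00d1853e8a5bd0eccae1b0073ec7067bf4"
assert sha256_of("model-00001-of-00003.safetensors") == expected
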
model.safetensors.index.json ADDED
@@ -0,0 +1,244 @@
+ {
+ "metadata": {
+ "total_size": 11878703104
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00003-of-00003.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.norm.weight": "model-00003-of-00003.safetensors"
+ }
+ }
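
The index maps every tensor name to the shard that holds it, and "total_size" records the combined byte count. Loaders read this file first and then open only the shards they need; AutoModelForCausalLM.from_pretrained performs this resolution automatically, but a minimal sketch makes the mechanism visible (assumes the safetensors package, a PyTorch install, and the three shards downloaded next to the index):

import json

from safetensors import safe_open

# Read the index first; "weight_map" says which shard stores each tensor.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.layers.0.self_attn.q_proj.weight"
shard = index["weight_map"][name]  # -> "model-00001-of-00003.safetensors"

# Open only the shard that contains the tensor we asked for.
with safe_open(shard, framework="pt") as st:
    tensor = st.get_tensor(name)
print(tensor.shape)
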
trainer_state.json ADDED
@@ -0,0 +1,1923 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.3821579984474831,
+ "eval_steps": 400,
+ "global_step": 1600,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.00023884874902967696,
+ "eval_loss": 1.5979785919189453,
+ "eval_runtime": 224.9995,
+ "eval_samples_per_second": 3.778,
+ "eval_steps_per_second": 3.778,
+ "step": 1
+ },
+ {
+ "epoch": 0.0014330924941780617,
+ "grad_norm": 20.875,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 1.8691,
+ "step": 6
+ },
+ {
+ "epoch": 0.0028661849883561234,
+ "grad_norm": 14.0625,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.8156,
+ "step": 12
+ },
+ {
+ "epoch": 0.004299277482534185,
+ "grad_norm": 11.1875,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 1.6925,
+ "step": 18
+ },
+ {
+ "epoch": 0.005732369976712247,
+ "grad_norm": 7.15625,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 1.612,
+ "step": 24
+ },
+ {
+ "epoch": 0.0071654624708903086,
+ "grad_norm": 7.25,
+ "learning_rate": 3e-06,
+ "loss": 1.8222,
+ "step": 30
+ },
+ {
+ "epoch": 0.00859855496506837,
+ "grad_norm": 5.71875,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 1.6277,
+ "step": 36
+ },
+ {
+ "epoch": 0.010031647459246432,
+ "grad_norm": 5.65625,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 1.5655,
+ "step": 42
+ },
+ {
+ "epoch": 0.011464739953424494,
+ "grad_norm": 6.90625,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 1.7691,
+ "step": 48
+ },
+ {
+ "epoch": 0.012897832447602555,
+ "grad_norm": 6.96875,
+ "learning_rate": 5.400000000000001e-06,
+ "loss": 1.7085,
+ "step": 54
+ },
+ {
+ "epoch": 0.014330924941780617,
+ "grad_norm": 5.3125,
+ "learning_rate": 6e-06,
+ "loss": 1.4649,
+ "step": 60
+ },
+ {
+ "epoch": 0.01576401743595868,
+ "grad_norm": 15.8125,
+ "learning_rate": 6.600000000000001e-06,
+ "loss": 1.6534,
+ "step": 66
+ },
+ {
+ "epoch": 0.01719710993013674,
+ "grad_norm": 42.75,
+ "learning_rate": 7.2000000000000005e-06,
+ "loss": 1.673,
+ "step": 72
+ },
+ {
+ "epoch": 0.018630202424314804,
+ "grad_norm": 5.5,
+ "learning_rate": 7.800000000000002e-06,
+ "loss": 1.429,
+ "step": 78
+ },
+ {
+ "epoch": 0.020063294918492864,
+ "grad_norm": 3.875,
+ "learning_rate": 8.400000000000001e-06,
+ "loss": 1.6067,
+ "step": 84
+ },
+ {
+ "epoch": 0.021496387412670927,
+ "grad_norm": 4.53125,
+ "learning_rate": 9e-06,
+ "loss": 1.4336,
+ "step": 90
+ },
+ {
+ "epoch": 0.022929479906848987,
+ "grad_norm": 4.40625,
+ "learning_rate": 9.600000000000001e-06,
+ "loss": 1.5998,
+ "step": 96
+ },
+ {
+ "epoch": 0.02436257240102705,
+ "grad_norm": 5.40625,
+ "learning_rate": 1.02e-05,
+ "loss": 1.5259,
+ "step": 102
+ },
+ {
+ "epoch": 0.02579566489520511,
+ "grad_norm": 9.0,
+ "learning_rate": 1.0800000000000002e-05,
+ "loss": 1.5255,
+ "step": 108
+ },
+ {
+ "epoch": 0.027228757389383174,
+ "grad_norm": 5.34375,
+ "learning_rate": 1.14e-05,
+ "loss": 1.5375,
+ "step": 114
+ },
+ {
+ "epoch": 0.028661849883561234,
+ "grad_norm": 4.625,
+ "learning_rate": 1.2e-05,
+ "loss": 1.4729,
+ "step": 120
+ },
+ {
+ "epoch": 0.030094942377739298,
+ "grad_norm": 5.78125,
+ "learning_rate": 1.2600000000000001e-05,
+ "loss": 1.5446,
+ "step": 126
+ },
+ {
+ "epoch": 0.03152803487191736,
+ "grad_norm": 5.15625,
+ "learning_rate": 1.3200000000000002e-05,
+ "loss": 1.6895,
+ "step": 132
+ },
+ {
+ "epoch": 0.03296112736609542,
+ "grad_norm": 4.59375,
+ "learning_rate": 1.38e-05,
+ "loss": 1.6145,
+ "step": 138
+ },
+ {
+ "epoch": 0.03439421986027348,
+ "grad_norm": 4.96875,
+ "learning_rate": 1.4400000000000001e-05,
+ "loss": 1.4316,
+ "step": 144
+ },
+ {
+ "epoch": 0.035827312354451545,
+ "grad_norm": 4.71875,
+ "learning_rate": 1.5000000000000002e-05,
+ "loss": 1.5619,
+ "step": 150
+ },
+ {
+ "epoch": 0.03726040484862961,
+ "grad_norm": 7.9375,
+ "learning_rate": 1.5600000000000003e-05,
+ "loss": 1.6608,
+ "step": 156
+ },
+ {
+ "epoch": 0.038693497342807665,
+ "grad_norm": 4.34375,
+ "learning_rate": 1.62e-05,
+ "loss": 1.6418,
+ "step": 162
+ },
+ {
+ "epoch": 0.04012658983698573,
+ "grad_norm": 4.8125,
+ "learning_rate": 1.6800000000000002e-05,
+ "loss": 1.5532,
+ "step": 168
+ },
+ {
+ "epoch": 0.04155968233116379,
+ "grad_norm": 7.90625,
+ "learning_rate": 1.7400000000000003e-05,
+ "loss": 1.6124,
+ "step": 174
+ },
+ {
+ "epoch": 0.042992774825341855,
+ "grad_norm": 5.90625,
+ "learning_rate": 1.8e-05,
+ "loss": 1.5629,
+ "step": 180
+ },
+ {
+ "epoch": 0.04442586731951991,
+ "grad_norm": 9.4375,
+ "learning_rate": 1.86e-05,
+ "loss": 1.5727,
+ "step": 186
+ },
+ {
+ "epoch": 0.045858959813697975,
+ "grad_norm": 6.34375,
+ "learning_rate": 1.9200000000000003e-05,
+ "loss": 1.4866,
+ "step": 192
+ },
+ {
+ "epoch": 0.04729205230787604,
+ "grad_norm": 10.9375,
+ "learning_rate": 1.98e-05,
+ "loss": 1.6203,
+ "step": 198
+ },
+ {
+ "epoch": 0.0487251448020541,
+ "grad_norm": 5.46875,
+ "learning_rate": 1.9999756307053947e-05,
+ "loss": 1.6003,
+ "step": 204
+ },
+ {
+ "epoch": 0.05015823729623216,
+ "grad_norm": 7.34375,
+ "learning_rate": 1.9998476951563914e-05,
+ "loss": 1.7795,
+ "step": 210
+ },
+ {
+ "epoch": 0.05159132979041022,
+ "grad_norm": 5.03125,
+ "learning_rate": 1.9996101150403543e-05,
+ "loss": 1.6262,
+ "step": 216
+ },
+ {
+ "epoch": 0.053024422284588285,
+ "grad_norm": 6.03125,
+ "learning_rate": 1.999262916410621e-05,
+ "loss": 1.5033,
+ "step": 222
+ },
+ {
+ "epoch": 0.05445751477876635,
+ "grad_norm": 6.375,
+ "learning_rate": 1.9988061373414342e-05,
+ "loss": 1.528,
+ "step": 228
+ },
+ {
+ "epoch": 0.055890607272944405,
+ "grad_norm": 5.375,
+ "learning_rate": 1.9982398279237657e-05,
+ "loss": 1.6706,
+ "step": 234
+ },
+ {
+ "epoch": 0.05732369976712247,
+ "grad_norm": 5.3125,
+ "learning_rate": 1.9975640502598243e-05,
+ "loss": 1.8826,
+ "step": 240
+ },
+ {
+ "epoch": 0.05875679226130053,
+ "grad_norm": 7.21875,
+ "learning_rate": 1.9967788784562474e-05,
+ "loss": 1.6844,
+ "step": 246
+ },
+ {
+ "epoch": 0.060189884755478595,
+ "grad_norm": 14.0,
+ "learning_rate": 1.9958843986159705e-05,
+ "loss": 1.6681,
+ "step": 252
+ },
+ {
+ "epoch": 0.06162297724965665,
+ "grad_norm": 5.3125,
+ "learning_rate": 1.9948807088287884e-05,
+ "loss": 1.5271,
+ "step": 258
+ },
+ {
+ "epoch": 0.06305606974383472,
+ "grad_norm": 5.78125,
+ "learning_rate": 1.9937679191605964e-05,
+ "loss": 1.5941,
+ "step": 264
+ },
+ {
+ "epoch": 0.06448916223801278,
+ "grad_norm": 7.75,
+ "learning_rate": 1.9925461516413224e-05,
+ "loss": 1.6754,
+ "step": 270
+ },
+ {
+ "epoch": 0.06592225473219084,
+ "grad_norm": 5.03125,
+ "learning_rate": 1.991215540251542e-05,
+ "loss": 1.6616,
+ "step": 276
+ },
+ {
+ "epoch": 0.0673553472263689,
+ "grad_norm": 5.46875,
+ "learning_rate": 1.989776230907789e-05,
+ "loss": 1.7207,
+ "step": 282
+ },
+ {
+ "epoch": 0.06878843972054696,
+ "grad_norm": 4.84375,
+ "learning_rate": 1.988228381446553e-05,
+ "loss": 1.6092,
+ "step": 288
+ },
+ {
+ "epoch": 0.07022153221472502,
+ "grad_norm": 15.625,
+ "learning_rate": 1.9865721616069695e-05,
+ "loss": 1.6828,
+ "step": 294
+ },
+ {
+ "epoch": 0.07165462470890309,
+ "grad_norm": 7.125,
+ "learning_rate": 1.9848077530122083e-05,
+ "loss": 1.7341,
+ "step": 300
+ },
+ {
+ "epoch": 0.07308771720308115,
+ "grad_norm": 10.625,
+ "learning_rate": 1.9829353491495545e-05,
+ "loss": 1.6181,
+ "step": 306
+ },
+ {
+ "epoch": 0.07452080969725922,
+ "grad_norm": 4.75,
+ "learning_rate": 1.9809551553491918e-05,
+ "loss": 1.548,
+ "step": 312
+ },
+ {
+ "epoch": 0.07595390219143727,
+ "grad_norm": 6.9375,
+ "learning_rate": 1.9788673887616852e-05,
+ "loss": 1.5703,
+ "step": 318
+ },
+ {
+ "epoch": 0.07738699468561533,
+ "grad_norm": 6.71875,
+ "learning_rate": 1.9766722783341682e-05,
+ "loss": 1.7147,
+ "step": 324
+ },
+ {
+ "epoch": 0.0788200871797934,
+ "grad_norm": 6.8125,
+ "learning_rate": 1.9743700647852356e-05,
+ "loss": 1.7598,
+ "step": 330
+ },
+ {
+ "epoch": 0.08025317967397146,
+ "grad_norm": 5.0625,
+ "learning_rate": 1.9719610005785466e-05,
+ "loss": 1.7136,
+ "step": 336
+ },
+ {
+ "epoch": 0.08168627216814951,
+ "grad_norm": 6.03125,
+ "learning_rate": 1.9694453498951392e-05,
+ "loss": 1.7161,
+ "step": 342
+ },
+ {
+ "epoch": 0.08311936466232758,
+ "grad_norm": 7.34375,
+ "learning_rate": 1.9668233886044597e-05,
+ "loss": 1.6319,
+ "step": 348
+ },
+ {
+ "epoch": 0.08455245715650564,
+ "grad_norm": 5.21875,
+ "learning_rate": 1.96409540423411e-05,
+ "loss": 1.5857,
+ "step": 354
+ },
+ {
+ "epoch": 0.08598554965068371,
+ "grad_norm": 10.6875,
+ "learning_rate": 1.961261695938319e-05,
+ "loss": 1.7632,
+ "step": 360
+ },
+ {
+ "epoch": 0.08741864214486177,
+ "grad_norm": 6.21875,
+ "learning_rate": 1.9583225744651334e-05,
+ "loss": 1.4205,
+ "step": 366
+ },
+ {
+ "epoch": 0.08885173463903982,
+ "grad_norm": 5.875,
+ "learning_rate": 1.9552783621223437e-05,
+ "loss": 1.7812,
+ "step": 372
+ },
+ {
+ "epoch": 0.0902848271332179,
+ "grad_norm": 4.46875,
+ "learning_rate": 1.9521293927421388e-05,
+ "loss": 1.5759,
+ "step": 378
+ },
+ {
+ "epoch": 0.09171791962739595,
+ "grad_norm": 6.53125,
+ "learning_rate": 1.9488760116444966e-05,
+ "loss": 1.6537,
+ "step": 384
+ },
+ {
+ "epoch": 0.09315101212157402,
+ "grad_norm": 10.8125,
+ "learning_rate": 1.945518575599317e-05,
+ "loss": 1.4973,
+ "step": 390
+ },
+ {
+ "epoch": 0.09458410461575208,
+ "grad_norm": 4.1875,
+ "learning_rate": 1.942057452787297e-05,
+ "loss": 1.578,
+ "step": 396
+ },
+ {
+ "epoch": 0.09553949961187078,
+ "eval_loss": 1.4027706384658813,
+ "eval_runtime": 224.2305,
+ "eval_samples_per_second": 3.791,
+ "eval_steps_per_second": 3.791,
+ "step": 400
+ },
+ {
+ "epoch": 0.09601719710993013,
+ "grad_norm": 3.875,
+ "learning_rate": 1.938493022759556e-05,
+ "loss": 1.6032,
+ "step": 402
+ },
+ {
+ "epoch": 0.0974502896041082,
+ "grad_norm": 6.125,
+ "learning_rate": 1.9348256763960146e-05,
+ "loss": 1.7055,
+ "step": 408
+ },
+ {
+ "epoch": 0.09888338209828626,
+ "grad_norm": 5.84375,
+ "learning_rate": 1.9310558158625286e-05,
+ "loss": 1.7454,
+ "step": 414
+ },
+ {
+ "epoch": 0.10031647459246432,
+ "grad_norm": 7.0625,
+ "learning_rate": 1.9271838545667876e-05,
+ "loss": 1.7345,
+ "step": 420
+ },
+ {
+ "epoch": 0.10174956708664239,
+ "grad_norm": 6.125,
+ "learning_rate": 1.923210217112981e-05,
+ "loss": 1.6099,
+ "step": 426
+ },
+ {
+ "epoch": 0.10318265958082044,
+ "grad_norm": 4.59375,
+ "learning_rate": 1.9191353392552346e-05,
+ "loss": 1.652,
+ "step": 432
+ },
+ {
+ "epoch": 0.10461575207499851,
+ "grad_norm": 5.96875,
+ "learning_rate": 1.914959667849825e-05,
+ "loss": 1.7092,
+ "step": 438
+ },
+ {
+ "epoch": 0.10604884456917657,
+ "grad_norm": 6.4375,
+ "learning_rate": 1.910683660806177e-05,
+ "loss": 1.7545,
+ "step": 444
+ },
+ {
+ "epoch": 0.10748193706335463,
+ "grad_norm": 10.4375,
+ "learning_rate": 1.9063077870366504e-05,
+ "loss": 1.5287,
+ "step": 450
+ },
+ {
+ "epoch": 0.1089150295575327,
+ "grad_norm": 7.84375,
+ "learning_rate": 1.901832526405114e-05,
+ "loss": 1.7219,
+ "step": 456
+ },
+ {
+ "epoch": 0.11034812205171075,
+ "grad_norm": 9.5625,
+ "learning_rate": 1.8972583696743284e-05,
+ "loss": 1.665,
+ "step": 462
+ },
+ {
+ "epoch": 0.11178121454588881,
+ "grad_norm": 10.0625,
+ "learning_rate": 1.892585818452126e-05,
+ "loss": 1.6363,
+ "step": 468
+ },
+ {
+ "epoch": 0.11321430704006688,
+ "grad_norm": 5.78125,
+ "learning_rate": 1.8878153851364013e-05,
+ "loss": 1.543,
+ "step": 474
+ },
+ {
+ "epoch": 0.11464739953424494,
+ "grad_norm": 6.125,
+ "learning_rate": 1.8829475928589272e-05,
+ "loss": 1.5826,
+ "step": 480
+ },
+ {
+ "epoch": 0.11608049202842301,
+ "grad_norm": 4.8125,
+ "learning_rate": 1.8779829754279806e-05,
+ "loss": 1.581,
+ "step": 486
+ },
+ {
+ "epoch": 0.11751358452260106,
+ "grad_norm": 9.75,
+ "learning_rate": 1.8729220772698096e-05,
+ "loss": 1.5841,
+ "step": 492
+ },
+ {
+ "epoch": 0.11894667701677912,
+ "grad_norm": 13.3125,
+ "learning_rate": 1.8677654533689287e-05,
+ "loss": 1.6944,
+ "step": 498
+ },
+ {
+ "epoch": 0.12037976951095719,
+ "grad_norm": 4.96875,
+ "learning_rate": 1.8625136692072577e-05,
+ "loss": 1.6203,
+ "step": 504
+ },
+ {
+ "epoch": 0.12181286200513525,
+ "grad_norm": 6.3125,
+ "learning_rate": 1.8571673007021124e-05,
+ "loss": 1.5639,
+ "step": 510
+ },
+ {
+ "epoch": 0.1232459544993133,
+ "grad_norm": 5.5,
+ "learning_rate": 1.851726934143048e-05,
+ "loss": 1.6397,
+ "step": 516
+ },
+ {
+ "epoch": 0.12467904699349137,
+ "grad_norm": 5.125,
+ "learning_rate": 1.8461931661275642e-05,
+ "loss": 1.7315,
+ "step": 522
+ },
+ {
+ "epoch": 0.12611213948766944,
+ "grad_norm": 6.25,
+ "learning_rate": 1.8405666034956842e-05,
+ "loss": 1.7201,
+ "step": 528
+ },
+ {
+ "epoch": 0.1275452319818475,
+ "grad_norm": 8.9375,
+ "learning_rate": 1.8348478632634067e-05,
+ "loss": 1.6047,
+ "step": 534
+ },
+ {
+ "epoch": 0.12897832447602556,
+ "grad_norm": 46.25,
+ "learning_rate": 1.8290375725550417e-05,
+ "loss": 1.6949,
+ "step": 540
+ },
+ {
+ "epoch": 0.13041141697020361,
+ "grad_norm": 5.9375,
+ "learning_rate": 1.8231363685344422e-05,
+ "loss": 1.7245,
+ "step": 546
+ },
+ {
+ "epoch": 0.13184450946438167,
+ "grad_norm": 5.78125,
+ "learning_rate": 1.8171448983351284e-05,
+ "loss": 1.641,
+ "step": 552
+ },
+ {
+ "epoch": 0.13327760195855975,
+ "grad_norm": 24.125,
+ "learning_rate": 1.8110638189893267e-05,
+ "loss": 1.6125,
+ "step": 558
+ },
+ {
+ "epoch": 0.1347106944527378,
+ "grad_norm": 6.4375,
+ "learning_rate": 1.804893797355914e-05,
+ "loss": 1.6647,
+ "step": 564
+ },
+ {
+ "epoch": 0.13614378694691587,
+ "grad_norm": 6.34375,
+ "learning_rate": 1.798635510047293e-05,
+ "loss": 1.7073,
+ "step": 570
+ },
+ {
+ "epoch": 0.13757687944109392,
+ "grad_norm": 6.1875,
+ "learning_rate": 1.792289643355191e-05,
+ "loss": 1.6271,
+ "step": 576
+ },
+ {
+ "epoch": 0.13900997193527198,
+ "grad_norm": 5.0625,
+ "learning_rate": 1.785856893175402e-05,
+ "loss": 1.6317,
+ "step": 582
+ },
+ {
+ "epoch": 0.14044306442945004,
+ "grad_norm": 4.6875,
+ "learning_rate": 1.7793379649314743e-05,
+ "loss": 1.6578,
+ "step": 588
+ },
+ {
+ "epoch": 0.14187615692362812,
+ "grad_norm": 4.84375,
+ "learning_rate": 1.7727335734973512e-05,
+ "loss": 1.6554,
+ "step": 594
+ },
+ {
+ "epoch": 0.14330924941780618,
+ "grad_norm": 6.1875,
+ "learning_rate": 1.766044443118978e-05,
+ "loss": 1.5523,
+ "step": 600
+ },
+ {
+ "epoch": 0.14474234191198423,
+ "grad_norm": 23.375,
+ "learning_rate": 1.759271307334881e-05,
+ "loss": 1.616,
+ "step": 606
+ },
+ {
+ "epoch": 0.1461754344061623,
+ "grad_norm": 6.9375,
+ "learning_rate": 1.7524149088957244e-05,
+ "loss": 1.7729,
+ "step": 612
+ },
+ {
+ "epoch": 0.14760852690034035,
+ "grad_norm": 10.25,
+ "learning_rate": 1.7454759996828622e-05,
+ "loss": 1.5922,
+ "step": 618
+ },
+ {
+ "epoch": 0.14904161939451843,
+ "grad_norm": 7.21875,
+ "learning_rate": 1.7384553406258842e-05,
+ "loss": 1.583,
+ "step": 624
+ },
+ {
+ "epoch": 0.1504747118886965,
+ "grad_norm": 6.9375,
+ "learning_rate": 1.7313537016191706e-05,
+ "loss": 1.6019,
+ "step": 630
+ },
+ {
+ "epoch": 0.15190780438287455,
+ "grad_norm": 11.5,
+ "learning_rate": 1.7241718614374678e-05,
+ "loss": 1.6195,
+ "step": 636
+ },
+ {
+ "epoch": 0.1533408968770526,
+ "grad_norm": 5.5,
+ "learning_rate": 1.716910607650483e-05,
+ "loss": 1.5012,
+ "step": 642
+ },
+ {
+ "epoch": 0.15477398937123066,
+ "grad_norm": 6.71875,
+ "learning_rate": 1.709570736536521e-05,
+ "loss": 1.7686,
+ "step": 648
+ },
+ {
+ "epoch": 0.15620708186540874,
+ "grad_norm": 5.71875,
+ "learning_rate": 1.7021530529951627e-05,
+ "loss": 1.7922,
+ "step": 654
+ },
+ {
+ "epoch": 0.1576401743595868,
+ "grad_norm": 7.8125,
+ "learning_rate": 1.6946583704589973e-05,
+ "loss": 1.623,
+ "step": 660
+ },
+ {
+ "epoch": 0.15907326685376486,
+ "grad_norm": 6.34375,
+ "learning_rate": 1.6870875108044233e-05,
+ "loss": 1.6039,
+ "step": 666
+ },
+ {
+ "epoch": 0.1605063593479429,
+ "grad_norm": 6.46875,
+ "learning_rate": 1.6794413042615168e-05,
+ "loss": 1.6392,
+ "step": 672
+ },
+ {
+ "epoch": 0.16193945184212097,
+ "grad_norm": 5.4375,
+ "learning_rate": 1.6717205893229904e-05,
+ "loss": 1.5683,
+ "step": 678
+ },
+ {
+ "epoch": 0.16337254433629902,
+ "grad_norm": 4.78125,
+ "learning_rate": 1.6639262126522417e-05,
+ "loss": 1.6165,
+ "step": 684
+ },
+ {
+ "epoch": 0.1648056368304771,
+ "grad_norm": 5.4375,
+ "learning_rate": 1.6560590289905074e-05,
+ "loss": 1.5341,
+ "step": 690
+ },
+ {
+ "epoch": 0.16623872932465517,
+ "grad_norm": 5.25,
+ "learning_rate": 1.6481199010631312e-05,
+ "loss": 1.6573,
+ "step": 696
+ },
+ {
+ "epoch": 0.16767182181883322,
+ "grad_norm": 5.21875,
+ "learning_rate": 1.6401096994849558e-05,
+ "loss": 1.5056,
+ "step": 702
+ },
+ {
+ "epoch": 0.16910491431301128,
+ "grad_norm": 12.625,
+ "learning_rate": 1.632029302664851e-05,
+ "loss": 1.5337,
+ "step": 708
+ },
+ {
+ "epoch": 0.17053800680718934,
+ "grad_norm": 4.28125,
+ "learning_rate": 1.6238795967093865e-05,
+ "loss": 1.5038,
+ "step": 714
+ },
+ {
+ "epoch": 0.17197109930136742,
+ "grad_norm": 6.96875,
+ "learning_rate": 1.6156614753256583e-05,
+ "loss": 1.5587,
+ "step": 720
+ },
+ {
+ "epoch": 0.17340419179554548,
+ "grad_norm": 4.90625,
+ "learning_rate": 1.607375839723287e-05,
+ "loss": 1.563,
+ "step": 726
+ },
+ {
+ "epoch": 0.17483728428972353,
+ "grad_norm": 5.34375,
+ "learning_rate": 1.599023598515586e-05,
+ "loss": 1.6058,
+ "step": 732
+ },
+ {
+ "epoch": 0.1762703767839016,
+ "grad_norm": 5.25,
+ "learning_rate": 1.5906056676199256e-05,
+ "loss": 1.7244,
+ "step": 738
+ },
+ {
+ "epoch": 0.17770346927807965,
+ "grad_norm": 4.5,
+ "learning_rate": 1.5821229701572897e-05,
+ "loss": 1.6587,
+ "step": 744
+ },
+ {
+ "epoch": 0.17913656177225773,
+ "grad_norm": 12.75,
+ "learning_rate": 1.573576436351046e-05,
+ "loss": 1.6018,
+ "step": 750
+ },
+ {
+ "epoch": 0.1805696542664358,
+ "grad_norm": 6.0,
+ "learning_rate": 1.564967003424938e-05,
+ "loss": 1.6205,
+ "step": 756
+ },
+ {
+ "epoch": 0.18200274676061384,
+ "grad_norm": 5.59375,
+ "learning_rate": 1.556295615500305e-05,
+ "loss": 1.6345,
+ "step": 762
+ },
+ {
+ "epoch": 0.1834358392547919,
+ "grad_norm": 4.59375,
+ "learning_rate": 1.5475632234925505e-05,
+ "loss": 1.5226,
+ "step": 768
+ },
+ {
+ "epoch": 0.18486893174896996,
+ "grad_norm": 4.78125,
+ "learning_rate": 1.5387707850068633e-05,
+ "loss": 1.6488,
+ "step": 774
+ },
+ {
+ "epoch": 0.18630202424314804,
+ "grad_norm": 4.28125,
+ "learning_rate": 1.529919264233205e-05,
+ "loss": 1.5393,
+ "step": 780
+ },
+ {
+ "epoch": 0.1877351167373261,
+ "grad_norm": 7.625,
+ "learning_rate": 1.5210096318405768e-05,
+ "loss": 1.5374,
+ "step": 786
+ },
+ {
+ "epoch": 0.18916820923150415,
+ "grad_norm": 4.21875,
+ "learning_rate": 1.5120428648705716e-05,
+ "loss": 1.4963,
+ "step": 792
+ },
+ {
+ "epoch": 0.1906013017256822,
+ "grad_norm": 4.25,
+ "learning_rate": 1.5030199466302354e-05,
+ "loss": 1.5828,
+ "step": 798
+ },
+ {
+ "epoch": 0.19107899922374155,
+ "eval_loss": 1.3809266090393066,
+ "eval_runtime": 223.0505,
+ "eval_samples_per_second": 3.811,
+ "eval_steps_per_second": 3.811,
+ "step": 800
+ },
+ {
+ "epoch": 0.19203439421986027,
+ "grad_norm": 6.21875,
+ "learning_rate": 1.493941866584231e-05,
+ "loss": 1.5799,
+ "step": 804
+ },
+ {
+ "epoch": 0.19346748671403832,
+ "grad_norm": 8.5,
+ "learning_rate": 1.4848096202463373e-05,
+ "loss": 1.6519,
+ "step": 810
+ },
+ {
+ "epoch": 0.1949005792082164,
+ "grad_norm": 4.59375,
+ "learning_rate": 1.4756242090702756e-05,
+ "loss": 1.5897,
+ "step": 816
+ },
+ {
+ "epoch": 0.19633367170239446,
+ "grad_norm": 5.75,
+ "learning_rate": 1.4663866403398915e-05,
+ "loss": 1.6454,
+ "step": 822
+ },
+ {
+ "epoch": 0.19776676419657252,
+ "grad_norm": 4.1875,
+ "learning_rate": 1.4570979270586944e-05,
+ "loss": 1.5361,
+ "step": 828
+ },
+ {
+ "epoch": 0.19919985669075058,
+ "grad_norm": 5.375,
+ "learning_rate": 1.4477590878387697e-05,
+ "loss": 1.5086,
+ "step": 834
+ },
+ {
+ "epoch": 0.20063294918492863,
+ "grad_norm": 4.375,
+ "learning_rate": 1.4383711467890776e-05,
+ "loss": 1.6474,
+ "step": 840
+ },
+ {
+ "epoch": 0.20206604167910672,
+ "grad_norm": 4.6875,
+ "learning_rate": 1.4289351334031461e-05,
+ "loss": 1.465,
+ "step": 846
+ },
+ {
+ "epoch": 0.20349913417328477,
+ "grad_norm": 8.6875,
+ "learning_rate": 1.4194520824461773e-05,
+ "loss": 1.5312,
+ "step": 852
+ },
+ {
+ "epoch": 0.20493222666746283,
+ "grad_norm": 5.53125,
+ "learning_rate": 1.4099230338415728e-05,
+ "loss": 1.4775,
+ "step": 858
+ },
+ {
+ "epoch": 0.2063653191616409,
+ "grad_norm": 9.8125,
+ "learning_rate": 1.4003490325568953e-05,
+ "loss": 1.8343,
+ "step": 864
+ },
+ {
+ "epoch": 0.20779841165581894,
+ "grad_norm": 8.0625,
+ "learning_rate": 1.3907311284892737e-05,
+ "loss": 1.537,
+ "step": 870
+ },
+ {
+ "epoch": 0.20923150414999703,
+ "grad_norm": 6.3125,
+ "learning_rate": 1.3810703763502744e-05,
+ "loss": 1.7239,
+ "step": 876
+ },
+ {
+ "epoch": 0.21066459664417508,
+ "grad_norm": 5.75,
+ "learning_rate": 1.371367835550235e-05,
+ "loss": 1.5176,
+ "step": 882
+ },
+ {
+ "epoch": 0.21209768913835314,
+ "grad_norm": 4.65625,
+ "learning_rate": 1.3616245700820922e-05,
+ "loss": 1.641,
+ "step": 888
+ },
+ {
+ "epoch": 0.2135307816325312,
+ "grad_norm": 4.0625,
+ "learning_rate": 1.3518416484047018e-05,
+ "loss": 1.5882,
+ "step": 894
+ },
+ {
+ "epoch": 0.21496387412670925,
+ "grad_norm": 5.09375,
+ "learning_rate": 1.342020143325669e-05,
+ "loss": 1.6042,
+ "step": 900
+ },
+ {
+ "epoch": 0.2163969666208873,
+ "grad_norm": 5.84375,
+ "learning_rate": 1.3321611318837033e-05,
+ "loss": 1.5516,
+ "step": 906
+ },
+ {
+ "epoch": 0.2178300591150654,
+ "grad_norm": 6.15625,
+ "learning_rate": 1.3222656952305113e-05,
+ "loss": 1.5349,
+ "step": 912
+ },
+ {
+ "epoch": 0.21926315160924345,
+ "grad_norm": 5.21875,
+ "learning_rate": 1.3123349185122328e-05,
+ "loss": 1.6652,
+ "step": 918
+ },
+ {
+ "epoch": 0.2206962441034215,
+ "grad_norm": 17.25,
+ "learning_rate": 1.3023698907504447e-05,
+ "loss": 1.7149,
+ "step": 924
+ },
+ {
+ "epoch": 0.22212933659759956,
+ "grad_norm": 6.8125,
+ "learning_rate": 1.2923717047227368e-05,
+ "loss": 1.6285,
+ "step": 930
+ },
+ {
+ "epoch": 0.22356242909177762,
+ "grad_norm": 4.1875,
+ "learning_rate": 1.2823414568428767e-05,
+ "loss": 1.5982,
+ "step": 936
+ },
+ {
+ "epoch": 0.2249955215859557,
+ "grad_norm": 5.8125,
+ "learning_rate": 1.2722802470405744e-05,
+ "loss": 1.5901,
+ "step": 942
+ },
+ {
+ "epoch": 0.22642861408013376,
+ "grad_norm": 4.75,
+ "learning_rate": 1.2621891786408648e-05,
+ "loss": 1.5705,
+ "step": 948
+ },
+ {
+ "epoch": 0.22786170657431182,
+ "grad_norm": 10.1875,
+ "learning_rate": 1.252069358243114e-05,
+ "loss": 1.5263,
+ "step": 954
+ },
+ {
+ "epoch": 0.22929479906848987,
+ "grad_norm": 3.671875,
+ "learning_rate": 1.2419218955996677e-05,
+ "loss": 1.5622,
+ "step": 960
+ },
+ {
+ "epoch": 0.23072789156266793,
+ "grad_norm": 4.625,
+ "learning_rate": 1.2317479034941572e-05,
+ "loss": 1.5984,
+ "step": 966
+ },
+ {
+ "epoch": 0.23216098405684601,
+ "grad_norm": 7.21875,
+ "learning_rate": 1.2215484976194675e-05,
+ "loss": 1.6465,
+ "step": 972
+ },
+ {
+ "epoch": 0.23359407655102407,
+ "grad_norm": 6.59375,
+ "learning_rate": 1.211324796455389e-05,
+ "loss": 1.705,
+ "step": 978
+ },
+ {
+ "epoch": 0.23502716904520213,
+ "grad_norm": 5.96875,
+ "learning_rate": 1.2010779211459649e-05,
+ "loss": 1.5316,
+ "step": 984
+ },
+ {
+ "epoch": 0.23646026153938018,
+ "grad_norm": 5.3125,
+ "learning_rate": 1.190808995376545e-05,
+ "loss": 1.4676,
+ "step": 990
+ },
+ {
+ "epoch": 0.23789335403355824,
+ "grad_norm": 4.9375,
+ "learning_rate": 1.1805191452505602e-05,
+ "loss": 1.5319,
+ "step": 996
+ },
+ {
+ "epoch": 0.2393264465277363,
+ "grad_norm": 5.625,
+ "learning_rate": 1.1702094991660326e-05,
+ "loss": 1.6112,
+ "step": 1002
+ },
+ {
+ "epoch": 0.24075953902191438,
+ "grad_norm": 4.71875,
+ "learning_rate": 1.159881187691835e-05,
+ "loss": 1.6341,
+ "step": 1008
+ },
+ {
+ "epoch": 0.24219263151609244,
+ "grad_norm": 4.3125,
+ "learning_rate": 1.1495353434437098e-05,
+ "loss": 1.4623,
+ "step": 1014
+ },
+ {
+ "epoch": 0.2436257240102705,
+ "grad_norm": 19.625,
+ "learning_rate": 1.1391731009600655e-05,
+ "loss": 1.4166,
+ "step": 1020
+ },
+ {
+ "epoch": 0.24505881650444855,
+ "grad_norm": 4.0625,
+ "learning_rate": 1.128795596577563e-05,
+ "loss": 1.5813,
+ "step": 1026
+ },
+ {
+ "epoch": 0.2464919089986266,
+ "grad_norm": 6.25,
+ "learning_rate": 1.1184039683065014e-05,
+ "loss": 1.5772,
+ "step": 1032
+ },
+ {
+ "epoch": 0.2479250014928047,
+ "grad_norm": 5.53125,
+ "learning_rate": 1.1079993557060228e-05,
+ "loss": 1.401,
+ "step": 1038
+ },
+ {
+ "epoch": 0.24935809398698275,
+ "grad_norm": 6.65625,
+ "learning_rate": 1.0975828997591496e-05,
+ "loss": 1.6248,
+ "step": 1044
+ },
+ {
+ "epoch": 0.2507911864811608,
+ "grad_norm": 856.0,
+ "learning_rate": 1.0871557427476585e-05,
+ "loss": 1.775,
+ "step": 1050
+ },
+ {
+ "epoch": 0.2522242789753389,
+ "grad_norm": 4.1875,
+ "learning_rate": 1.0767190281268187e-05,
+ "loss": 1.586,
+ "step": 1056
+ },
+ {
+ "epoch": 0.25365737146951695,
+ "grad_norm": 3.53125,
+ "learning_rate": 1.0662739004000005e-05,
+ "loss": 1.5397,
+ "step": 1062
+ },
+ {
+ "epoch": 0.255090463963695,
+ "grad_norm": 4.125,
+ "learning_rate": 1.055821504993164e-05,
+ "loss": 1.8712,
+ "step": 1068
+ },
+ {
+ "epoch": 0.25652355645787306,
+ "grad_norm": 5.1875,
+ "learning_rate": 1.0453629881292537e-05,
+ "loss": 1.5357,
+ "step": 1074
+ },
+ {
+ "epoch": 0.2579566489520511,
+ "grad_norm": 3.921875,
+ "learning_rate": 1.0348994967025012e-05,
+ "loss": 1.4033,
+ "step": 1080
+ },
+ {
+ "epoch": 0.25938974144622917,
+ "grad_norm": 5.3125,
+ "learning_rate": 1.0244321781526533e-05,
+ "loss": 1.5611,
+ "step": 1086
+ },
+ {
+ "epoch": 0.26082283394040723,
+ "grad_norm": 4.8125,
+ "learning_rate": 1.0139621803391454e-05,
+ "loss": 1.577,
+ "step": 1092
+ },
+ {
+ "epoch": 0.2622559264345853,
+ "grad_norm": 5.46875,
+ "learning_rate": 1.0034906514152239e-05,
+ "loss": 1.5149,
+ "step": 1098
+ },
+ {
+ "epoch": 0.26368901892876334,
+ "grad_norm": 6.4375,
+ "learning_rate": 9.930187397020385e-06,
+ "loss": 1.5796,
+ "step": 1104
+ },
+ {
+ "epoch": 0.2651221114229414,
+ "grad_norm": 4.28125,
+ "learning_rate": 9.825475935627165e-06,
+ "loss": 1.5702,
+ "step": 1110
+ },
+ {
+ "epoch": 0.2665552039171195,
+ "grad_norm": 5.34375,
+ "learning_rate": 9.720783612764314e-06,
+ "loss": 1.5354,
+ "step": 1116
+ },
+ {
+ "epoch": 0.26798829641129757,
+ "grad_norm": 4.375,
+ "learning_rate": 9.616121909124801e-06,
+ "loss": 1.4122,
+ "step": 1122
+ },
+ {
+ "epoch": 0.2694213889054756,
+ "grad_norm": 5.46875,
+ "learning_rate": 9.511502302043867e-06,
+ "loss": 1.6959,
+ "step": 1128
+ },
+ {
+ "epoch": 0.2708544813996537,
+ "grad_norm": 8.4375,
+ "learning_rate": 9.406936264240386e-06,
+ "loss": 1.5493,
+ "step": 1134
+ },
+ {
+ "epoch": 0.27228757389383174,
+ "grad_norm": 5.46875,
+ "learning_rate": 9.302435262558748e-06,
+ "loss": 1.4156,
+ "step": 1140
+ },
+ {
+ "epoch": 0.2737206663880098,
+ "grad_norm": 720.0,
+ "learning_rate": 9.198010756711413e-06,
+ "loss": 1.567,
+ "step": 1146
+ },
+ {
+ "epoch": 0.27515375888218785,
+ "grad_norm": 3.875,
+ "learning_rate": 9.093674198022201e-06,
+ "loss": 1.3814,
+ "step": 1152
+ },
+ {
+ "epoch": 0.2765868513763659,
+ "grad_norm": 3.671875,
+ "learning_rate": 8.989437028170537e-06,
+ "loss": 1.4261,
+ "step": 1158
+ },
+ {
+ "epoch": 0.27801994387054396,
+ "grad_norm": 10.375,
+ "learning_rate": 8.885310677936746e-06,
+ "loss": 1.506,
+ "step": 1164
+ },
+ {
+ "epoch": 0.279453036364722,
+ "grad_norm": 3.46875,
+ "learning_rate": 8.781306565948528e-06,
+ "loss": 1.3967,
+ "step": 1170
+ },
+ {
+ "epoch": 0.2808861288589001,
+ "grad_norm": 3.984375,
+ "learning_rate": 8.677436097428775e-06,
+ "loss": 1.5761,
+ "step": 1176
+ },
+ {
+ "epoch": 0.2823192213530782,
+ "grad_norm": 3.484375,
+ "learning_rate": 8.573710662944884e-06,
+ "loss": 1.5428,
+ "step": 1182
+ },
+ {
+ "epoch": 0.28375231384725624,
+ "grad_norm": 6.25,
+ "learning_rate": 8.47014163715962e-06,
+ "loss": 1.5426,
+ "step": 1188
+ },
+ {
+ "epoch": 0.2851854063414343,
+ "grad_norm": 6.25,
+ "learning_rate": 8.366740377583781e-06,
+ "loss": 1.503,
+ "step": 1194
+ },
+ {
+ "epoch": 0.28661849883561236,
+ "grad_norm": 3.828125,
+ "learning_rate": 8.263518223330698e-06,
+ "loss": 1.4355,
+ "step": 1200
+ },
+ {
+ "epoch": 0.28661849883561236,
+ "eval_loss": 1.315157413482666,
+ "eval_runtime": 223.8181,
+ "eval_samples_per_second": 3.798,
+ "eval_steps_per_second": 3.798,
+ "step": 1200
+ },
+ {
+ "epoch": 0.2880515913297904,
+ "grad_norm": 5.625,
+ "learning_rate": 8.1604864938728e-06,
+ "loss": 1.4389,
+ "step": 1206
+ },
+ {
+ "epoch": 0.28948468382396847,
+ "grad_norm": 5.0625,
+ "learning_rate": 8.057656487800283e-06,
+ "loss": 1.5346,
+ "step": 1212
+ },
+ {
+ "epoch": 0.2909177763181465,
+ "grad_norm": 4.21875,
+ "learning_rate": 7.955039481582098e-06,
+ "loss": 1.4492,
+ "step": 1218
+ },
+ {
+ "epoch": 0.2923508688123246,
1466
+ "grad_norm": 4.9375,
1467
+ "learning_rate": 7.852646728329368e-06,
1468
+ "loss": 1.4305,
1469
+ "step": 1224
1470
+ },
1471
+ {
1472
+ "epoch": 0.29378396130650264,
1473
+ "grad_norm": 4.9375,
1474
+ "learning_rate": 7.750489456561351e-06,
1475
+ "loss": 1.607,
1476
+ "step": 1230
1477
+ },
1478
+ {
1479
+ "epoch": 0.2952170538006807,
1480
+ "grad_norm": 4.90625,
1481
+ "learning_rate": 7.6485788689741e-06,
1482
+ "loss": 1.3777,
1483
+ "step": 1236
1484
+ },
1485
+ {
1486
+ "epoch": 0.2966501462948588,
1487
+ "grad_norm": 5.875,
1488
+ "learning_rate": 7.546926141211975e-06,
1489
+ "loss": 1.5751,
1490
+ "step": 1242
1491
+ },
1492
+ {
1493
+ "epoch": 0.29808323878903686,
1494
+ "grad_norm": 4.8125,
1495
+ "learning_rate": 7.445542420642097e-06,
1496
+ "loss": 1.5106,
1497
+ "step": 1248
1498
+ },
1499
+ {
1500
+ "epoch": 0.2995163312832149,
1501
+ "grad_norm": 4.875,
1502
+ "learning_rate": 7.344438825131912e-06,
1503
+ "loss": 1.5982,
1504
+ "step": 1254
1505
+ },
1506
+ {
1507
+ "epoch": 0.300949423777393,
1508
+ "grad_norm": 5.09375,
1509
+ "learning_rate": 7.243626441830009e-06,
1510
+ "loss": 1.5328,
1511
+ "step": 1260
1512
+ },
1513
+ {
1514
+ "epoch": 0.30238251627157103,
1515
+ "grad_norm": 4.09375,
1516
+ "learning_rate": 7.143116325950266e-06,
1517
+ "loss": 1.6138,
1518
+ "step": 1266
1519
+ },
1520
+ {
1521
+ "epoch": 0.3038156087657491,
1522
+ "grad_norm": 3.8125,
1523
+ "learning_rate": 7.042919499559538e-06,
1524
+ "loss": 1.4547,
1525
+ "step": 1272
1526
+ },
1527
+ {
1528
+ "epoch": 0.30524870125992715,
1529
+ "grad_norm": 4.1875,
1530
+ "learning_rate": 6.943046950368944e-06,
1531
+ "loss": 1.4393,
1532
+ "step": 1278
1533
+ },
1534
+ {
1535
+ "epoch": 0.3066817937541052,
1536
+ "grad_norm": 5.34375,
1537
+ "learning_rate": 6.843509630528977e-06,
1538
+ "loss": 1.4009,
1539
+ "step": 1284
1540
+ },
1541
+ {
1542
+ "epoch": 0.30811488624828326,
1543
+ "grad_norm": 5.125,
1544
+ "learning_rate": 6.744318455428436e-06,
1545
+ "loss": 1.5134,
1546
+ "step": 1290
1547
+ },
1548
+ {
1549
+ "epoch": 0.3095479787424613,
1550
+ "grad_norm": 4.96875,
1551
+ "learning_rate": 6.645484302497452e-06,
1552
+ "loss": 1.5411,
1553
+ "step": 1296
1554
+ },
1555
+ {
1556
+ "epoch": 0.3109810712366394,
1557
+ "grad_norm": 4.9375,
1558
+ "learning_rate": 6.547018010014654e-06,
1559
+ "loss": 1.5058,
1560
+ "step": 1302
1561
+ },
1562
+ {
1563
+ "epoch": 0.3124141637308175,
1564
+ "grad_norm": 3.59375,
1565
+ "learning_rate": 6.448930375918632e-06,
1566
+ "loss": 1.4026,
1567
+ "step": 1308
1568
+ },
1569
+ {
1570
+ "epoch": 0.31384725622499554,
1571
+ "grad_norm": 4.78125,
1572
+ "learning_rate": 6.351232156623803e-06,
1573
+ "loss": 1.3993,
1574
+ "step": 1314
1575
+ },
1576
+ {
1577
+ "epoch": 0.3152803487191736,
1578
+ "grad_norm": 4.21875,
1579
+ "learning_rate": 6.25393406584088e-06,
1580
+ "loss": 1.6574,
1581
+ "step": 1320
1582
+ },
1583
+ {
1584
+ "epoch": 0.31671344121335165,
1585
+ "grad_norm": 4.40625,
1586
+ "learning_rate": 6.157046773401964e-06,
1587
+ "loss": 1.5233,
1588
+ "step": 1326
1589
+ },
1590
+ {
1591
+ "epoch": 0.3181465337075297,
1592
+ "grad_norm": 5.25,
1593
+ "learning_rate": 6.06058090409049e-06,
1594
+ "loss": 1.5095,
1595
+ "step": 1332
1596
+ },
1597
+ {
1598
+ "epoch": 0.31957962620170777,
1599
+ "grad_norm": 4.625,
1600
+ "learning_rate": 5.9645470364761e-06,
1601
+ "loss": 1.3797,
1602
+ "step": 1338
1603
+ },
1604
+ {
1605
+ "epoch": 0.3210127186958858,
1606
+ "grad_norm": 5.84375,
1607
+ "learning_rate": 5.868955701754584e-06,
1608
+ "loss": 1.6089,
1609
+ "step": 1344
1610
+ },
1611
+ {
1612
+ "epoch": 0.3224458111900639,
1613
+ "grad_norm": 3.71875,
1614
+ "learning_rate": 5.773817382593008e-06,
1615
+ "loss": 1.4297,
1616
+ "step": 1350
1617
+ },
1618
+ {
1619
+ "epoch": 0.32387890368424194,
1620
+ "grad_norm": 3.578125,
1621
+ "learning_rate": 5.679142511980176e-06,
1622
+ "loss": 1.327,
1623
+ "step": 1356
1624
+ },
1625
+ {
1626
+ "epoch": 0.32531199617842,
1627
+ "grad_norm": 4.6875,
1628
+ "learning_rate": 5.584941472082549e-06,
1629
+ "loss": 1.4878,
1630
+ "step": 1362
1631
+ },
1632
+ {
1633
+ "epoch": 0.32674508867259805,
1634
+ "grad_norm": 5.125,
1635
+ "learning_rate": 5.491224593105695e-06,
1636
+ "loss": 1.4593,
1637
+ "step": 1368
1638
+ },
1639
+ {
1640
+ "epoch": 0.32817818116677616,
1641
+ "grad_norm": 7.1875,
1642
+ "learning_rate": 5.398002152161484e-06,
1643
+ "loss": 1.5287,
1644
+ "step": 1374
1645
+ },
1646
+ {
1647
+ "epoch": 0.3296112736609542,
1648
+ "grad_norm": 5.71875,
1649
+ "learning_rate": 5.305284372141095e-06,
1650
+ "loss": 1.4808,
1651
+ "step": 1380
1652
+ },
1653
+ {
1654
+ "epoch": 0.3310443661551323,
1655
+ "grad_norm": 4.09375,
1656
+ "learning_rate": 5.213081420593933e-06,
1657
+ "loss": 1.4244,
1658
+ "step": 1386
1659
+ },
1660
+ {
1661
+ "epoch": 0.33247745864931033,
1662
+ "grad_norm": 9.5,
1663
+ "learning_rate": 5.121403408612672e-06,
1664
+ "loss": 1.5213,
1665
+ "step": 1392
1666
+ },
1667
+ {
1668
+ "epoch": 0.3339105511434884,
1669
+ "grad_norm": 5.09375,
1670
+ "learning_rate": 5.030260389724447e-06,
1671
+ "loss": 1.4455,
1672
+ "step": 1398
1673
+ },
1674
+ {
1675
+ "epoch": 0.33534364363766644,
1676
+ "grad_norm": 6.6875,
1677
+ "learning_rate": 4.939662358788364e-06,
1678
+ "loss": 1.5983,
1679
+ "step": 1404
1680
+ },
1681
+ {
1682
+ "epoch": 0.3367767361318445,
1683
+ "grad_norm": 4.96875,
1684
+ "learning_rate": 4.849619250899458e-06,
1685
+ "loss": 1.3544,
1686
+ "step": 1410
1687
+ },
1688
+ {
1689
+ "epoch": 0.33820982862602256,
1690
+ "grad_norm": 4.65625,
1691
+ "learning_rate": 4.76014094029921e-06,
1692
+ "loss": 1.4412,
1693
+ "step": 1416
1694
+ },
1695
+ {
1696
+ "epoch": 0.3396429211202006,
1697
+ "grad_norm": 6.40625,
1698
+ "learning_rate": 4.671237239292699e-06,
1699
+ "loss": 1.4463,
1700
+ "step": 1422
1701
+ },
1702
+ {
1703
+ "epoch": 0.34107601361437867,
1704
+ "grad_norm": 5.25,
1705
+ "learning_rate": 4.582917897172603e-06,
1706
+ "loss": 1.5306,
1707
+ "step": 1428
1708
+ },
1709
+ {
1710
+ "epoch": 0.3425091061085568,
1711
+ "grad_norm": 4.40625,
1712
+ "learning_rate": 4.495192599150045e-06,
1713
+ "loss": 1.5532,
1714
+ "step": 1434
1715
+ },
1716
+ {
1717
+ "epoch": 0.34394219860273484,
1718
+ "grad_norm": 5.15625,
1719
+ "learning_rate": 4.408070965292534e-06,
1720
+ "loss": 1.4818,
1721
+ "step": 1440
1722
+ },
1723
+ {
1724
+ "epoch": 0.3453752910969129,
1725
+ "grad_norm": 4.125,
1726
+ "learning_rate": 4.321562549468991e-06,
1727
+ "loss": 1.4144,
1728
+ "step": 1446
1729
+ },
1730
+ {
1731
+ "epoch": 0.34680838359109095,
1732
+ "grad_norm": 4.28125,
1733
+ "learning_rate": 4.235676838302069e-06,
1734
+ "loss": 1.4173,
1735
+ "step": 1452
1736
+ },
1737
+ {
1738
+ "epoch": 0.348241476085269,
1739
+ "grad_norm": 8.5,
1740
+ "learning_rate": 4.150423250127846e-06,
1741
+ "loss": 1.4121,
1742
+ "step": 1458
1743
+ },
1744
+ {
1745
+ "epoch": 0.34967456857944706,
1746
+ "grad_norm": 5.90625,
1747
+ "learning_rate": 4.065811133962987e-06,
1748
+ "loss": 1.4121,
1749
+ "step": 1464
1750
+ },
1751
+ {
1752
+ "epoch": 0.3511076610736251,
1753
+ "grad_norm": 4.625,
1754
+ "learning_rate": 3.981849768479516e-06,
1755
+ "loss": 1.3973,
1756
+ "step": 1470
1757
+ },
1758
+ {
1759
+ "epoch": 0.3525407535678032,
1760
+ "grad_norm": 5.1875,
1761
+ "learning_rate": 3.898548360987325e-06,
1762
+ "loss": 1.4554,
1763
+ "step": 1476
1764
+ },
1765
+ {
1766
+ "epoch": 0.35397384606198123,
1767
+ "grad_norm": 5.40625,
1768
+ "learning_rate": 3.81591604642446e-06,
1769
+ "loss": 1.4958,
1770
+ "step": 1482
1771
+ },
1772
+ {
1773
+ "epoch": 0.3554069385561593,
1774
+ "grad_norm": 5.28125,
1775
+ "learning_rate": 3.7339618863553983e-06,
1776
+ "loss": 1.4843,
1777
+ "step": 1488
1778
+ },
1779
+ {
1780
+ "epoch": 0.35684003105033735,
1781
+ "grad_norm": 5.96875,
1782
+ "learning_rate": 3.6526948679773256e-06,
1783
+ "loss": 1.6051,
1784
+ "step": 1494
1785
+ },
1786
+ {
1787
+ "epoch": 0.35827312354451546,
1788
+ "grad_norm": 3.6875,
1789
+ "learning_rate": 3.5721239031346067e-06,
1790
+ "loss": 1.4176,
1791
+ "step": 1500
1792
+ },
1793
+ {
1794
+ "epoch": 0.3597062160386935,
1795
+ "grad_norm": 4.375,
1796
+ "learning_rate": 3.492257827341492e-06,
1797
+ "loss": 1.4049,
1798
+ "step": 1506
1799
+ },
1800
+ {
1801
+ "epoch": 0.3611393085328716,
1802
+ "grad_norm": 3.71875,
1803
+ "learning_rate": 3.4131053988131947e-06,
1804
+ "loss": 1.5823,
1805
+ "step": 1512
1806
+ },
1807
+ {
1808
+ "epoch": 0.36257240102704963,
1809
+ "grad_norm": 6.0,
1810
+ "learning_rate": 3.3346752975054763e-06,
1811
+ "loss": 1.4469,
1812
+ "step": 1518
1813
+ },
1814
+ {
1815
+ "epoch": 0.3640054935212277,
1816
+ "grad_norm": 4.21875,
1817
+ "learning_rate": 3.2569761241627694e-06,
1818
+ "loss": 1.4373,
1819
+ "step": 1524
1820
+ },
1821
+ {
1822
+ "epoch": 0.36543858601540574,
1823
+ "grad_norm": 6.03125,
1824
+ "learning_rate": 3.1800163993750166e-06,
1825
+ "loss": 1.4823,
1826
+ "step": 1530
1827
+ },
1828
+ {
1829
+ "epoch": 0.3668716785095838,
1830
+ "grad_norm": 4.625,
1831
+ "learning_rate": 3.103804562643302e-06,
1832
+ "loss": 1.4585,
1833
+ "step": 1536
1834
+ },
1835
+ {
1836
+ "epoch": 0.36830477100376185,
1837
+ "grad_norm": 4.28125,
1838
+ "learning_rate": 3.028348971454356e-06,
1839
+ "loss": 1.4233,
1840
+ "step": 1542
1841
+ },
1842
+ {
1843
+ "epoch": 0.3697378634979399,
1844
+ "grad_norm": 14.625,
1845
+ "learning_rate": 2.953657900364053e-06,
1846
+ "loss": 1.4869,
1847
+ "step": 1548
1848
+ },
1849
+ {
1850
+ "epoch": 0.37117095599211797,
1851
+ "grad_norm": 4.1875,
1852
+ "learning_rate": 2.8797395400900362e-06,
1853
+ "loss": 1.5315,
1854
+ "step": 1554
1855
+ },
1856
+ {
1857
+ "epoch": 0.3726040484862961,
1858
+ "grad_norm": 4.125,
1859
+ "learning_rate": 2.8066019966134907e-06,
1860
+ "loss": 1.4887,
1861
+ "step": 1560
1862
+ },
1863
+ {
1864
+ "epoch": 0.37403714098047414,
1865
+ "grad_norm": 3.796875,
1866
+ "learning_rate": 2.7342532902902418e-06,
1867
+ "loss": 1.4533,
1868
+ "step": 1566
1869
+ },
1870
+ {
1871
+ "epoch": 0.3754702334746522,
1872
+ "grad_norm": 4.03125,
1873
+ "learning_rate": 2.6627013549712355e-06,
1874
+ "loss": 1.4017,
1875
+ "step": 1572
1876
+ },
1877
+ {
1878
+ "epoch": 0.37690332596883025,
1879
+ "grad_norm": 6.84375,
1880
+ "learning_rate": 2.5919540371325005e-06,
1881
+ "loss": 1.3971,
1882
+ "step": 1578
1883
+ },
1884
+ {
1885
+ "epoch": 0.3783364184630083,
1886
+ "grad_norm": 5.5625,
1887
+ "learning_rate": 2.522019095014683e-06,
1888
+ "loss": 1.5576,
1889
+ "step": 1584
1890
+ },
1891
+ {
1892
+ "epoch": 0.37976951095718636,
1893
+ "grad_norm": 10.875,
1894
+ "learning_rate": 2.45290419777228e-06,
1895
+ "loss": 1.4719,
1896
+ "step": 1590
1897
+ },
1898
+ {
1899
+ "epoch": 0.3812026034513644,
1900
+ "grad_norm": 5.15625,
1901
+ "learning_rate": 2.3846169246326345e-06,
1902
+ "loss": 1.4618,
1903
+ "step": 1596
1904
+ },
1905
+ {
1906
+ "epoch": 0.3821579984474831,
1907
+ "eval_loss": 1.2876688241958618,
1908
+ "eval_runtime": 226.2654,
1909
+ "eval_samples_per_second": 3.757,
1910
+ "eval_steps_per_second": 3.757,
1911
+ "step": 1600
1912
+ }
1913
+ ],
1914
+ "logging_steps": 6,
1915
+ "max_steps": 2000,
1916
+ "num_input_tokens_seen": 0,
1917
+ "num_train_epochs": 1,
1918
+ "save_steps": 400,
1919
+ "total_flos": 2.9553261973639004e+18,
1920
+ "train_batch_size": 1,
1921
+ "trial_name": null,
1922
+ "trial_params": null
1923
+ }
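
The file above is standard Hugging Face Trainer output: a log_history array of per-step training entries (epoch, grad_norm, learning_rate, loss, step) interleaved with periodic evaluation entries, followed by top-level run settings (logging every 6 steps, checkpoints every 400 steps, a 2000-step cap). A minimal sketch of how such a state file could be inspected with only the standard library; the local filename is assumed for illustration, while the log_history key is the one transformers' Trainer writes:

import json

# Load the checkpoint's trainer state (path assumed for illustration).
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss" instead.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged training steps: {len(train_logs)}")
for e in eval_logs:
    print(f"step {e['step']}: eval_loss={e['eval_loss']:.4f}")

Run against the state recorded here, the eval loop would report eval_loss 1.3152 at step 1200 and 1.2877 at step 1600, matching the entries above.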