loubnabnl HF staff commited on
Commit
dafc7d4
1 Parent(s): a3e8407

Model save

Browse files
README.md CHANGED
@@ -2,18 +2,12 @@
2
  license: apache-2.0
3
  base_model: HuggingFaceTB/SmolLM-360M
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
 
11
  - generated_from_trainer
12
  datasets:
13
- - HuggingFaceTB/Magpie-Pro-300K-Filtered-H4
14
- - HuggingFaceTB/self-oss-instruct-sc2-H4
15
- - HuggingFaceTB/OpenHermes-2.5-H4
16
- - HuggingFaceTB/instruct-data-basics-H4
17
  model-index:
18
  - name: smollm-350M-instruct-test2
19
  results: []
@@ -22,12 +16,12 @@ model-index:
22
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
23
  should probably proofread and complete it, then remove this comment. -->
24
 
25
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/loubnabnl/huggingface/runs/3q0xqph6)
26
  # smollm-350M-instruct-test2
27
 
28
- This model is a fine-tuned version of [HuggingFaceTB/SmolLM-360M](https://huggingface.co/HuggingFaceTB/SmolLM-360M) on the HuggingFaceTB/Magpie-Pro-300K-Filtered-H4, the HuggingFaceTB/self-oss-instruct-sc2-H4, the HuggingFaceTB/OpenHermes-2.5-H4 and the HuggingFaceTB/instruct-data-basics-H4 datasets.
29
  It achieves the following results on the evaluation set:
30
- - Loss: 1.2029
31
 
32
  ## Model description
33
 
@@ -64,7 +58,7 @@ The following hyperparameters were used during training:
64
 
65
  | Training Loss | Epoch | Step | Validation Loss |
66
  |:-------------:|:-----:|:----:|:---------------:|
67
- | 0.8399 | 1.0 | 816 | 1.2029 |
68
 
69
 
70
  ### Framework versions
 
2
  license: apache-2.0
3
  base_model: HuggingFaceTB/SmolLM-360M
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
+ - alignment-handbook
8
  - generated_from_trainer
9
  datasets:
10
+ - generator
 
 
 
11
  model-index:
12
  - name: smollm-350M-instruct-test2
13
  results: []
 
16
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
  should probably proofread and complete it, then remove this comment. -->
18
 
19
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/loubnabnl/huggingface/runs/0gd1f5vs)
20
  # smollm-350M-instruct-test2
21
 
22
+ This model is a fine-tuned version of [HuggingFaceTB/SmolLM-360M](https://huggingface.co/HuggingFaceTB/SmolLM-360M) on the generator dataset.
23
  It achieves the following results on the evaluation set:
24
+ - Loss: 1.2024
25
 
26
  ## Model description
27
 
 
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
+ | 0.8401 | 1.0 | 816 | 1.2024 |
62
 
63
 
64
  ### Framework versions
all_results.json CHANGED
@@ -6,9 +6,9 @@
6
  "eval_samples_per_second": 189.723,
7
  "eval_steps_per_second": 5.934,
8
  "total_flos": 80063181619200.0,
9
- "train_loss": 0.9145085595402063,
10
- "train_runtime": 1902.7506,
11
  "train_samples": 319078,
12
- "train_samples_per_second": 54.885,
13
- "train_steps_per_second": 0.429
14
  }
 
6
  "eval_samples_per_second": 189.723,
7
  "eval_steps_per_second": 5.934,
8
  "total_flos": 80063181619200.0,
9
+ "train_loss": 0.914715180794398,
10
+ "train_runtime": 1874.4496,
11
  "train_samples": 319078,
12
+ "train_samples_per_second": 55.714,
13
+ "train_steps_per_second": 0.435
14
  }
config.json CHANGED
@@ -25,6 +25,6 @@
25
  "tie_word_embeddings": true,
26
  "torch_dtype": "bfloat16",
27
  "transformers_version": "4.42.3",
28
- "use_cache": true,
29
  "vocab_size": 49152
30
  }
 
25
  "tie_word_embeddings": true,
26
  "torch_dtype": "bfloat16",
27
  "transformers_version": "4.42.3",
28
+ "use_cache": false,
29
  "vocab_size": 49152
30
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c53544daf87a240d40ccb96ac60c525d7a2d397a3ab661fe5bf6f6928f6fbf9f
3
  size 723674912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9807c299d8bab57ce4b5ad8aed938ec75a2a49cf79a22d3ee29cca105e2f5d4
3
  size 723674912
runs/Aug12_15-26-23_ip-26-0-165-24/events.out.tfevents.1723476428.ip-26-0-165-24.498584.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7522cf76d54904b417764e96888e83230877da42b81aee8eb6296aa0d6b01408
3
+ size 40257
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 80063181619200.0,
4
- "train_loss": 0.9145085595402063,
5
- "train_runtime": 1902.7506,
6
  "train_samples": 319078,
7
- "train_samples_per_second": 54.885,
8
- "train_steps_per_second": 0.429
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 80063181619200.0,
4
+ "train_loss": 0.914715180794398,
5
+ "train_runtime": 1874.4496,
6
  "train_samples": 319078,
7
+ "train_samples_per_second": 55.714,
8
+ "train_steps_per_second": 0.435
9
  }
trainer_state.json CHANGED
@@ -10,1168 +10,1168 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0012254901960784314,
13
- "grad_norm": 1.6036396146523473,
14
  "learning_rate": 1.2195121951219513e-05,
15
  "loss": 1.3541,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.006127450980392157,
20
- "grad_norm": 1.4541999892274449,
21
  "learning_rate": 6.097560975609756e-05,
22
  "loss": 1.355,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.012254901960784314,
27
- "grad_norm": 1.5390535097118918,
28
  "learning_rate": 0.00012195121951219512,
29
  "loss": 1.3083,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.01838235294117647,
34
- "grad_norm": 0.5811818759637415,
35
  "learning_rate": 0.00018292682926829268,
36
  "loss": 1.2226,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.024509803921568627,
41
- "grad_norm": 0.3755084090839048,
42
  "learning_rate": 0.00024390243902439024,
43
- "loss": 1.14,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.030637254901960783,
48
- "grad_norm": 0.24648929470069889,
49
  "learning_rate": 0.0003048780487804878,
50
  "loss": 1.0942,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.03676470588235294,
55
- "grad_norm": 0.18518290277618207,
56
  "learning_rate": 0.00036585365853658537,
57
  "loss": 1.0785,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.0428921568627451,
62
- "grad_norm": 0.14372846391436536,
63
  "learning_rate": 0.0004268292682926829,
64
  "loss": 1.0549,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.049019607843137254,
69
- "grad_norm": 0.1389801769443967,
70
  "learning_rate": 0.0004878048780487805,
71
  "loss": 1.0493,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.05514705882352941,
76
- "grad_norm": 0.15520999728279067,
77
  "learning_rate": 0.0005487804878048781,
78
  "loss": 1.0306,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.061274509803921566,
83
- "grad_norm": 0.13121885809742784,
84
  "learning_rate": 0.0006097560975609756,
85
  "loss": 1.0204,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.06740196078431372,
90
- "grad_norm": 0.11703518229748765,
91
  "learning_rate": 0.0006707317073170732,
92
  "loss": 1.0281,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.07352941176470588,
97
- "grad_norm": 0.12071199354118181,
98
  "learning_rate": 0.0007317073170731707,
99
- "loss": 1.0188,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.07965686274509803,
104
- "grad_norm": 0.11482998270254378,
105
  "learning_rate": 0.0007926829268292683,
106
  "loss": 1.0019,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.0857843137254902,
111
- "grad_norm": 0.13413766368345428,
112
  "learning_rate": 0.0008536585365853659,
113
- "loss": 1.0043,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.09191176470588236,
118
- "grad_norm": 0.1471493243236106,
119
  "learning_rate": 0.0009146341463414635,
120
  "loss": 1.0071,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.09803921568627451,
125
- "grad_norm": 0.1259997868133445,
126
  "learning_rate": 0.000975609756097561,
127
  "loss": 0.9962,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.10416666666666667,
132
- "grad_norm": 0.16099811489081525,
133
  "learning_rate": 0.000999958782259877,
134
- "loss": 0.9999,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.11029411764705882,
139
- "grad_norm": 0.13529736823555488,
140
  "learning_rate": 0.0009997069206794246,
141
- "loss": 1.01,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.11642156862745098,
146
- "grad_norm": 0.1287112736990173,
147
  "learning_rate": 0.0009992262114666653,
148
  "loss": 0.9904,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.12254901960784313,
153
- "grad_norm": 0.13096915650623966,
154
  "learning_rate": 0.0009985168747689707,
155
- "loss": 0.9859,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.12867647058823528,
160
- "grad_norm": 0.14334941177812624,
161
  "learning_rate": 0.0009975792354368017,
162
  "loss": 0.9934,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.13480392156862744,
167
- "grad_norm": 0.13671174612094514,
168
  "learning_rate": 0.0009964137228749407,
169
  "loss": 0.9961,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.1409313725490196,
174
- "grad_norm": 0.12218713761592166,
175
  "learning_rate": 0.000995020870845837,
176
- "loss": 0.9948,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.14705882352941177,
181
- "grad_norm": 0.1291445702524626,
182
  "learning_rate": 0.0009934013172251653,
183
  "loss": 0.9824,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.15318627450980393,
188
- "grad_norm": 0.1346131782998567,
189
  "learning_rate": 0.0009915558037097002,
190
  "loss": 0.977,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.15931372549019607,
195
- "grad_norm": 0.13410866481050307,
196
  "learning_rate": 0.0009894851754776472,
197
  "loss": 0.9712,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.16544117647058823,
202
- "grad_norm": 0.12662709799384195,
203
  "learning_rate": 0.0009871903808015812,
204
  "loss": 0.9807,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.1715686274509804,
209
- "grad_norm": 0.10699714724935337,
210
  "learning_rate": 0.0009846724706141716,
211
  "loss": 0.977,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.17769607843137256,
216
- "grad_norm": 0.11416365647326593,
217
  "learning_rate": 0.0009819325980268945,
218
  "loss": 0.9743,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.18382352941176472,
223
- "grad_norm": 0.11826552720186441,
224
  "learning_rate": 0.0009789720178019483,
225
- "loss": 0.9741,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.18995098039215685,
230
- "grad_norm": 0.11966728016095998,
231
  "learning_rate": 0.0009757920857776188,
232
- "loss": 0.9633,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.19607843137254902,
237
- "grad_norm": 0.1224502041103689,
238
  "learning_rate": 0.0009723942582473544,
239
  "loss": 0.9544,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.20220588235294118,
244
- "grad_norm": 0.14383092165933975,
245
  "learning_rate": 0.0009687800912928362,
246
- "loss": 0.9696,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.20833333333333334,
251
- "grad_norm": 0.11172166270095091,
252
  "learning_rate": 0.0009649512400713498,
253
  "loss": 0.963,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.21446078431372548,
258
- "grad_norm": 0.15338163624311216,
259
  "learning_rate": 0.0009609094580577824,
260
- "loss": 0.96,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.22058823529411764,
265
- "grad_norm": 0.13464799651247097,
266
  "learning_rate": 0.0009566565962415959,
267
  "loss": 0.9578,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.2267156862745098,
272
- "grad_norm": 0.14069989404410843,
273
  "learning_rate": 0.0009521946022791401,
274
  "loss": 0.9555,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.23284313725490197,
279
- "grad_norm": 0.1563880044910766,
280
  "learning_rate": 0.0009475255196016972,
281
  "loss": 0.9579,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.23897058823529413,
286
- "grad_norm": 0.1354522777364055,
287
  "learning_rate": 0.0009426514864796647,
288
  "loss": 0.9494,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.24509803921568626,
293
- "grad_norm": 0.14141184524556524,
294
  "learning_rate": 0.0009375747350433044,
295
- "loss": 0.9479,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.2512254901960784,
300
- "grad_norm": 0.16178153584659036,
301
  "learning_rate": 0.0009322975902605082,
302
- "loss": 0.9655,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.25735294117647056,
307
- "grad_norm": 0.12500026452110888,
308
  "learning_rate": 0.0009268224688720474,
309
- "loss": 0.9446,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.26348039215686275,
314
- "grad_norm": 0.11225181351597031,
315
  "learning_rate": 0.0009211518782847931,
316
- "loss": 0.9425,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.2696078431372549,
321
- "grad_norm": 0.11760271912658449,
322
  "learning_rate": 0.0009152884154234145,
323
  "loss": 0.9451,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.2757352941176471,
328
- "grad_norm": 0.1118694290603578,
329
  "learning_rate": 0.0009092347655410818,
330
- "loss": 0.9403,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.2818627450980392,
335
- "grad_norm": 0.1420082600855828,
336
  "learning_rate": 0.0009029937009897176,
337
  "loss": 0.9349,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.28799019607843135,
342
- "grad_norm": 0.1056893276215326,
343
  "learning_rate": 0.0008965680799503608,
344
  "loss": 0.9329,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.29411764705882354,
349
- "grad_norm": 0.11697165985204966,
350
  "learning_rate": 0.0008899608451242233,
351
- "loss": 0.9396,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.3002450980392157,
356
- "grad_norm": 0.11807737475048682,
357
  "learning_rate": 0.0008831750223850389,
358
- "loss": 0.923,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.30637254901960786,
363
- "grad_norm": 0.1050901075842651,
364
  "learning_rate": 0.0008762137193933241,
365
  "loss": 0.9296,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.3125,
370
- "grad_norm": 0.12149544868604345,
371
  "learning_rate": 0.0008690801241731818,
372
- "loss": 0.9209,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.31862745098039214,
377
- "grad_norm": 1.9072328081474224,
378
  "learning_rate": 0.0008617775036523015,
379
- "loss": 0.9392,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.3247549019607843,
384
- "grad_norm": 0.11658317470657904,
385
  "learning_rate": 0.0008543092021658259,
386
  "loss": 0.9367,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.33088235294117646,
391
- "grad_norm": 0.11777705186781876,
392
  "learning_rate": 0.0008466786399247663,
393
- "loss": 0.9285,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.33700980392156865,
398
- "grad_norm": 0.10974224954880234,
399
  "learning_rate": 0.0008388893114496705,
400
- "loss": 0.9357,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.3431372549019608,
405
- "grad_norm": 0.10762311122261868,
406
  "learning_rate": 0.0008309447839702582,
407
- "loss": 0.9303,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.3492647058823529,
412
- "grad_norm": 0.10853358544847327,
413
  "learning_rate": 0.0008228486957917607,
414
- "loss": 0.9222,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.3553921568627451,
419
- "grad_norm": 0.11469387675689356,
420
  "learning_rate": 0.0008146047546287076,
421
- "loss": 0.9343,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.36151960784313725,
426
- "grad_norm": 0.11795665158917668,
427
  "learning_rate": 0.0008062167359069301,
428
- "loss": 0.9277,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.36764705882352944,
433
- "grad_norm": 0.11929317566114471,
434
  "learning_rate": 0.000797688481034551,
435
- "loss": 0.9176,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.3737745098039216,
440
- "grad_norm": 0.10301604347512731,
441
  "learning_rate": 0.00078902389564276,
442
  "loss": 0.9239,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.3799019607843137,
447
- "grad_norm": 0.1271379375111038,
448
  "learning_rate": 0.0007802269477971771,
449
- "loss": 0.9166,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.3860294117647059,
454
- "grad_norm": 0.13842829612868068,
455
  "learning_rate": 0.0007713016661806211,
456
- "loss": 0.9162,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.39215686274509803,
461
- "grad_norm": 0.1258803633770378,
462
  "learning_rate": 0.0007622521382481208,
463
- "loss": 0.9096,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.39828431372549017,
468
- "grad_norm": 0.10943327109661027,
469
  "learning_rate": 0.0007530825083550073,
470
- "loss": 0.9031,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.40441176470588236,
475
- "grad_norm": 0.10473608787205252,
476
  "learning_rate": 0.0007437969758589507,
477
- "loss": 0.9144,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.4105392156862745,
482
- "grad_norm": 0.1120426574406447,
483
  "learning_rate": 0.0007343997931968067,
484
- "loss": 0.9073,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.4166666666666667,
489
- "grad_norm": 0.10888401544172292,
490
  "learning_rate": 0.0007248952639371542,
491
- "loss": 0.9073,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.4227941176470588,
496
- "grad_norm": 0.11326294156455767,
497
  "learning_rate": 0.0007152877408094178,
498
- "loss": 0.8996,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.42892156862745096,
503
- "grad_norm": 0.12674079463497812,
504
  "learning_rate": 0.0007055816237104753,
505
- "loss": 0.9092,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.43504901960784315,
510
- "grad_norm": 0.12252284768767446,
511
  "learning_rate": 0.0006957813576896647,
512
- "loss": 0.8988,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.4411764705882353,
517
- "grad_norm": 0.12424572403107578,
518
  "learning_rate": 0.000685891430913113,
519
- "loss": 0.9088,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.44730392156862747,
524
- "grad_norm": 0.11202550868881908,
525
  "learning_rate": 0.0006759163726083191,
526
- "loss": 0.9002,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.4534313725490196,
531
- "grad_norm": 0.09998982889163562,
532
  "learning_rate": 0.0006658607509899319,
533
- "loss": 0.8993,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.45955882352941174,
538
- "grad_norm": 0.11977953776420541,
539
  "learning_rate": 0.0006557291711676738,
540
- "loss": 0.9062,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.46568627450980393,
545
- "grad_norm": 0.11102474447162053,
546
  "learning_rate": 0.0006455262730373672,
547
- "loss": 0.8898,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.47181372549019607,
552
- "grad_norm": 0.12262996603961465,
553
  "learning_rate": 0.0006352567291560318,
554
- "loss": 0.8945,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.47794117647058826,
559
- "grad_norm": 0.11193151635262173,
560
  "learning_rate": 0.0006249252426020216,
561
- "loss": 0.8974,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.4840686274509804,
566
- "grad_norm": 0.11911248377352072,
567
  "learning_rate": 0.0006145365448211866,
568
- "loss": 0.8995,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.49019607843137253,
573
- "grad_norm": 0.11024344557839909,
574
  "learning_rate": 0.0006040953934600423,
575
- "loss": 0.8919,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.4963235294117647,
580
- "grad_norm": 0.10313300838358162,
581
  "learning_rate": 0.0005936065701869403,
582
- "loss": 0.8965,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.5024509803921569,
587
- "grad_norm": 0.1517513971243366,
588
  "learning_rate": 0.0005830748785022368,
589
- "loss": 0.8951,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.508578431372549,
594
- "grad_norm": 0.10621777821428764,
595
  "learning_rate": 0.0005725051415384657,
596
- "loss": 0.9009,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.5147058823529411,
601
- "grad_norm": 0.120824242008392,
602
  "learning_rate": 0.0005619021998515165,
603
- "loss": 0.8916,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.5208333333333334,
608
- "grad_norm": 0.10796312687200485,
609
  "learning_rate": 0.000551270909203838,
610
- "loss": 0.8875,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.5269607843137255,
615
- "grad_norm": 0.10485643705406462,
616
  "learning_rate": 0.0005406161383406731,
617
- "loss": 0.8995,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.5330882352941176,
622
- "grad_norm": 0.1040747798660248,
623
  "learning_rate": 0.0005299427667603515,
624
- "loss": 0.9022,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.5392156862745098,
629
- "grad_norm": 0.10303234276114956,
630
  "learning_rate": 0.0005192556824796568,
631
- "loss": 0.8858,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.5453431372549019,
636
- "grad_norm": 0.13041962513060196,
637
  "learning_rate": 0.0005085597797952905,
638
- "loss": 0.8842,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.5514705882352942,
643
- "grad_norm": 0.09392051916112838,
644
  "learning_rate": 0.0004978599570424639,
645
- "loss": 0.8832,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.5575980392156863,
650
- "grad_norm": 0.11180233058561544,
651
  "learning_rate": 0.0004871611143516367,
652
- "loss": 0.8878,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.5637254901960784,
657
- "grad_norm": 0.12367570385780484,
658
  "learning_rate": 0.0004764681514044362,
659
- "loss": 0.8859,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.5698529411764706,
664
- "grad_norm": 0.10034153908219615,
665
  "learning_rate": 0.0004657859651897806,
666
- "loss": 0.8889,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.5759803921568627,
671
- "grad_norm": 0.09279117510206411,
672
  "learning_rate": 0.00045511944776123513,
673
- "loss": 0.878,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.5821078431372549,
678
- "grad_norm": 0.09436846491514878,
679
  "learning_rate": 0.00044447348399663056,
680
- "loss": 0.8842,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.5882352941176471,
685
- "grad_norm": 0.2016423837068627,
686
  "learning_rate": 0.0004338529493609647,
687
- "loss": 0.8815,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.5943627450980392,
692
- "grad_norm": 0.1280806078271419,
693
  "learning_rate": 0.00042326270767361815,
694
- "loss": 0.8877,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.6004901960784313,
699
- "grad_norm": 0.08986898470466548,
700
  "learning_rate": 0.00041270760888089997,
701
- "loss": 0.8819,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.6066176470588235,
706
- "grad_norm": 0.0982311145214648,
707
  "learning_rate": 0.00040219248683494925,
708
- "loss": 0.8629,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.6127450980392157,
713
- "grad_norm": 0.12205296661938488,
714
  "learning_rate": 0.0003917221570800065,
715
- "loss": 0.8713,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.6188725490196079,
720
- "grad_norm": 0.10671335419272648,
721
  "learning_rate": 0.000381301414647068,
722
- "loss": 0.8703,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.625,
727
- "grad_norm": 0.10436549804544415,
728
  "learning_rate": 0.0003709350318579371,
729
- "loss": 0.8929,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.6311274509803921,
734
- "grad_norm": 0.09117916773772033,
735
  "learning_rate": 0.0003606277561396726,
736
- "loss": 0.8591,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.6372549019607843,
741
- "grad_norm": 0.09178929014053801,
742
  "learning_rate": 0.00035038430785044053,
743
- "loss": 0.8625,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.6433823529411765,
748
- "grad_norm": 0.09612760306153695,
749
  "learning_rate": 0.00034020937811776156,
750
- "loss": 0.8594,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.6495098039215687,
755
- "grad_norm": 0.09538202932858882,
756
  "learning_rate": 0.00033010762669014347,
757
- "loss": 0.867,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.6556372549019608,
762
- "grad_norm": 0.09494219161832793,
763
  "learning_rate": 0.00032008367980308734,
764
- "loss": 0.872,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.6617647058823529,
769
- "grad_norm": 0.08714205666482473,
770
  "learning_rate": 0.0003101421280604379,
771
- "loss": 0.8838,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.6678921568627451,
776
- "grad_norm": 0.0969140933539997,
777
  "learning_rate": 0.00030028752433205476,
778
- "loss": 0.8608,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.6740196078431373,
783
- "grad_norm": 0.09815891195004724,
784
  "learning_rate": 0.00029052438166876307,
785
- "loss": 0.8525,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.6801470588235294,
790
- "grad_norm": 0.09615743129166938,
791
  "learning_rate": 0.0002808571712355389,
792
- "loss": 0.8638,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.6862745098039216,
797
- "grad_norm": 0.10788692448970114,
798
  "learning_rate": 0.00027129032026388045,
799
- "loss": 0.8579,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.6924019607843137,
804
- "grad_norm": 0.09213201587158737,
805
  "learning_rate": 0.00026182821002429345,
806
- "loss": 0.8615,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.6985294117647058,
811
- "grad_norm": 0.09011406445898068,
812
  "learning_rate": 0.00025247517381983136,
813
- "loss": 0.8653,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.7046568627450981,
818
- "grad_norm": 0.09736667368082612,
819
  "learning_rate": 0.00024323549500159802,
820
- "loss": 0.8617,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.7107843137254902,
825
- "grad_norm": 0.09441622008962705,
826
  "learning_rate": 0.0002341134050071283,
827
- "loss": 0.8549,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.7169117647058824,
832
- "grad_norm": 0.10000573346346127,
833
  "learning_rate": 0.00022511308142254488,
834
- "loss": 0.8575,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.7230392156862745,
839
- "grad_norm": 0.1122829077347512,
840
  "learning_rate": 0.000216238646069373,
841
- "loss": 0.8604,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.7291666666666666,
846
- "grad_norm": 0.09161355750906706,
847
  "learning_rate": 0.00020749416311689845,
848
- "loss": 0.8604,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.7352941176470589,
853
- "grad_norm": 0.10349631376405924,
854
  "learning_rate": 0.00019888363722092372,
855
- "loss": 0.8629,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.741421568627451,
860
- "grad_norm": 0.088451467518437,
861
  "learning_rate": 0.00019041101168978093,
862
- "loss": 0.8587,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.7475490196078431,
867
- "grad_norm": 0.09610692111696861,
868
  "learning_rate": 0.00018208016667844152,
869
- "loss": 0.8613,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.7536764705882353,
874
- "grad_norm": 0.09635232236992683,
875
  "learning_rate": 0.00017389491741154372,
876
- "loss": 0.8541,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.7598039215686274,
881
- "grad_norm": 0.08400718304881724,
882
  "learning_rate": 0.00016585901243616042,
883
- "loss": 0.8564,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.7659313725490197,
888
- "grad_norm": 0.08527935145250837,
889
  "learning_rate": 0.0001579761319050991,
890
- "loss": 0.8545,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.7720588235294118,
895
- "grad_norm": 0.08535848929061582,
896
  "learning_rate": 0.00015024988589152537,
897
- "loss": 0.858,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.7781862745098039,
902
- "grad_norm": 0.08525484362176303,
903
  "learning_rate": 0.0001426838127356823,
904
- "loss": 0.8538,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.7843137254901961,
909
- "grad_norm": 0.08278907118602048,
910
  "learning_rate": 0.0001352813774244565,
911
- "loss": 0.8488,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.7904411764705882,
916
- "grad_norm": 0.08784154420560207,
917
  "learning_rate": 0.00012804597000454215,
918
- "loss": 0.8556,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.7965686274509803,
923
- "grad_norm": 0.08728645873334986,
924
  "learning_rate": 0.00012098090402992085,
925
- "loss": 0.8662,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.8026960784313726,
930
- "grad_norm": 0.08368590385119791,
931
  "learning_rate": 0.00011408941504437532,
932
- "loss": 0.8541,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.8088235294117647,
937
- "grad_norm": 0.12573965965935716,
938
  "learning_rate": 0.00010737465909972776,
939
- "loss": 0.8472,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.8149509803921569,
944
- "grad_norm": 0.08047884146311494,
945
  "learning_rate": 0.00010083971131048159,
946
- "loss": 0.8492,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.821078431372549,
951
- "grad_norm": 0.07946237543030905,
952
  "learning_rate": 9.448756444553224e-05,
953
- "loss": 0.8503,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.8272058823529411,
958
- "grad_norm": 0.08237915177176067,
959
  "learning_rate": 8.832112755758598e-05,
960
- "loss": 0.848,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.8333333333333334,
965
- "grad_norm": 0.08064567161653642,
966
  "learning_rate": 8.234322465092047e-05,
967
- "loss": 0.8489,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.8394607843137255,
972
- "grad_norm": 0.08580518309504229,
973
  "learning_rate": 7.655659338809329e-05,
974
- "loss": 0.8481,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.8455882352941176,
979
- "grad_norm": 0.08240105778083462,
980
  "learning_rate": 7.096388383619079e-05,
981
- "loss": 0.8434,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.8517156862745098,
986
- "grad_norm": 0.10667272643643334,
987
  "learning_rate": 6.556765725319525e-05,
988
- "loss": 0.8477,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.8578431372549019,
993
- "grad_norm": 0.09351829478479391,
994
  "learning_rate": 6.037038491501978e-05,
995
- "loss": 0.8521,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.8639705882352942,
1000
- "grad_norm": 0.07805110075360967,
1001
  "learning_rate": 5.53744469837551e-05,
1002
- "loss": 0.843,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.8700980392156863,
1007
- "grad_norm": 0.08112250711019013,
1008
  "learning_rate": 5.058213141764151e-05,
1009
- "loss": 0.8434,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.8762254901960784,
1014
- "grad_norm": 0.09988697539615195,
1015
  "learning_rate": 4.599563292326592e-05,
1016
- "loss": 0.8483,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.8823529411764706,
1021
- "grad_norm": 0.08158094478141424,
1022
  "learning_rate": 4.161705195046761e-05,
1023
- "loss": 0.8441,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.8884803921568627,
1028
- "grad_norm": 0.08831427591672994,
1029
  "learning_rate": 3.744839373040682e-05,
1030
- "loss": 0.8465,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.8946078431372549,
1035
- "grad_norm": 0.08621255370586131,
1036
  "learning_rate": 3.349156735724274e-05,
1037
- "loss": 0.8478,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.9007352941176471,
1042
- "grad_norm": 0.07423138411019962,
1043
  "learning_rate": 2.9748384913837522e-05,
1044
- "loss": 0.8345,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.9068627450980392,
1049
- "grad_norm": 0.0795306653386215,
1050
  "learning_rate": 2.622056064188738e-05,
1051
- "loss": 0.8537,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.9129901960784313,
1056
- "grad_norm": 0.07964808233910256,
1057
  "learning_rate": 2.2909710156863274e-05,
1058
- "loss": 0.8512,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.9191176470588235,
1063
- "grad_norm": 0.08073502455884987,
1064
  "learning_rate": 1.981734970811644e-05,
1065
- "loss": 0.8415,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.9252450980392157,
1070
- "grad_norm": 0.07585958934986796,
1071
  "learning_rate": 1.6944895484492072e-05,
1072
- "loss": 0.8521,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.9313725490196079,
1077
- "grad_norm": 0.08108459367268084,
1078
  "learning_rate": 1.429366296576623e-05,
1079
- "loss": 0.851,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.9375,
1084
- "grad_norm": 0.08450154778950293,
1085
  "learning_rate": 1.1864866320203115e-05,
1086
- "loss": 0.8478,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.9436274509803921,
1091
- "grad_norm": 0.08368284039729793,
1092
  "learning_rate": 9.659617848510882e-06,
1093
- "loss": 0.8447,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.9497549019607843,
1098
- "grad_norm": 0.08046653108562293,
1099
  "learning_rate": 7.678927474447817e-06,
1100
- "loss": 0.8443,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.9558823529411765,
1105
- "grad_norm": 0.08163069514122503,
1106
  "learning_rate": 5.923702282314092e-06,
1107
- "loss": 0.8467,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.9620098039215687,
1112
- "grad_norm": 0.07894532908798362,
1113
  "learning_rate": 4.394746101540115e-06,
1114
- "loss": 0.8421,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.9681372549019608,
1119
- "grad_norm": 0.08066759347765237,
1120
  "learning_rate": 3.092759138561607e-06,
1121
- "loss": 0.8403,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.9742647058823529,
1126
- "grad_norm": 0.08225885021056388,
1127
  "learning_rate": 2.018337656150726e-06,
1128
- "loss": 0.8459,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.9803921568627451,
1133
- "grad_norm": 0.0939601026942546,
1134
  "learning_rate": 1.1719737003492159e-06,
1135
- "loss": 0.8385,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.9865196078431373,
1140
- "grad_norm": 0.07754259049318248,
1141
  "learning_rate": 5.540548751292173e-07,
1142
- "loss": 0.8343,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.9926470588235294,
1147
- "grad_norm": 0.08221470163380953,
1148
  "learning_rate": 1.6486416488459277e-07,
1149
- "loss": 0.8475,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.9987745098039216,
1154
- "grad_norm": 0.08134157509220469,
1155
  "learning_rate": 4.579804834703438e-09,
1156
- "loss": 0.8399,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 1.0,
1161
- "eval_loss": 1.202902913093567,
1162
- "eval_runtime": 113.99,
1163
- "eval_samples_per_second": 183.709,
1164
- "eval_steps_per_second": 5.746,
1165
  "step": 816
1166
  },
1167
  {
1168
  "epoch": 1.0,
1169
  "step": 816,
1170
  "total_flos": 80063181619200.0,
1171
- "train_loss": 0.9145085595402063,
1172
- "train_runtime": 1902.7506,
1173
- "train_samples_per_second": 54.885,
1174
- "train_steps_per_second": 0.429
1175
  }
1176
  ],
1177
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0012254901960784314,
13
+ "grad_norm": 1.603988652495189,
14
  "learning_rate": 1.2195121951219513e-05,
15
  "loss": 1.3541,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.006127450980392157,
20
+ "grad_norm": 1.4587743423854005,
21
  "learning_rate": 6.097560975609756e-05,
22
  "loss": 1.355,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.012254901960784314,
27
+ "grad_norm": 1.538685925430638,
28
  "learning_rate": 0.00012195121951219512,
29
  "loss": 1.3083,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.01838235294117647,
34
+ "grad_norm": 0.5820643527511544,
35
  "learning_rate": 0.00018292682926829268,
36
  "loss": 1.2226,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.024509803921568627,
41
+ "grad_norm": 0.37609725156071605,
42
  "learning_rate": 0.00024390243902439024,
43
+ "loss": 1.1399,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.030637254901960783,
48
+ "grad_norm": 0.2477379120591674,
49
  "learning_rate": 0.0003048780487804878,
50
  "loss": 1.0942,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.03676470588235294,
55
+ "grad_norm": 0.18934222364015724,
56
  "learning_rate": 0.00036585365853658537,
57
  "loss": 1.0785,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.0428921568627451,
62
+ "grad_norm": 0.14574615338739755,
63
  "learning_rate": 0.0004268292682926829,
64
  "loss": 1.0549,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.049019607843137254,
69
+ "grad_norm": 0.12815166481708085,
70
  "learning_rate": 0.0004878048780487805,
71
  "loss": 1.0493,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.05514705882352941,
76
+ "grad_norm": 0.15010519509218812,
77
  "learning_rate": 0.0005487804878048781,
78
  "loss": 1.0306,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.061274509803921566,
83
+ "grad_norm": 0.13010925959434533,
84
  "learning_rate": 0.0006097560975609756,
85
  "loss": 1.0204,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.06740196078431372,
90
+ "grad_norm": 0.11891525726508857,
91
  "learning_rate": 0.0006707317073170732,
92
  "loss": 1.0281,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.07352941176470588,
97
+ "grad_norm": 0.12117005404429922,
98
  "learning_rate": 0.0007317073170731707,
99
+ "loss": 1.0187,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.07965686274509803,
104
+ "grad_norm": 0.11923924460163615,
105
  "learning_rate": 0.0007926829268292683,
106
  "loss": 1.0019,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.0857843137254902,
111
+ "grad_norm": 0.13523477315023974,
112
  "learning_rate": 0.0008536585365853659,
113
+ "loss": 1.0044,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.09191176470588236,
118
+ "grad_norm": 0.11307823129618054,
119
  "learning_rate": 0.0009146341463414635,
120
  "loss": 1.0071,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.09803921568627451,
125
+ "grad_norm": 0.1206648748330027,
126
  "learning_rate": 0.000975609756097561,
127
  "loss": 0.9962,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.10416666666666667,
132
+ "grad_norm": 0.15935283992889565,
133
  "learning_rate": 0.000999958782259877,
134
+ "loss": 0.9998,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.11029411764705882,
139
+ "grad_norm": 0.13630538643217202,
140
  "learning_rate": 0.0009997069206794246,
141
+ "loss": 1.0101,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.11642156862745098,
146
+ "grad_norm": 0.13281831595913912,
147
  "learning_rate": 0.0009992262114666653,
148
  "loss": 0.9904,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.12254901960784313,
153
+ "grad_norm": 0.13570583268291556,
154
  "learning_rate": 0.0009985168747689707,
155
+ "loss": 0.986,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.12867647058823528,
160
+ "grad_norm": 0.14302939089927838,
161
  "learning_rate": 0.0009975792354368017,
162
  "loss": 0.9934,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.13480392156862744,
167
+ "grad_norm": 0.14349254312543258,
168
  "learning_rate": 0.0009964137228749407,
169
  "loss": 0.9961,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.1409313725490196,
174
+ "grad_norm": 0.13092834458351188,
175
  "learning_rate": 0.000995020870845837,
176
+ "loss": 0.9949,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.14705882352941177,
181
+ "grad_norm": 0.1251118799739706,
182
  "learning_rate": 0.0009934013172251653,
183
  "loss": 0.9824,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.15318627450980393,
188
+ "grad_norm": 0.1323892618605143,
189
  "learning_rate": 0.0009915558037097002,
190
  "loss": 0.977,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.15931372549019607,
195
+ "grad_norm": 0.13068519381648078,
196
  "learning_rate": 0.0009894851754776472,
197
  "loss": 0.9712,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.16544117647058823,
202
+ "grad_norm": 0.1310198992984819,
203
  "learning_rate": 0.0009871903808015812,
204
  "loss": 0.9807,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.1715686274509804,
209
+ "grad_norm": 0.10811315309592277,
210
  "learning_rate": 0.0009846724706141716,
211
  "loss": 0.977,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.17769607843137256,
216
+ "grad_norm": 0.11603574555194691,
217
  "learning_rate": 0.0009819325980268945,
218
  "loss": 0.9743,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.18382352941176472,
223
+ "grad_norm": 0.11664960595520962,
224
  "learning_rate": 0.0009789720178019483,
225
+ "loss": 0.9742,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.18995098039215685,
230
+ "grad_norm": 0.11920852297334043,
231
  "learning_rate": 0.0009757920857776188,
232
+ "loss": 0.9635,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.19607843137254902,
237
+ "grad_norm": 0.13745202686899544,
238
  "learning_rate": 0.0009723942582473544,
239
  "loss": 0.9544,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.20220588235294118,
244
+ "grad_norm": 0.15444220703514816,
245
  "learning_rate": 0.0009687800912928362,
246
+ "loss": 0.9697,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.20833333333333334,
251
+ "grad_norm": 0.11511217870343073,
252
  "learning_rate": 0.0009649512400713498,
253
  "loss": 0.963,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.21446078431372548,
258
+ "grad_norm": 0.163845433820889,
259
  "learning_rate": 0.0009609094580577824,
260
+ "loss": 0.9601,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.22058823529411764,
265
+ "grad_norm": 0.12370218334013189,
266
  "learning_rate": 0.0009566565962415959,
267
  "loss": 0.9578,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.2267156862745098,
272
+ "grad_norm": 0.138057520129555,
273
  "learning_rate": 0.0009521946022791401,
274
  "loss": 0.9555,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.23284313725490197,
279
+ "grad_norm": 0.161151229045878,
280
  "learning_rate": 0.0009475255196016972,
281
  "loss": 0.9579,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.23897058823529413,
286
+ "grad_norm": 0.13900328482304902,
287
  "learning_rate": 0.0009426514864796647,
288
  "loss": 0.9494,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.24509803921568626,
293
+ "grad_norm": 0.14057545846182565,
294
  "learning_rate": 0.0009375747350433044,
295
+ "loss": 0.9478,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.2512254901960784,
300
+ "grad_norm": 0.15616371521107208,
301
  "learning_rate": 0.0009322975902605082,
302
+ "loss": 0.9654,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.25735294117647056,
307
+ "grad_norm": 0.11827026404580182,
308
  "learning_rate": 0.0009268224688720474,
309
+ "loss": 0.9445,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.26348039215686275,
314
+ "grad_norm": 0.11103911913637518,
315
  "learning_rate": 0.0009211518782847931,
316
+ "loss": 0.9424,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.2696078431372549,
321
+ "grad_norm": 0.11604427070566481,
322
  "learning_rate": 0.0009152884154234145,
323
  "loss": 0.9451,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.2757352941176471,
328
+ "grad_norm": 0.1099562215414043,
329
  "learning_rate": 0.0009092347655410818,
330
+ "loss": 0.9402,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.2818627450980392,
335
+ "grad_norm": 0.14837520991789005,
336
  "learning_rate": 0.0009029937009897176,
337
  "loss": 0.9349,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.28799019607843135,
342
+ "grad_norm": 0.10928552841333679,
343
  "learning_rate": 0.0008965680799503608,
344
  "loss": 0.9329,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.29411764705882354,
349
+ "grad_norm": 0.11407153214331639,
350
  "learning_rate": 0.0008899608451242233,
351
+ "loss": 0.9379,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.3002450980392157,
356
+ "grad_norm": 0.11231916470556697,
357
  "learning_rate": 0.0008831750223850389,
358
+ "loss": 0.9229,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.30637254901960786,
363
+ "grad_norm": 0.11185052745256109,
364
  "learning_rate": 0.0008762137193933241,
365
  "loss": 0.9296,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.3125,
370
+ "grad_norm": 0.11855657350077958,
371
  "learning_rate": 0.0008690801241731818,
372
+ "loss": 0.9207,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.31862745098039214,
377
+ "grad_norm": 1.8537407128611012,
378
  "learning_rate": 0.0008617775036523015,
379
+ "loss": 0.9387,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.3247549019607843,
384
+ "grad_norm": 0.11676606107692747,
385
  "learning_rate": 0.0008543092021658259,
386
  "loss": 0.9367,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.33088235294117646,
391
+ "grad_norm": 0.10492580984162286,
392
  "learning_rate": 0.0008466786399247663,
393
+ "loss": 0.928,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.33700980392156865,
398
+ "grad_norm": 0.10281631398110604,
399
  "learning_rate": 0.0008388893114496705,
400
+ "loss": 0.935,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.3431372549019608,
405
+ "grad_norm": 0.11217225067437296,
406
  "learning_rate": 0.0008309447839702582,
407
+ "loss": 0.9298,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.3492647058823529,
412
+ "grad_norm": 0.11327220268180357,
413
  "learning_rate": 0.0008228486957917607,
414
+ "loss": 0.9219,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.3553921568627451,
419
+ "grad_norm": 0.11554152008646122,
420
  "learning_rate": 0.0008146047546287076,
421
+ "loss": 0.934,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.36151960784313725,
426
+ "grad_norm": 0.13610027478132888,
427
  "learning_rate": 0.0008062167359069301,
428
+ "loss": 0.9276,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.36764705882352944,
433
+ "grad_norm": 0.12248610966496465,
434
  "learning_rate": 0.000797688481034551,
435
+ "loss": 0.9175,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.3737745098039216,
440
+ "grad_norm": 0.10512495641494239,
441
  "learning_rate": 0.00078902389564276,
442
  "loss": 0.9239,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.3799019607843137,
447
+ "grad_norm": 0.12079056888085157,
448
  "learning_rate": 0.0007802269477971771,
449
+ "loss": 0.9167,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.3860294117647059,
454
+ "grad_norm": 0.1311550506036977,
455
  "learning_rate": 0.0007713016661806211,
456
+ "loss": 0.9165,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.39215686274509803,
461
+ "grad_norm": 0.12748855363301959,
462
  "learning_rate": 0.0007622521382481208,
463
+ "loss": 0.9099,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.39828431372549017,
468
+ "grad_norm": 0.11389138878908127,
469
  "learning_rate": 0.0007530825083550073,
470
+ "loss": 0.9034,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.40441176470588236,
475
+ "grad_norm": 0.10172199627242663,
476
  "learning_rate": 0.0007437969758589507,
477
+ "loss": 0.9147,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.4105392156862745,
482
+ "grad_norm": 0.1136698134249708,
483
  "learning_rate": 0.0007343997931968067,
484
+ "loss": 0.9076,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.4166666666666667,
489
+ "grad_norm": 0.1110896296260987,
490
  "learning_rate": 0.0007248952639371542,
491
+ "loss": 0.9075,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.4227941176470588,
496
+ "grad_norm": 0.10357314484765201,
497
  "learning_rate": 0.0007152877408094178,
498
+ "loss": 0.8998,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.42892156862745096,
503
+ "grad_norm": 0.11773981651015025,
504
  "learning_rate": 0.0007055816237104753,
505
+ "loss": 0.9094,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.43504901960784315,
510
+ "grad_norm": 0.1283630128752841,
511
  "learning_rate": 0.0006957813576896647,
512
+ "loss": 0.899,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.4411764705882353,
517
+ "grad_norm": 0.1326640375854421,
518
  "learning_rate": 0.000685891430913113,
519
+ "loss": 0.9091,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.44730392156862747,
524
+ "grad_norm": 0.12057333477888295,
525
  "learning_rate": 0.0006759163726083191,
526
+ "loss": 0.9005,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.4534313725490196,
531
+ "grad_norm": 0.10157867473834796,
532
  "learning_rate": 0.0006658607509899319,
533
+ "loss": 0.8995,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.45955882352941174,
538
+ "grad_norm": 0.13679116304924,
539
  "learning_rate": 0.0006557291711676738,
540
+ "loss": 0.9064,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.46568627450980393,
545
+ "grad_norm": 0.10228308226469025,
546
  "learning_rate": 0.0006455262730373672,
547
+ "loss": 0.8902,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.47181372549019607,
552
+ "grad_norm": 0.11810749832493427,
553
  "learning_rate": 0.0006352567291560318,
554
+ "loss": 0.8947,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.47794117647058826,
559
+ "grad_norm": 0.11253919001414733,
560
  "learning_rate": 0.0006249252426020216,
561
+ "loss": 0.8984,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.4840686274509804,
566
+ "grad_norm": 0.10889918340035115,
567
  "learning_rate": 0.0006145365448211866,
568
+ "loss": 0.9001,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.49019607843137253,
573
+ "grad_norm": 0.10602494662106901,
574
  "learning_rate": 0.0006040953934600423,
575
+ "loss": 0.8924,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.4963235294117647,
580
+ "grad_norm": 0.09537450461248778,
581
  "learning_rate": 0.0005936065701869403,
582
+ "loss": 0.8971,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.5024509803921569,
587
+ "grad_norm": 0.1135732875240647,
588
  "learning_rate": 0.0005830748785022368,
589
+ "loss": 0.8956,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.508578431372549,
594
+ "grad_norm": 0.11824825784313651,
595
  "learning_rate": 0.0005725051415384657,
596
+ "loss": 0.9014,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.5147058823529411,
601
+ "grad_norm": 2.3957029087137602,
602
  "learning_rate": 0.0005619021998515165,
603
+ "loss": 0.8937,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.5208333333333334,
608
+ "grad_norm": 0.1305239745293032,
609
  "learning_rate": 0.000551270909203838,
610
+ "loss": 0.889,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.5269607843137255,
615
+ "grad_norm": 0.10923687170047386,
616
  "learning_rate": 0.0005406161383406731,
617
+ "loss": 0.9009,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.5330882352941176,
622
+ "grad_norm": 0.11720531307848668,
623
  "learning_rate": 0.0005299427667603515,
624
+ "loss": 0.9035,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.5392156862745098,
629
+ "grad_norm": 0.1043777454103823,
630
  "learning_rate": 0.0005192556824796568,
631
+ "loss": 0.887,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.5453431372549019,
636
+ "grad_norm": 0.12019301588246883,
637
  "learning_rate": 0.0005085597797952905,
638
+ "loss": 0.8852,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.5514705882352942,
643
+ "grad_norm": 0.09829925409523375,
644
  "learning_rate": 0.0004978599570424639,
645
+ "loss": 0.8841,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.5575980392156863,
650
+ "grad_norm": 0.110813034496191,
651
  "learning_rate": 0.0004871611143516367,
652
+ "loss": 0.8888,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.5637254901960784,
657
+ "grad_norm": 0.14013694091933743,
658
  "learning_rate": 0.0004764681514044362,
659
+ "loss": 0.8863,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.5698529411764706,
664
+ "grad_norm": 0.10955250297933698,
665
  "learning_rate": 0.0004657859651897806,
666
+ "loss": 0.8904,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.5759803921568627,
671
+ "grad_norm": 0.13711186271821346,
672
  "learning_rate": 0.00045511944776123513,
673
+ "loss": 0.8789,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.5821078431372549,
678
+ "grad_norm": 0.09396380277187082,
679
  "learning_rate": 0.00044447348399663056,
680
+ "loss": 0.8847,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.5882352941176471,
685
+ "grad_norm": 0.21392349020058346,
686
  "learning_rate": 0.0004338529493609647,
687
+ "loss": 0.8824,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.5943627450980392,
692
+ "grad_norm": 0.12755805564480172,
693
  "learning_rate": 0.00042326270767361815,
694
+ "loss": 0.8884,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.6004901960784313,
699
+ "grad_norm": 0.09157375745294742,
700
  "learning_rate": 0.00041270760888089997,
701
+ "loss": 0.8825,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.6066176470588235,
706
+ "grad_norm": 0.10173653886247282,
707
  "learning_rate": 0.00040219248683494925,
708
+ "loss": 0.8637,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.6127450980392157,
713
+ "grad_norm": 0.12386704656315299,
714
  "learning_rate": 0.0003917221570800065,
715
+ "loss": 0.8719,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.6188725490196079,
720
+ "grad_norm": 0.10921071757131698,
721
  "learning_rate": 0.000381301414647068,
722
+ "loss": 0.8707,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.625,
727
+ "grad_norm": 0.10860919138034633,
728
  "learning_rate": 0.0003709350318579371,
729
+ "loss": 0.8934,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.6311274509803921,
734
+ "grad_norm": 0.08765926558701954,
735
  "learning_rate": 0.0003606277561396726,
736
+ "loss": 0.8595,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.6372549019607843,
741
+ "grad_norm": 0.08795902636008367,
742
  "learning_rate": 0.00035038430785044053,
743
+ "loss": 0.8629,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.6433823529411765,
748
+ "grad_norm": 0.10125788693590333,
749
  "learning_rate": 0.00034020937811776156,
750
+ "loss": 0.8597,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.6495098039215687,
755
+ "grad_norm": 0.09640732281156021,
756
  "learning_rate": 0.00033010762669014347,
757
+ "loss": 0.8672,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.6556372549019608,
762
+ "grad_norm": 0.09206201588796137,
763
  "learning_rate": 0.00032008367980308734,
764
+ "loss": 0.8723,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.6617647058823529,
769
+ "grad_norm": 0.089094237721721,
770
  "learning_rate": 0.0003101421280604379,
771
+ "loss": 0.884,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.6678921568627451,
776
+ "grad_norm": 0.10047930336023028,
777
  "learning_rate": 0.00030028752433205476,
778
+ "loss": 0.8612,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.6740196078431373,
783
+ "grad_norm": 0.09796290633516842,
784
  "learning_rate": 0.00029052438166876307,
785
+ "loss": 0.8527,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.6801470588235294,
790
+ "grad_norm": 0.08908481799962162,
791
  "learning_rate": 0.0002808571712355389,
792
+ "loss": 0.8636,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.6862745098039216,
797
+ "grad_norm": 0.09854862986040251,
798
  "learning_rate": 0.00027129032026388045,
799
+ "loss": 0.8581,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.6924019607843137,
804
+ "grad_norm": 0.096989721310236,
805
  "learning_rate": 0.00026182821002429345,
806
+ "loss": 0.8617,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.6985294117647058,
811
+ "grad_norm": 0.09027729876751488,
812
  "learning_rate": 0.00025247517381983136,
813
+ "loss": 0.8654,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.7046568627450981,
818
+ "grad_norm": 0.10227245851698821,
819
  "learning_rate": 0.00024323549500159802,
820
+ "loss": 0.8618,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.7107843137254902,
825
+ "grad_norm": 0.09927553647728089,
826
  "learning_rate": 0.0002341134050071283,
827
+ "loss": 0.855,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.7169117647058824,
832
+ "grad_norm": 0.09142338818988954,
833
  "learning_rate": 0.00022511308142254488,
834
+ "loss": 0.8577,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.7230392156862745,
839
+ "grad_norm": 0.10507626286878373,
840
  "learning_rate": 0.000216238646069373,
841
+ "loss": 0.8605,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.7291666666666666,
846
+ "grad_norm": 0.09773601600409339,
847
  "learning_rate": 0.00020749416311689845,
848
+ "loss": 0.8605,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.7352941176470589,
853
+ "grad_norm": 0.1053760063340528,
854
  "learning_rate": 0.00019888363722092372,
855
+ "loss": 0.8631,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.741421568627451,
860
+ "grad_norm": 0.09919853848427344,
861
  "learning_rate": 0.00019041101168978093,
862
+ "loss": 0.8589,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.7475490196078431,
867
+ "grad_norm": 0.09240852582600491,
868
  "learning_rate": 0.00018208016667844152,
869
+ "loss": 0.8616,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.7536764705882353,
874
+ "grad_norm": 0.09385869340911827,
875
  "learning_rate": 0.00017389491741154372,
876
+ "loss": 0.8543,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.7598039215686274,
881
+ "grad_norm": 0.08602993504708097,
882
  "learning_rate": 0.00016585901243616042,
883
+ "loss": 0.8566,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.7659313725490197,
888
+ "grad_norm": 0.08661913403120794,
889
  "learning_rate": 0.0001579761319050991,
890
+ "loss": 0.8546,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.7720588235294118,
895
+ "grad_norm": 0.08756073235275695,
896
  "learning_rate": 0.00015024988589152537,
897
+ "loss": 0.8582,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.7781862745098039,
902
+ "grad_norm": 0.08339963011288148,
903
  "learning_rate": 0.0001426838127356823,
904
+ "loss": 0.8541,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.7843137254901961,
909
+ "grad_norm": 0.07897307103939846,
910
  "learning_rate": 0.0001352813774244565,
911
+ "loss": 0.849,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.7904411764705882,
916
+ "grad_norm": 0.08692536794832408,
917
  "learning_rate": 0.00012804597000454215,
918
+ "loss": 0.8559,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.7965686274509803,
923
+ "grad_norm": 0.08695762926336753,
924
  "learning_rate": 0.00012098090402992085,
925
+ "loss": 0.8665,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.8026960784313726,
930
+ "grad_norm": 0.08241315305272631,
931
  "learning_rate": 0.00011408941504437532,
932
+ "loss": 0.8544,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.8088235294117647,
937
+ "grad_norm": 0.07821925622204019,
938
  "learning_rate": 0.00010737465909972776,
939
+ "loss": 0.8474,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.8149509803921569,
944
+ "grad_norm": 0.08521471066806094,
945
  "learning_rate": 0.00010083971131048159,
946
+ "loss": 0.8495,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.821078431372549,
951
+ "grad_norm": 0.08381156457580924,
952
  "learning_rate": 9.448756444553224e-05,
953
+ "loss": 0.8506,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.8272058823529411,
958
+ "grad_norm": 0.080205577901611,
959
  "learning_rate": 8.832112755758598e-05,
960
+ "loss": 0.8482,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.8333333333333334,
965
+ "grad_norm": 0.08210122013268317,
966
  "learning_rate": 8.234322465092047e-05,
967
+ "loss": 0.8491,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.8394607843137255,
972
+ "grad_norm": 0.08195821975889148,
973
  "learning_rate": 7.655659338809329e-05,
974
+ "loss": 0.8484,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.8455882352941176,
979
+ "grad_norm": 0.08225068138923354,
980
  "learning_rate": 7.096388383619079e-05,
981
+ "loss": 0.8436,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.8517156862745098,
986
+ "grad_norm": 0.10816220803390626,
987
  "learning_rate": 6.556765725319525e-05,
988
+ "loss": 0.8479,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.8578431372549019,
993
+ "grad_norm": 0.08954785260614277,
994
  "learning_rate": 6.037038491501978e-05,
995
+ "loss": 0.8524,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.8639705882352942,
1000
+ "grad_norm": 0.07665058203914679,
1001
  "learning_rate": 5.53744469837551e-05,
1002
+ "loss": 0.8431,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.8700980392156863,
1007
+ "grad_norm": 0.0782138298232773,
1008
  "learning_rate": 5.058213141764151e-05,
1009
+ "loss": 0.8438,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.8762254901960784,
1014
+ "grad_norm": 0.08269251578264038,
1015
  "learning_rate": 4.599563292326592e-05,
1016
+ "loss": 0.8485,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.8823529411764706,
1021
+ "grad_norm": 0.0907687363220474,
1022
  "learning_rate": 4.161705195046761e-05,
1023
+ "loss": 0.8443,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.8884803921568627,
1028
+ "grad_norm": 0.08259230750361556,
1029
  "learning_rate": 3.744839373040682e-05,
1030
+ "loss": 0.8467,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.8946078431372549,
1035
+ "grad_norm": 0.07999560967778772,
1036
  "learning_rate": 3.349156735724274e-05,
1037
+ "loss": 0.848,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.9007352941176471,
1042
+ "grad_norm": 0.07524504711853225,
1043
  "learning_rate": 2.9748384913837522e-05,
1044
+ "loss": 0.8348,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.9068627450980392,
1049
+ "grad_norm": 0.08135847243984051,
1050
  "learning_rate": 2.622056064188738e-05,
1051
+ "loss": 0.854,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.9129901960784313,
1056
+ "grad_norm": 0.07885634814452873,
1057
  "learning_rate": 2.2909710156863274e-05,
1058
+ "loss": 0.8514,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.9191176470588235,
1063
+ "grad_norm": 0.08736339560766254,
1064
  "learning_rate": 1.981734970811644e-05,
1065
+ "loss": 0.8417,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.9252450980392157,
1070
+ "grad_norm": 0.0767085793238129,
1071
  "learning_rate": 1.6944895484492072e-05,
1072
+ "loss": 0.8523,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.9313725490196079,
1077
+ "grad_norm": 0.08318918651152993,
1078
  "learning_rate": 1.429366296576623e-05,
1079
+ "loss": 0.8511,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.9375,
1084
+ "grad_norm": 0.07933377923909153,
1085
  "learning_rate": 1.1864866320203115e-05,
1086
+ "loss": 0.8479,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.9436274509803921,
1091
+ "grad_norm": 0.09008515851237198,
1092
  "learning_rate": 9.659617848510882e-06,
1093
+ "loss": 0.8449,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.9497549019607843,
1098
+ "grad_norm": 0.07787795748629618,
1099
  "learning_rate": 7.678927474447817e-06,
1100
+ "loss": 0.8446,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.9558823529411765,
1105
+ "grad_norm": 0.08423127476840589,
1106
  "learning_rate": 5.923702282314092e-06,
1107
+ "loss": 0.8466,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.9620098039215687,
1112
+ "grad_norm": 0.07739503343274702,
1113
  "learning_rate": 4.394746101540115e-06,
1114
+ "loss": 0.8423,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.9681372549019608,
1119
+ "grad_norm": 0.07932764141883414,
1120
  "learning_rate": 3.092759138561607e-06,
1121
+ "loss": 0.8405,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.9742647058823529,
1126
+ "grad_norm": 0.08063542360073593,
1127
  "learning_rate": 2.018337656150726e-06,
1128
+ "loss": 0.8461,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.9803921568627451,
1133
+ "grad_norm": 0.08993712709783745,
1134
  "learning_rate": 1.1719737003492159e-06,
1135
+ "loss": 0.8388,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.9865196078431373,
1140
+ "grad_norm": 0.0792762901452283,
1141
  "learning_rate": 5.540548751292173e-07,
1142
+ "loss": 0.8346,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.9926470588235294,
1147
+ "grad_norm": 0.09250878108386706,
1148
  "learning_rate": 1.6486416488459277e-07,
1149
+ "loss": 0.8478,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.9987745098039216,
1154
+ "grad_norm": 0.07786954435284818,
1155
  "learning_rate": 4.579804834703438e-09,
1156
+ "loss": 0.8401,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 1.0,
1161
+ "eval_loss": 1.202426552772522,
1162
+ "eval_runtime": 111.4035,
1163
+ "eval_samples_per_second": 187.974,
1164
+ "eval_steps_per_second": 5.88,
1165
  "step": 816
1166
  },
1167
  {
1168
  "epoch": 1.0,
1169
  "step": 816,
1170
  "total_flos": 80063181619200.0,
1171
+ "train_loss": 0.914715180794398,
1172
+ "train_runtime": 1874.4496,
1173
+ "train_samples_per_second": 55.714,
1174
+ "train_steps_per_second": 0.435
1175
  }
1176
  ],
1177
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12a9a33d258da7c5a8d74e680d5374bc12f9dafc4e54f34a88d1cc9259dfaa7c
3
  size 6456
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:850382808a98614a406a920bb7e54bab43a949c3aee2c03fb6a5de54b03f913e
3
  size 6456