cervisiarius committed on
Commit
724c037
·
verified ·
1 Parent(s): b966e85

Model save

Browse files
adapter_config.json CHANGED
@@ -21,12 +21,12 @@
21
  "revision": null,
22
  "target_modules": [
23
  "up_proj",
24
- "v_proj",
25
  "q_proj",
 
26
  "gate_proj",
27
  "k_proj",
28
- "down_proj",
29
- "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
21
  "revision": null,
22
  "target_modules": [
23
  "up_proj",
 
24
  "q_proj",
25
+ "v_proj",
26
  "gate_proj",
27
  "k_proj",
28
+ "o_proj",
29
+ "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72926929efb16b9742c5b0672503bc20540d9b6cc9f61396ffadde8cc9fd5943
3
  size 73911112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9439f84441ba4a34585d5554e5f06048fe4bc93effd918b82410f94c5bf6edc7
3
  size 73911112
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9991079393398751,
3
  "total_flos": 8.398274691072e+17,
4
- "train_loss": 0.0010137432387896947,
5
- "train_runtime": 24.1791,
6
  "train_samples": 16710,
7
- "train_samples_per_second": 370.899,
8
- "train_steps_per_second": 23.16
9
  }
 
1
  {
2
  "epoch": 0.9991079393398751,
3
  "total_flos": 8.398274691072e+17,
4
+ "train_loss": 0.5889353160347257,
5
+ "train_runtime": 8886.9999,
6
  "train_samples": 16710,
7
+ "train_samples_per_second": 1.009,
8
+ "train_steps_per_second": 0.063
9
  }
runs/Feb06_23-24-31_GCRAZGDL1601/events.out.tfevents.1738884276.GCRAZGDL1601.3098201.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63fbc7d88f143ec334dee35764121bf41e0a33f92c44c7407b43f6de224f6a2c
3
+ size 30021
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9991079393398751,
3
  "total_flos": 8.398274691072e+17,
4
- "train_loss": 0.0010137432387896947,
5
- "train_runtime": 24.1791,
6
  "train_samples": 16710,
7
- "train_samples_per_second": 370.899,
8
- "train_steps_per_second": 23.16
9
  }
 
1
  {
2
  "epoch": 0.9991079393398751,
3
  "total_flos": 8.398274691072e+17,
4
+ "train_loss": 0.5889353160347257,
5
+ "train_runtime": 8886.9999,
6
  "train_samples": 16710,
7
+ "train_samples_per_second": 1.009,
8
+ "train_steps_per_second": 0.063
9
  }
trainer_state.json CHANGED
@@ -9,797 +9,797 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.008936550491510277,
13
- "grad_norm": 0.04153493791818619,
14
  "learning_rate": 0.0002,
15
- "loss": 0.7499,
16
  "step": 5
17
  },
18
  {
19
- "epoch": 0.017873100983020553,
20
- "grad_norm": 0.044111333787441254,
21
  "learning_rate": 0.0002,
22
- "loss": 0.7161,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.02680965147453083,
27
- "grad_norm": 0.041741564869880676,
28
  "learning_rate": 0.0002,
29
- "loss": 0.6638,
30
  "step": 15
31
  },
32
  {
33
- "epoch": 0.035746201966041107,
34
- "grad_norm": 0.03860403597354889,
35
  "learning_rate": 0.0002,
36
- "loss": 0.6516,
37
  "step": 20
38
  },
39
  {
40
- "epoch": 0.044682752457551385,
41
- "grad_norm": 0.03756846487522125,
42
  "learning_rate": 0.0002,
43
- "loss": 0.6725,
44
  "step": 25
45
  },
46
  {
47
- "epoch": 0.05361930294906166,
48
- "grad_norm": 0.03411193564534187,
49
  "learning_rate": 0.0002,
50
- "loss": 0.624,
51
  "step": 30
52
  },
53
  {
54
- "epoch": 0.06255585344057193,
55
- "grad_norm": 0.029239550232887268,
56
  "learning_rate": 0.0002,
57
- "loss": 0.6482,
58
  "step": 35
59
  },
60
  {
61
- "epoch": 0.07149240393208221,
62
- "grad_norm": 0.026874635368585587,
63
  "learning_rate": 0.0002,
64
- "loss": 0.631,
65
  "step": 40
66
  },
67
  {
68
- "epoch": 0.08042895442359249,
69
- "grad_norm": 0.026550836861133575,
70
  "learning_rate": 0.0002,
71
- "loss": 0.613,
72
  "step": 45
73
  },
74
  {
75
- "epoch": 0.08936550491510277,
76
- "grad_norm": 0.025206631049513817,
77
  "learning_rate": 0.0002,
78
- "loss": 0.6205,
79
  "step": 50
80
  },
81
  {
82
- "epoch": 0.09830205540661305,
83
- "grad_norm": 0.025149798020720482,
84
  "learning_rate": 0.0002,
85
- "loss": 0.5954,
86
  "step": 55
87
  },
88
  {
89
- "epoch": 0.10723860589812333,
90
- "grad_norm": 0.02644510194659233,
91
  "learning_rate": 0.0002,
92
- "loss": 0.6106,
93
  "step": 60
94
  },
95
  {
96
- "epoch": 0.1161751563896336,
97
- "grad_norm": 0.02368175983428955,
98
  "learning_rate": 0.0002,
99
- "loss": 0.5899,
100
  "step": 65
101
  },
102
  {
103
- "epoch": 0.12511170688114387,
104
- "grad_norm": 0.025952663272619247,
105
  "learning_rate": 0.0002,
106
- "loss": 0.5864,
107
  "step": 70
108
  },
109
  {
110
- "epoch": 0.13404825737265416,
111
- "grad_norm": 0.02527940273284912,
112
  "learning_rate": 0.0002,
113
- "loss": 0.5888,
114
  "step": 75
115
  },
116
  {
117
- "epoch": 0.14298480786416443,
118
- "grad_norm": 0.0256633460521698,
119
  "learning_rate": 0.0002,
120
- "loss": 0.579,
121
  "step": 80
122
  },
123
  {
124
- "epoch": 0.15192135835567472,
125
- "grad_norm": 0.023388464003801346,
126
  "learning_rate": 0.0002,
127
- "loss": 0.5983,
128
  "step": 85
129
  },
130
  {
131
- "epoch": 0.16085790884718498,
132
- "grad_norm": 0.02358727529644966,
133
  "learning_rate": 0.0002,
134
- "loss": 0.5993,
135
  "step": 90
136
  },
137
  {
138
- "epoch": 0.16979445933869527,
139
- "grad_norm": 0.02532259002327919,
140
  "learning_rate": 0.0002,
141
- "loss": 0.5883,
142
  "step": 95
143
  },
144
  {
145
- "epoch": 0.17873100983020554,
146
- "grad_norm": 0.025463785976171494,
147
  "learning_rate": 0.0002,
148
- "loss": 0.5646,
149
  "step": 100
150
  },
151
  {
152
- "epoch": 0.1876675603217158,
153
- "grad_norm": 0.02428724244236946,
154
  "learning_rate": 0.0002,
155
- "loss": 0.5863,
156
  "step": 105
157
  },
158
  {
159
- "epoch": 0.1966041108132261,
160
- "grad_norm": 0.027085445821285248,
161
  "learning_rate": 0.0002,
162
- "loss": 0.5887,
163
  "step": 110
164
  },
165
  {
166
- "epoch": 0.20554066130473636,
167
- "grad_norm": 0.024989286437630653,
168
  "learning_rate": 0.0002,
169
- "loss": 0.6307,
170
  "step": 115
171
  },
172
  {
173
- "epoch": 0.21447721179624665,
174
- "grad_norm": 0.025627750903367996,
175
  "learning_rate": 0.0002,
176
- "loss": 0.619,
177
  "step": 120
178
  },
179
  {
180
- "epoch": 0.22341376228775692,
181
- "grad_norm": 0.028242159634828568,
182
  "learning_rate": 0.0002,
183
- "loss": 0.6093,
184
  "step": 125
185
  },
186
  {
187
- "epoch": 0.2323503127792672,
188
- "grad_norm": 0.02808305062353611,
189
  "learning_rate": 0.0002,
190
- "loss": 0.6145,
191
  "step": 130
192
  },
193
  {
194
- "epoch": 0.24128686327077747,
195
- "grad_norm": 0.024711593985557556,
196
  "learning_rate": 0.0002,
197
- "loss": 0.5533,
198
  "step": 135
199
  },
200
  {
201
- "epoch": 0.25022341376228774,
202
- "grad_norm": 0.026244837790727615,
203
  "learning_rate": 0.0002,
204
- "loss": 0.5927,
205
  "step": 140
206
  },
207
  {
208
- "epoch": 0.25915996425379806,
209
- "grad_norm": 0.02702728658914566,
210
  "learning_rate": 0.0002,
211
- "loss": 0.5809,
212
  "step": 145
213
  },
214
  {
215
- "epoch": 0.2680965147453083,
216
- "grad_norm": 0.026773197576403618,
217
  "learning_rate": 0.0002,
218
- "loss": 0.588,
219
  "step": 150
220
  },
221
  {
222
- "epoch": 0.2770330652368186,
223
- "grad_norm": 0.026652127504348755,
224
  "learning_rate": 0.0002,
225
- "loss": 0.6091,
226
  "step": 155
227
  },
228
  {
229
- "epoch": 0.28596961572832885,
230
- "grad_norm": 0.026826491579413414,
231
  "learning_rate": 0.0002,
232
- "loss": 0.5746,
233
  "step": 160
234
  },
235
  {
236
- "epoch": 0.2949061662198391,
237
- "grad_norm": 0.028087735176086426,
238
  "learning_rate": 0.0002,
239
- "loss": 0.5986,
240
  "step": 165
241
  },
242
  {
243
- "epoch": 0.30384271671134944,
244
- "grad_norm": 0.025982139632105827,
245
  "learning_rate": 0.0002,
246
- "loss": 0.5819,
247
  "step": 170
248
  },
249
  {
250
- "epoch": 0.3127792672028597,
251
- "grad_norm": 0.02771054208278656,
252
  "learning_rate": 0.0002,
253
- "loss": 0.5874,
254
  "step": 175
255
  },
256
  {
257
- "epoch": 0.32171581769436997,
258
- "grad_norm": 0.026281068101525307,
259
  "learning_rate": 0.0002,
260
- "loss": 0.5955,
261
  "step": 180
262
  },
263
  {
264
- "epoch": 0.33065236818588023,
265
- "grad_norm": 0.026669517159461975,
266
  "learning_rate": 0.0002,
267
- "loss": 0.5842,
268
  "step": 185
269
  },
270
  {
271
- "epoch": 0.33958891867739055,
272
- "grad_norm": 0.027214782312512398,
273
  "learning_rate": 0.0002,
274
- "loss": 0.578,
275
  "step": 190
276
  },
277
  {
278
- "epoch": 0.3485254691689008,
279
- "grad_norm": 0.0279616117477417,
280
  "learning_rate": 0.0002,
281
- "loss": 0.5993,
282
  "step": 195
283
  },
284
  {
285
- "epoch": 0.3574620196604111,
286
- "grad_norm": 0.026525571942329407,
287
  "learning_rate": 0.0002,
288
- "loss": 0.5988,
289
  "step": 200
290
  },
291
  {
292
- "epoch": 0.36639857015192134,
293
- "grad_norm": 0.026812193915247917,
294
  "learning_rate": 0.0002,
295
- "loss": 0.5818,
296
  "step": 205
297
  },
298
  {
299
- "epoch": 0.3753351206434316,
300
- "grad_norm": 0.02935432456433773,
301
  "learning_rate": 0.0002,
302
- "loss": 0.5942,
303
  "step": 210
304
  },
305
  {
306
- "epoch": 0.38427167113494193,
307
- "grad_norm": 0.02752542681992054,
308
  "learning_rate": 0.0002,
309
- "loss": 0.5781,
310
  "step": 215
311
  },
312
  {
313
- "epoch": 0.3932082216264522,
314
- "grad_norm": 0.02715650200843811,
315
  "learning_rate": 0.0002,
316
- "loss": 0.5539,
317
  "step": 220
318
  },
319
  {
320
- "epoch": 0.40214477211796246,
321
- "grad_norm": 0.027740860357880592,
322
  "learning_rate": 0.0002,
323
- "loss": 0.6005,
324
  "step": 225
325
  },
326
  {
327
- "epoch": 0.4110813226094727,
328
- "grad_norm": 0.02917667292058468,
329
  "learning_rate": 0.0002,
330
- "loss": 0.5858,
331
  "step": 230
332
  },
333
  {
334
- "epoch": 0.42001787310098304,
335
- "grad_norm": 0.029291415587067604,
336
  "learning_rate": 0.0002,
337
- "loss": 0.6052,
338
  "step": 235
339
  },
340
  {
341
- "epoch": 0.4289544235924933,
342
- "grad_norm": 0.028029408305883408,
343
  "learning_rate": 0.0002,
344
- "loss": 0.5843,
345
  "step": 240
346
  },
347
  {
348
- "epoch": 0.43789097408400357,
349
- "grad_norm": 0.027253130450844765,
350
  "learning_rate": 0.0002,
351
- "loss": 0.5733,
352
  "step": 245
353
  },
354
  {
355
- "epoch": 0.44682752457551383,
356
- "grad_norm": 0.030096998438239098,
357
  "learning_rate": 0.0002,
358
- "loss": 0.5659,
359
  "step": 250
360
  },
361
  {
362
- "epoch": 0.45576407506702415,
363
- "grad_norm": 0.027552833780646324,
364
  "learning_rate": 0.0002,
365
- "loss": 0.6113,
366
  "step": 255
367
  },
368
  {
369
- "epoch": 0.4647006255585344,
370
- "grad_norm": 0.027978690341114998,
371
  "learning_rate": 0.0002,
372
- "loss": 0.5865,
373
  "step": 260
374
  },
375
  {
376
- "epoch": 0.4736371760500447,
377
- "grad_norm": 0.02770094946026802,
378
  "learning_rate": 0.0002,
379
- "loss": 0.5483,
380
  "step": 265
381
  },
382
  {
383
- "epoch": 0.48257372654155495,
384
- "grad_norm": 0.029694920405745506,
385
  "learning_rate": 0.0002,
386
- "loss": 0.5818,
387
  "step": 270
388
  },
389
  {
390
- "epoch": 0.4915102770330652,
391
- "grad_norm": 0.026892486959695816,
392
  "learning_rate": 0.0002,
393
- "loss": 0.5943,
394
  "step": 275
395
  },
396
  {
397
- "epoch": 0.5004468275245755,
398
- "grad_norm": 0.02659418247640133,
399
  "learning_rate": 0.0002,
400
- "loss": 0.5975,
401
  "step": 280
402
  },
403
  {
404
- "epoch": 0.5093833780160858,
405
- "grad_norm": 0.02613973245024681,
406
  "learning_rate": 0.0002,
407
- "loss": 0.5666,
408
  "step": 285
409
  },
410
  {
411
- "epoch": 0.5183199285075961,
412
- "grad_norm": 0.028008636087179184,
413
  "learning_rate": 0.0002,
414
- "loss": 0.5762,
415
  "step": 290
416
  },
417
  {
418
- "epoch": 0.5272564789991063,
419
- "grad_norm": 0.026727279648184776,
420
  "learning_rate": 0.0002,
421
- "loss": 0.5652,
422
  "step": 295
423
  },
424
  {
425
- "epoch": 0.5361930294906166,
426
- "grad_norm": 0.027729446068406105,
427
  "learning_rate": 0.0002,
428
- "loss": 0.5605,
429
  "step": 300
430
  },
431
  {
432
- "epoch": 0.5451295799821269,
433
- "grad_norm": 0.02615601010620594,
434
  "learning_rate": 0.0002,
435
- "loss": 0.6004,
436
  "step": 305
437
  },
438
  {
439
- "epoch": 0.5540661304736372,
440
- "grad_norm": 0.02820262685418129,
441
  "learning_rate": 0.0002,
442
- "loss": 0.5562,
443
  "step": 310
444
  },
445
  {
446
- "epoch": 0.5630026809651475,
447
- "grad_norm": 0.028281336650252342,
448
  "learning_rate": 0.0002,
449
- "loss": 0.5782,
450
  "step": 315
451
  },
452
  {
453
- "epoch": 0.5719392314566577,
454
- "grad_norm": 0.02955157682299614,
455
  "learning_rate": 0.0002,
456
- "loss": 0.6086,
457
  "step": 320
458
  },
459
  {
460
- "epoch": 0.580875781948168,
461
- "grad_norm": 0.02593911811709404,
462
  "learning_rate": 0.0002,
463
- "loss": 0.5618,
464
  "step": 325
465
  },
466
  {
467
- "epoch": 0.5898123324396782,
468
- "grad_norm": 0.028371961787343025,
469
  "learning_rate": 0.0002,
470
- "loss": 0.5747,
471
  "step": 330
472
  },
473
  {
474
- "epoch": 0.5987488829311886,
475
- "grad_norm": 0.027989625930786133,
476
  "learning_rate": 0.0002,
477
- "loss": 0.5736,
478
  "step": 335
479
  },
480
  {
481
- "epoch": 0.6076854334226989,
482
- "grad_norm": 0.02815859578549862,
483
  "learning_rate": 0.0002,
484
- "loss": 0.5666,
485
  "step": 340
486
  },
487
  {
488
- "epoch": 0.6166219839142091,
489
- "grad_norm": 0.02636733092367649,
490
  "learning_rate": 0.0002,
491
- "loss": 0.5774,
492
  "step": 345
493
  },
494
  {
495
- "epoch": 0.6255585344057194,
496
- "grad_norm": 0.028307748958468437,
497
  "learning_rate": 0.0002,
498
- "loss": 0.5783,
499
  "step": 350
500
  },
501
  {
502
- "epoch": 0.6344950848972297,
503
- "grad_norm": 0.028890695422887802,
504
  "learning_rate": 0.0002,
505
- "loss": 0.5796,
506
  "step": 355
507
  },
508
  {
509
- "epoch": 0.6434316353887399,
510
- "grad_norm": 0.028257351368665695,
511
  "learning_rate": 0.0002,
512
- "loss": 0.5646,
513
  "step": 360
514
  },
515
  {
516
- "epoch": 0.6523681858802503,
517
- "grad_norm": 0.02682431973516941,
518
  "learning_rate": 0.0002,
519
- "loss": 0.5507,
520
  "step": 365
521
  },
522
  {
523
- "epoch": 0.6613047363717605,
524
- "grad_norm": 0.027308348566293716,
525
  "learning_rate": 0.0002,
526
- "loss": 0.6145,
527
  "step": 370
528
  },
529
  {
530
- "epoch": 0.6702412868632708,
531
- "grad_norm": 0.027066020295023918,
532
  "learning_rate": 0.0002,
533
- "loss": 0.5708,
534
  "step": 375
535
  },
536
  {
537
- "epoch": 0.6791778373547811,
538
- "grad_norm": 0.02646820992231369,
539
  "learning_rate": 0.0002,
540
- "loss": 0.5466,
541
  "step": 380
542
  },
543
  {
544
- "epoch": 0.6881143878462913,
545
- "grad_norm": 0.026627352461218834,
546
  "learning_rate": 0.0002,
547
- "loss": 0.5735,
548
  "step": 385
549
  },
550
  {
551
- "epoch": 0.6970509383378016,
552
- "grad_norm": 0.02751215733587742,
553
  "learning_rate": 0.0002,
554
- "loss": 0.5708,
555
  "step": 390
556
  },
557
  {
558
- "epoch": 0.7059874888293118,
559
- "grad_norm": 0.029046354815363884,
560
  "learning_rate": 0.0002,
561
- "loss": 0.5701,
562
  "step": 395
563
  },
564
  {
565
- "epoch": 0.7149240393208222,
566
- "grad_norm": 0.028309453278779984,
567
  "learning_rate": 0.0002,
568
- "loss": 0.5824,
569
  "step": 400
570
  },
571
  {
572
- "epoch": 0.7238605898123325,
573
- "grad_norm": 0.027017708867788315,
574
  "learning_rate": 0.0002,
575
- "loss": 0.5672,
576
  "step": 405
577
  },
578
  {
579
- "epoch": 0.7327971403038427,
580
- "grad_norm": 0.02751619555056095,
581
  "learning_rate": 0.0002,
582
- "loss": 0.583,
583
  "step": 410
584
  },
585
  {
586
- "epoch": 0.741733690795353,
587
- "grad_norm": 0.029170291498303413,
588
  "learning_rate": 0.0002,
589
- "loss": 0.5923,
590
  "step": 415
591
  },
592
  {
593
- "epoch": 0.7506702412868632,
594
- "grad_norm": 0.02801818959414959,
595
  "learning_rate": 0.0002,
596
- "loss": 0.5912,
597
  "step": 420
598
  },
599
  {
600
- "epoch": 0.7596067917783735,
601
- "grad_norm": 0.029242202639579773,
602
  "learning_rate": 0.0002,
603
- "loss": 0.5766,
604
  "step": 425
605
  },
606
  {
607
- "epoch": 0.7685433422698839,
608
- "grad_norm": 0.02919500134885311,
609
  "learning_rate": 0.0002,
610
- "loss": 0.5938,
611
  "step": 430
612
  },
613
  {
614
- "epoch": 0.7774798927613941,
615
- "grad_norm": 0.028539441525936127,
616
  "learning_rate": 0.0002,
617
- "loss": 0.5841,
618
  "step": 435
619
  },
620
  {
621
- "epoch": 0.7864164432529044,
622
- "grad_norm": 0.0284650269895792,
623
  "learning_rate": 0.0002,
624
- "loss": 0.5939,
625
  "step": 440
626
  },
627
  {
628
- "epoch": 0.7953529937444147,
629
- "grad_norm": 0.027977267280220985,
630
  "learning_rate": 0.0002,
631
- "loss": 0.6035,
632
  "step": 445
633
  },
634
  {
635
- "epoch": 0.8042895442359249,
636
- "grad_norm": 0.026608245447278023,
637
  "learning_rate": 0.0002,
638
- "loss": 0.586,
639
  "step": 450
640
  },
641
  {
642
- "epoch": 0.8132260947274352,
643
- "grad_norm": 0.02660188265144825,
644
  "learning_rate": 0.0002,
645
- "loss": 0.5757,
646
  "step": 455
647
  },
648
  {
649
- "epoch": 0.8221626452189454,
650
- "grad_norm": 0.026315640658140182,
651
  "learning_rate": 0.0002,
652
- "loss": 0.5571,
653
  "step": 460
654
  },
655
  {
656
- "epoch": 0.8310991957104558,
657
- "grad_norm": 0.027830056846141815,
658
  "learning_rate": 0.0002,
659
- "loss": 0.5623,
660
  "step": 465
661
  },
662
  {
663
- "epoch": 0.8400357462019661,
664
- "grad_norm": 0.027287248522043228,
665
  "learning_rate": 0.0002,
666
- "loss": 0.5808,
667
  "step": 470
668
  },
669
  {
670
- "epoch": 0.8489722966934763,
671
- "grad_norm": 0.025539880618453026,
672
  "learning_rate": 0.0002,
673
- "loss": 0.556,
674
  "step": 475
675
  },
676
  {
677
- "epoch": 0.8579088471849866,
678
- "grad_norm": 0.02839650772511959,
679
  "learning_rate": 0.0002,
680
- "loss": 0.574,
681
  "step": 480
682
  },
683
  {
684
- "epoch": 0.8668453976764968,
685
- "grad_norm": 0.026629634201526642,
686
  "learning_rate": 0.0002,
687
- "loss": 0.5902,
688
  "step": 485
689
  },
690
  {
691
- "epoch": 0.8757819481680071,
692
- "grad_norm": 0.02764849364757538,
693
  "learning_rate": 0.0002,
694
- "loss": 0.5661,
695
  "step": 490
696
  },
697
  {
698
- "epoch": 0.8847184986595175,
699
- "grad_norm": 0.02718566171824932,
700
  "learning_rate": 0.0002,
701
- "loss": 0.5671,
702
  "step": 495
703
  },
704
  {
705
- "epoch": 0.8936550491510277,
706
- "grad_norm": 0.031797122210264206,
707
  "learning_rate": 0.0002,
708
- "loss": 0.5862,
709
  "step": 500
710
  },
711
  {
712
- "epoch": 0.902591599642538,
713
- "grad_norm": 0.027700966224074364,
714
  "learning_rate": 0.0002,
715
- "loss": 0.5892,
716
  "step": 505
717
  },
718
  {
719
- "epoch": 0.9115281501340483,
720
- "grad_norm": 0.029541322961449623,
721
  "learning_rate": 0.0002,
722
- "loss": 0.5934,
723
  "step": 510
724
  },
725
  {
726
- "epoch": 0.9204647006255585,
727
- "grad_norm": 0.027446402236819267,
728
  "learning_rate": 0.0002,
729
- "loss": 0.5818,
730
  "step": 515
731
  },
732
  {
733
- "epoch": 0.9294012511170688,
734
- "grad_norm": 0.029558710753917694,
735
  "learning_rate": 0.0002,
736
- "loss": 0.5659,
737
  "step": 520
738
  },
739
  {
740
- "epoch": 0.938337801608579,
741
- "grad_norm": 0.028251904994249344,
742
  "learning_rate": 0.0002,
743
- "loss": 0.5686,
744
  "step": 525
745
  },
746
  {
747
- "epoch": 0.9472743521000894,
748
- "grad_norm": 0.027048900723457336,
749
  "learning_rate": 0.0002,
750
- "loss": 0.6004,
751
  "step": 530
752
  },
753
  {
754
- "epoch": 0.9562109025915997,
755
- "grad_norm": 0.03018295019865036,
756
  "learning_rate": 0.0002,
757
- "loss": 0.5914,
758
  "step": 535
759
  },
760
  {
761
- "epoch": 0.9651474530831099,
762
- "grad_norm": 0.027683330699801445,
763
  "learning_rate": 0.0002,
764
- "loss": 0.559,
765
  "step": 540
766
  },
767
  {
768
- "epoch": 0.9740840035746202,
769
- "grad_norm": 0.027257127687335014,
770
  "learning_rate": 0.0002,
771
- "loss": 0.5783,
772
  "step": 545
773
  },
774
  {
775
- "epoch": 0.9830205540661304,
776
- "grad_norm": 0.028040431439876556,
777
  "learning_rate": 0.0002,
778
- "loss": 0.5754,
779
  "step": 550
780
  },
781
  {
782
- "epoch": 0.9919571045576407,
783
- "grad_norm": 0.02839042991399765,
784
  "learning_rate": 0.0002,
785
- "loss": 0.5578,
786
  "step": 555
787
  },
788
  {
789
  "epoch": 0.9991079393398751,
790
- "grad_norm": 0.06720411032438278,
791
  "learning_rate": 0.0002,
792
- "loss": 0.5677,
793
  "step": 560
794
  },
795
  {
796
  "epoch": 0.9991079393398751,
797
  "step": 560,
798
  "total_flos": 8.398274691072e+17,
799
- "train_loss": 0.0010137432387896947,
800
- "train_runtime": 24.1791,
801
- "train_samples_per_second": 370.899,
802
- "train_steps_per_second": 23.16
803
  }
804
  ],
805
  "logging_steps": 5,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.008920606601248885,
13
+ "grad_norm": 0.04344707727432251,
14
  "learning_rate": 0.0002,
15
+ "loss": 0.7458,
16
  "step": 5
17
  },
18
  {
19
+ "epoch": 0.01784121320249777,
20
+ "grad_norm": 0.041871875524520874,
21
  "learning_rate": 0.0002,
22
+ "loss": 0.71,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.026761819803746655,
27
+ "grad_norm": 0.041962090879678726,
28
  "learning_rate": 0.0002,
29
+ "loss": 0.672,
30
  "step": 15
31
  },
32
  {
33
+ "epoch": 0.03568242640499554,
34
+ "grad_norm": 0.03574032336473465,
35
  "learning_rate": 0.0002,
36
+ "loss": 0.686,
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.04460303300624442,
41
+ "grad_norm": 0.03700491040945053,
42
  "learning_rate": 0.0002,
43
+ "loss": 0.6515,
44
  "step": 25
45
  },
46
  {
47
+ "epoch": 0.05352363960749331,
48
+ "grad_norm": 0.029821349307894707,
49
  "learning_rate": 0.0002,
50
+ "loss": 0.6247,
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.06244424620874219,
55
+ "grad_norm": 0.027489786967635155,
56
  "learning_rate": 0.0002,
57
+ "loss": 0.6248,
58
  "step": 35
59
  },
60
  {
61
+ "epoch": 0.07136485280999108,
62
+ "grad_norm": 0.026733923703432083,
63
  "learning_rate": 0.0002,
64
+ "loss": 0.626,
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.08028545941123996,
69
+ "grad_norm": 0.024855654686689377,
70
  "learning_rate": 0.0002,
71
+ "loss": 0.6141,
72
  "step": 45
73
  },
74
  {
75
+ "epoch": 0.08920606601248884,
76
+ "grad_norm": 0.024668758735060692,
77
  "learning_rate": 0.0002,
78
+ "loss": 0.5882,
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 0.09812667261373774,
83
+ "grad_norm": 0.024284500628709793,
84
  "learning_rate": 0.0002,
85
+ "loss": 0.599,
86
  "step": 55
87
  },
88
  {
89
+ "epoch": 0.10704727921498662,
90
+ "grad_norm": 0.026213666424155235,
91
  "learning_rate": 0.0002,
92
+ "loss": 0.6458,
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.1159678858162355,
97
+ "grad_norm": 0.025072986260056496,
98
  "learning_rate": 0.0002,
99
+ "loss": 0.5974,
100
  "step": 65
101
  },
102
  {
103
+ "epoch": 0.12488849241748438,
104
+ "grad_norm": 0.023400137200951576,
105
  "learning_rate": 0.0002,
106
+ "loss": 0.5535,
107
  "step": 70
108
  },
109
  {
110
+ "epoch": 0.13380909901873328,
111
+ "grad_norm": 0.025392651557922363,
112
  "learning_rate": 0.0002,
113
+ "loss": 0.5756,
114
  "step": 75
115
  },
116
  {
117
+ "epoch": 0.14272970561998216,
118
+ "grad_norm": 0.023731403052806854,
119
  "learning_rate": 0.0002,
120
+ "loss": 0.5895,
121
  "step": 80
122
  },
123
  {
124
+ "epoch": 0.15165031222123104,
125
+ "grad_norm": 0.02360088750720024,
126
  "learning_rate": 0.0002,
127
+ "loss": 0.5869,
128
  "step": 85
129
  },
130
  {
131
+ "epoch": 0.16057091882247992,
132
+ "grad_norm": 0.02655966207385063,
133
  "learning_rate": 0.0002,
134
+ "loss": 0.5851,
135
  "step": 90
136
  },
137
  {
138
+ "epoch": 0.1694915254237288,
139
+ "grad_norm": 0.02600923739373684,
140
  "learning_rate": 0.0002,
141
+ "loss": 0.609,
142
  "step": 95
143
  },
144
  {
145
+ "epoch": 0.1784121320249777,
146
+ "grad_norm": 0.025682499632239342,
147
  "learning_rate": 0.0002,
148
+ "loss": 0.5885,
149
  "step": 100
150
  },
151
  {
152
+ "epoch": 0.1873327386262266,
153
+ "grad_norm": 0.025844326242804527,
154
  "learning_rate": 0.0002,
155
+ "loss": 0.5997,
156
  "step": 105
157
  },
158
  {
159
+ "epoch": 0.19625334522747548,
160
+ "grad_norm": 0.027400100603699684,
161
  "learning_rate": 0.0002,
162
+ "loss": 0.609,
163
  "step": 110
164
  },
165
  {
166
+ "epoch": 0.20517395182872436,
167
+ "grad_norm": 0.025671344250440598,
168
  "learning_rate": 0.0002,
169
+ "loss": 0.5678,
170
  "step": 115
171
  },
172
  {
173
+ "epoch": 0.21409455842997324,
174
+ "grad_norm": 0.02872069925069809,
175
  "learning_rate": 0.0002,
176
+ "loss": 0.6052,
177
  "step": 120
178
  },
179
  {
180
+ "epoch": 0.22301516503122212,
181
+ "grad_norm": 0.026377148926258087,
182
  "learning_rate": 0.0002,
183
+ "loss": 0.5961,
184
  "step": 125
185
  },
186
  {
187
+ "epoch": 0.231935771632471,
188
+ "grad_norm": 0.027191588655114174,
189
  "learning_rate": 0.0002,
190
+ "loss": 0.5903,
191
  "step": 130
192
  },
193
  {
194
+ "epoch": 0.2408563782337199,
195
+ "grad_norm": 0.030450984835624695,
196
  "learning_rate": 0.0002,
197
+ "loss": 0.5703,
198
  "step": 135
199
  },
200
  {
201
+ "epoch": 0.24977698483496877,
202
+ "grad_norm": 0.02679985947906971,
203
  "learning_rate": 0.0002,
204
+ "loss": 0.5805,
205
  "step": 140
206
  },
207
  {
208
+ "epoch": 0.2586975914362177,
209
+ "grad_norm": 0.02805212326347828,
210
  "learning_rate": 0.0002,
211
+ "loss": 0.5867,
212
  "step": 145
213
  },
214
  {
215
+ "epoch": 0.26761819803746656,
216
+ "grad_norm": 0.027465296909213066,
217
  "learning_rate": 0.0002,
218
+ "loss": 0.599,
219
  "step": 150
220
  },
221
  {
222
+ "epoch": 0.27653880463871544,
223
+ "grad_norm": 0.027630291879177094,
224
  "learning_rate": 0.0002,
225
+ "loss": 0.5894,
226
  "step": 155
227
  },
228
  {
229
+ "epoch": 0.2854594112399643,
230
+ "grad_norm": 0.027628762647509575,
231
  "learning_rate": 0.0002,
232
+ "loss": 0.5984,
233
  "step": 160
234
  },
235
  {
236
+ "epoch": 0.2943800178412132,
237
+ "grad_norm": 0.02642473392188549,
238
  "learning_rate": 0.0002,
239
+ "loss": 0.5966,
240
  "step": 165
241
  },
242
  {
243
+ "epoch": 0.3033006244424621,
244
+ "grad_norm": 0.027726992964744568,
245
  "learning_rate": 0.0002,
246
+ "loss": 0.5954,
247
  "step": 170
248
  },
249
  {
250
+ "epoch": 0.31222123104371097,
251
+ "grad_norm": 0.027844909578561783,
252
  "learning_rate": 0.0002,
253
+ "loss": 0.5932,
254
  "step": 175
255
  },
256
  {
257
+ "epoch": 0.32114183764495985,
258
+ "grad_norm": 0.024765541777014732,
259
  "learning_rate": 0.0002,
260
+ "loss": 0.5608,
261
  "step": 180
262
  },
263
  {
264
+ "epoch": 0.33006244424620873,
265
+ "grad_norm": 0.02770071104168892,
266
  "learning_rate": 0.0002,
267
+ "loss": 0.6121,
268
  "step": 185
269
  },
270
  {
271
+ "epoch": 0.3389830508474576,
272
+ "grad_norm": 0.02800634689629078,
273
  "learning_rate": 0.0002,
274
+ "loss": 0.5964,
275
  "step": 190
276
  },
277
  {
278
+ "epoch": 0.3479036574487065,
279
+ "grad_norm": 0.028501659631729126,
280
  "learning_rate": 0.0002,
281
+ "loss": 0.5757,
282
  "step": 195
283
  },
284
  {
285
+ "epoch": 0.3568242640499554,
286
+ "grad_norm": 0.026436127722263336,
287
  "learning_rate": 0.0002,
288
+ "loss": 0.5689,
289
  "step": 200
290
  },
291
  {
292
+ "epoch": 0.36574487065120426,
293
+ "grad_norm": 0.027517864480614662,
294
  "learning_rate": 0.0002,
295
+ "loss": 0.5999,
296
  "step": 205
297
  },
298
  {
299
+ "epoch": 0.3746654772524532,
300
+ "grad_norm": 0.026851925998926163,
301
  "learning_rate": 0.0002,
302
+ "loss": 0.5771,
303
  "step": 210
304
  },
305
  {
306
+ "epoch": 0.3835860838537021,
307
+ "grad_norm": 0.027687210589647293,
308
  "learning_rate": 0.0002,
309
+ "loss": 0.5627,
310
  "step": 215
311
  },
312
  {
313
+ "epoch": 0.39250669045495096,
314
+ "grad_norm": 0.030687233433127403,
315
  "learning_rate": 0.0002,
316
+ "loss": 0.6206,
317
  "step": 220
318
  },
319
  {
320
+ "epoch": 0.40142729705619984,
321
+ "grad_norm": 0.028457796201109886,
322
  "learning_rate": 0.0002,
323
+ "loss": 0.5753,
324
  "step": 225
325
  },
326
  {
327
+ "epoch": 0.4103479036574487,
328
+ "grad_norm": 0.028889574110507965,
329
  "learning_rate": 0.0002,
330
+ "loss": 0.5739,
331
  "step": 230
332
  },
333
  {
334
+ "epoch": 0.4192685102586976,
335
+ "grad_norm": 0.028494860976934433,
336
  "learning_rate": 0.0002,
337
+ "loss": 0.5895,
338
  "step": 235
339
  },
340
  {
341
+ "epoch": 0.4281891168599465,
342
+ "grad_norm": 0.028224695473909378,
343
  "learning_rate": 0.0002,
344
+ "loss": 0.6016,
345
  "step": 240
346
  },
347
  {
348
+ "epoch": 0.43710972346119537,
349
+ "grad_norm": 0.027433231472969055,
350
  "learning_rate": 0.0002,
351
+ "loss": 0.5759,
352
  "step": 245
353
  },
354
  {
355
+ "epoch": 0.44603033006244425,
356
+ "grad_norm": 0.02770438976585865,
357
  "learning_rate": 0.0002,
358
+ "loss": 0.5963,
359
  "step": 250
360
  },
361
  {
362
+ "epoch": 0.45495093666369313,
363
+ "grad_norm": 0.030380843207240105,
364
  "learning_rate": 0.0002,
365
+ "loss": 0.6346,
366
  "step": 255
367
  },
368
  {
369
+ "epoch": 0.463871543264942,
370
+ "grad_norm": 0.028422418981790543,
371
  "learning_rate": 0.0002,
372
+ "loss": 0.6088,
373
  "step": 260
374
  },
375
  {
376
+ "epoch": 0.4727921498661909,
377
+ "grad_norm": 0.02833402529358864,
378
  "learning_rate": 0.0002,
379
+ "loss": 0.5672,
380
  "step": 265
381
  },
382
  {
383
+ "epoch": 0.4817127564674398,
384
+ "grad_norm": 0.0281459279358387,
385
  "learning_rate": 0.0002,
386
+ "loss": 0.5875,
387
  "step": 270
388
  },
389
  {
390
+ "epoch": 0.49063336306868865,
391
+ "grad_norm": 0.02902720309793949,
392
  "learning_rate": 0.0002,
393
+ "loss": 0.578,
394
  "step": 275
395
  },
396
  {
397
+ "epoch": 0.49955396966993754,
398
+ "grad_norm": 0.02845628187060356,
399
  "learning_rate": 0.0002,
400
+ "loss": 0.5852,
401
  "step": 280
402
  },
403
  {
404
+ "epoch": 0.5084745762711864,
405
+ "grad_norm": 0.02827693149447441,
406
  "learning_rate": 0.0002,
407
+ "loss": 0.5651,
408
  "step": 285
409
  },
410
  {
411
+ "epoch": 0.5173951828724354,
412
+ "grad_norm": 0.026872893795371056,
413
  "learning_rate": 0.0002,
414
+ "loss": 0.5847,
415
  "step": 290
416
  },
417
  {
418
+ "epoch": 0.5263157894736842,
419
+ "grad_norm": 0.02863125689327717,
420
  "learning_rate": 0.0002,
421
+ "loss": 0.569,
422
  "step": 295
423
  },
424
  {
425
+ "epoch": 0.5352363960749331,
426
+ "grad_norm": 0.02849287912249565,
427
  "learning_rate": 0.0002,
428
+ "loss": 0.5904,
429
  "step": 300
430
  },
431
  {
432
+ "epoch": 0.544157002676182,
433
+ "grad_norm": 0.029510285705327988,
434
  "learning_rate": 0.0002,
435
+ "loss": 0.5743,
436
  "step": 305
437
  },
438
  {
439
+ "epoch": 0.5530776092774309,
440
+ "grad_norm": 0.029404086992144585,
441
  "learning_rate": 0.0002,
442
+ "loss": 0.5897,
443
  "step": 310
444
  },
445
  {
446
+ "epoch": 0.5619982158786797,
447
+ "grad_norm": 0.028384504839777946,
448
  "learning_rate": 0.0002,
449
+ "loss": 0.5928,
450
  "step": 315
451
  },
452
  {
453
+ "epoch": 0.5709188224799286,
454
+ "grad_norm": 0.029003608971834183,
455
  "learning_rate": 0.0002,
456
+ "loss": 0.5758,
457
  "step": 320
458
  },
459
  {
460
+ "epoch": 0.5798394290811775,
461
+ "grad_norm": 0.026150822639465332,
462
  "learning_rate": 0.0002,
463
+ "loss": 0.5688,
464
  "step": 325
465
  },
466
  {
467
+ "epoch": 0.5887600356824264,
468
+ "grad_norm": 0.027160905301570892,
469
  "learning_rate": 0.0002,
470
+ "loss": 0.5783,
471
  "step": 330
472
  },
473
  {
474
+ "epoch": 0.5976806422836753,
475
+ "grad_norm": 0.026426801458001137,
476
  "learning_rate": 0.0002,
477
+ "loss": 0.5605,
478
  "step": 335
479
  },
480
  {
481
+ "epoch": 0.6066012488849242,
482
+ "grad_norm": 0.029923705384135246,
483
  "learning_rate": 0.0002,
484
+ "loss": 0.5901,
485
  "step": 340
486
  },
487
  {
488
+ "epoch": 0.6155218554861731,
489
+ "grad_norm": 0.02743326872587204,
490
  "learning_rate": 0.0002,
491
+ "loss": 0.5964,
492
  "step": 345
493
  },
494
  {
495
+ "epoch": 0.6244424620874219,
496
+ "grad_norm": 0.0284026637673378,
497
  "learning_rate": 0.0002,
498
+ "loss": 0.5874,
499
  "step": 350
500
  },
501
  {
502
+ "epoch": 0.6333630686886709,
503
+ "grad_norm": 0.030260000377893448,
504
  "learning_rate": 0.0002,
505
+ "loss": 0.5831,
506
  "step": 355
507
  },
508
  {
509
+ "epoch": 0.6422836752899197,
510
+ "grad_norm": 0.02586439996957779,
511
  "learning_rate": 0.0002,
512
+ "loss": 0.5541,
513
  "step": 360
514
  },
515
  {
516
+ "epoch": 0.6512042818911686,
517
+ "grad_norm": 0.026640642434358597,
518
  "learning_rate": 0.0002,
519
+ "loss": 0.5669,
520
  "step": 365
521
  },
522
  {
523
+ "epoch": 0.6601248884924175,
524
+ "grad_norm": 0.028971482068300247,
525
  "learning_rate": 0.0002,
526
+ "loss": 0.5952,
527
  "step": 370
528
  },
529
  {
530
+ "epoch": 0.6690454950936664,
531
+ "grad_norm": 0.027096880599856377,
532
  "learning_rate": 0.0002,
533
+ "loss": 0.5606,
534
  "step": 375
535
  },
536
  {
537
+ "epoch": 0.6779661016949152,
538
+ "grad_norm": 0.11753229796886444,
539
  "learning_rate": 0.0002,
540
+ "loss": 0.5667,
541
  "step": 380
542
  },
543
  {
544
+ "epoch": 0.6868867082961642,
545
+ "grad_norm": 0.029016384854912758,
546
  "learning_rate": 0.0002,
547
+ "loss": 0.5819,
548
  "step": 385
549
  },
550
  {
551
+ "epoch": 0.695807314897413,
552
+ "grad_norm": 0.03013915754854679,
553
  "learning_rate": 0.0002,
554
+ "loss": 0.5756,
555
  "step": 390
556
  },
557
  {
558
+ "epoch": 0.7047279214986619,
559
+ "grad_norm": 0.030191004276275635,
560
  "learning_rate": 0.0002,
561
+ "loss": 0.5714,
562
  "step": 395
563
  },
564
  {
565
+ "epoch": 0.7136485280999108,
566
+ "grad_norm": 0.02674183063209057,
567
  "learning_rate": 0.0002,
568
+ "loss": 0.5867,
569
  "step": 400
570
  },
571
  {
572
+ "epoch": 0.7225691347011597,
573
+ "grad_norm": 0.02824782207608223,
574
  "learning_rate": 0.0002,
575
+ "loss": 0.587,
576
  "step": 405
577
  },
578
  {
579
+ "epoch": 0.7314897413024085,
580
+ "grad_norm": 0.027175093069672585,
581
  "learning_rate": 0.0002,
582
+ "loss": 0.5883,
583
  "step": 410
584
  },
585
  {
586
+ "epoch": 0.7404103479036575,
587
+ "grad_norm": 0.028087187558412552,
588
  "learning_rate": 0.0002,
589
+ "loss": 0.5734,
590
  "step": 415
591
  },
592
  {
593
+ "epoch": 0.7493309545049064,
594
+ "grad_norm": 0.028495660051703453,
595
  "learning_rate": 0.0002,
596
+ "loss": 0.5499,
597
  "step": 420
598
  },
599
  {
600
+ "epoch": 0.7582515611061552,
601
+ "grad_norm": 0.029332948848605156,
602
  "learning_rate": 0.0002,
603
+ "loss": 0.6101,
604
  "step": 425
605
  },
606
  {
607
+ "epoch": 0.7671721677074042,
608
+ "grad_norm": 0.02667965553700924,
609
  "learning_rate": 0.0002,
610
+ "loss": 0.5652,
611
  "step": 430
612
  },
613
  {
614
+ "epoch": 0.776092774308653,
615
+ "grad_norm": 0.0268410611897707,
616
  "learning_rate": 0.0002,
617
+ "loss": 0.5584,
618
  "step": 435
619
  },
620
  {
621
+ "epoch": 0.7850133809099019,
622
+ "grad_norm": 0.027814755216240883,
623
  "learning_rate": 0.0002,
624
+ "loss": 0.5724,
625
  "step": 440
626
  },
627
  {
628
+ "epoch": 0.7939339875111507,
629
+ "grad_norm": 0.02988579496741295,
630
  "learning_rate": 0.0002,
631
+ "loss": 0.57,
632
  "step": 445
633
  },
634
  {
635
+ "epoch": 0.8028545941123997,
636
+ "grad_norm": 0.028960440307855606,
637
  "learning_rate": 0.0002,
638
+ "loss": 0.5484,
639
  "step": 450
640
  },
641
  {
642
+ "epoch": 0.8117752007136485,
643
+ "grad_norm": 0.028208531439304352,
644
  "learning_rate": 0.0002,
645
+ "loss": 0.5875,
646
  "step": 455
647
  },
648
  {
649
+ "epoch": 0.8206958073148974,
650
+ "grad_norm": 0.025945566594600677,
651
  "learning_rate": 0.0002,
652
+ "loss": 0.5696,
653
  "step": 460
654
  },
655
  {
656
+ "epoch": 0.8296164139161463,
657
+ "grad_norm": 0.028697073459625244,
658
  "learning_rate": 0.0002,
659
+ "loss": 0.5703,
660
  "step": 465
661
  },
662
  {
663
+ "epoch": 0.8385370205173952,
664
+ "grad_norm": 0.028819743543863297,
665
  "learning_rate": 0.0002,
666
+ "loss": 0.5814,
667
  "step": 470
668
  },
669
  {
670
+ "epoch": 0.847457627118644,
671
+ "grad_norm": 0.02928623929619789,
672
  "learning_rate": 0.0002,
673
+ "loss": 0.5738,
674
  "step": 475
675
  },
676
  {
677
+ "epoch": 0.856378233719893,
678
+ "grad_norm": 0.027526717633008957,
679
  "learning_rate": 0.0002,
680
+ "loss": 0.5678,
681
  "step": 480
682
  },
683
  {
684
+ "epoch": 0.8652988403211418,
685
+ "grad_norm": 0.02858017198741436,
686
  "learning_rate": 0.0002,
687
+ "loss": 0.5815,
688
  "step": 485
689
  },
690
  {
691
+ "epoch": 0.8742194469223907,
692
+ "grad_norm": 0.028315911069512367,
693
  "learning_rate": 0.0002,
694
+ "loss": 0.5928,
695
  "step": 490
696
  },
697
  {
698
+ "epoch": 0.8831400535236396,
699
+ "grad_norm": 0.029983386397361755,
700
  "learning_rate": 0.0002,
701
+ "loss": 0.6047,
702
  "step": 495
703
  },
704
  {
705
+ "epoch": 0.8920606601248885,
706
+ "grad_norm": 0.028466004878282547,
707
  "learning_rate": 0.0002,
708
+ "loss": 0.5918,
709
  "step": 500
710
  },
711
  {
712
+ "epoch": 0.9009812667261374,
713
+ "grad_norm": 0.02696722000837326,
714
  "learning_rate": 0.0002,
715
+ "loss": 0.5856,
716
  "step": 505
717
  },
718
  {
719
+ "epoch": 0.9099018733273863,
720
+ "grad_norm": 0.029618097469210625,
721
  "learning_rate": 0.0002,
722
+ "loss": 0.5617,
723
  "step": 510
724
  },
725
  {
726
+ "epoch": 0.9188224799286352,
727
+ "grad_norm": 0.02784411609172821,
728
  "learning_rate": 0.0002,
729
+ "loss": 0.553,
730
  "step": 515
731
  },
732
  {
733
+ "epoch": 0.927743086529884,
734
+ "grad_norm": 0.02615758404135704,
735
  "learning_rate": 0.0002,
736
+ "loss": 0.5555,
737
  "step": 520
738
  },
739
  {
740
+ "epoch": 0.936663693131133,
741
+ "grad_norm": 0.028149690479040146,
742
  "learning_rate": 0.0002,
743
+ "loss": 0.5723,
744
  "step": 525
745
  },
746
  {
747
+ "epoch": 0.9455842997323818,
748
+ "grad_norm": 0.026176296174526215,
749
  "learning_rate": 0.0002,
750
+ "loss": 0.5785,
751
  "step": 530
752
  },
753
  {
754
+ "epoch": 0.9545049063336307,
755
+ "grad_norm": 0.02792450040578842,
756
  "learning_rate": 0.0002,
757
+ "loss": 0.5871,
758
  "step": 535
759
  },
760
  {
761
+ "epoch": 0.9634255129348795,
762
+ "grad_norm": 0.027666164562106133,
763
  "learning_rate": 0.0002,
764
+ "loss": 0.5544,
765
  "step": 540
766
  },
767
  {
768
+ "epoch": 0.9723461195361285,
769
+ "grad_norm": 0.027818914502859116,
770
  "learning_rate": 0.0002,
771
+ "loss": 0.5716,
772
  "step": 545
773
  },
774
  {
775
+ "epoch": 0.9812667261373773,
776
+ "grad_norm": 0.028460504487156868,
777
  "learning_rate": 0.0002,
778
+ "loss": 0.5825,
779
  "step": 550
780
  },
781
  {
782
+ "epoch": 0.9901873327386262,
783
+ "grad_norm": 0.028340350836515427,
784
  "learning_rate": 0.0002,
785
+ "loss": 0.5593,
786
  "step": 555
787
  },
788
  {
789
  "epoch": 0.9991079393398751,
790
+ "grad_norm": 0.028169002383947372,
791
  "learning_rate": 0.0002,
792
+ "loss": 0.5903,
793
  "step": 560
794
  },
795
  {
796
  "epoch": 0.9991079393398751,
797
  "step": 560,
798
  "total_flos": 8.398274691072e+17,
799
+ "train_loss": 0.5889353160347257,
800
+ "train_runtime": 8886.9999,
801
+ "train_samples_per_second": 1.009,
802
+ "train_steps_per_second": 0.063
803
  }
804
  ],
805
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c63a462d950c08173c57be55405c5894d5c8a097b7edc320db8d1fdaa9a37002
3
  size 5688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5416cc212d3f82d6c34f13b8d64adc3e4eba46d885d3a698ff2292fc017985f6
3
  size 5688