agentlans commited on
Commit
611c63a
1 Parent(s): e45cd6c

Upload 11 files

Browse files
all_results.json CHANGED
@@ -1,15 +1,17 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.1280035674571991,
4
- "eval_mse": 0.12800357063188794,
5
- "eval_runtime": 9.6254,
6
  "eval_samples": 10000,
7
- "eval_samples_per_second": 1038.92,
8
- "eval_steps_per_second": 129.865,
 
9
  "total_flos": 4446488701440000.0,
10
- "train_loss": 0.12363309427897136,
11
- "train_runtime": 1375.441,
12
  "train_samples": 90000,
13
- "train_samples_per_second": 196.301,
14
- "train_steps_per_second": 24.538
 
15
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.09235269576311111,
4
+ "eval_mse": 0.09235269895339973,
5
+ "eval_runtime": 9.5082,
6
  "eval_samples": 10000,
7
+ "eval_samples_per_second": 1051.725,
8
+ "eval_steps_per_second": 131.466,
9
+ "num_input_tokens_seen": 34560000,
10
  "total_flos": 4446488701440000.0,
11
+ "train_loss": 0.08211795973601164,
12
+ "train_runtime": 1433.1905,
13
  "train_samples": 90000,
14
+ "train_samples_per_second": 188.391,
15
+ "train_steps_per_second": 23.549,
16
+ "train_tokens_per_second": 24114.032
17
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "deberta-v3-xsmall-quality-pretrain",
3
  "architectures": [
4
  "DebertaV2ForSequenceClassification"
5
  ],
@@ -37,7 +37,7 @@
37
  "relative_attention": true,
38
  "share_att_key": true,
39
  "torch_dtype": "float32",
40
- "transformers_version": "4.44.2",
41
  "type_vocab_size": 0,
42
  "vocab_size": 128100
43
  }
 
1
  {
2
+ "_name_or_path": "/media/user/Expansion/Data/Quality/pretrained-deberta-xs",
3
  "architectures": [
4
  "DebertaV2ForSequenceClassification"
5
  ],
 
37
  "relative_attention": true,
38
  "share_att_key": true,
39
  "torch_dtype": "float32",
40
+ "transformers_version": "4.45.1",
41
  "type_vocab_size": 0,
42
  "vocab_size": 128100
43
  }
eval_results.json CHANGED
@@ -1,9 +1,10 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.1280035674571991,
4
- "eval_mse": 0.12800357063188794,
5
- "eval_runtime": 9.6254,
6
  "eval_samples": 10000,
7
- "eval_samples_per_second": 1038.92,
8
- "eval_steps_per_second": 129.865
 
9
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.09235269576311111,
4
+ "eval_mse": 0.09235269895339973,
5
+ "eval_runtime": 9.5082,
6
  "eval_samples": 10000,
7
+ "eval_samples_per_second": 1051.725,
8
+ "eval_steps_per_second": 131.466,
9
+ "num_input_tokens_seen": 34560000
10
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa511c11ec7e034ea02c0d29d8b93bfab1a3175bd0502ff717634440564f2031
3
  size 283345892
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdb1d2be17ae696c75e5981d9fdcdc61e0794d679d4c1326fed3f6098e74fe9e
3
  size 283345892
train_results.json CHANGED
@@ -1,9 +1,11 @@
1
  {
2
  "epoch": 3.0,
 
3
  "total_flos": 4446488701440000.0,
4
- "train_loss": 0.12363309427897136,
5
- "train_runtime": 1375.441,
6
  "train_samples": 90000,
7
- "train_samples_per_second": 196.301,
8
- "train_steps_per_second": 24.538
 
9
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "num_input_tokens_seen": 34560000,
4
  "total_flos": 4446488701440000.0,
5
+ "train_loss": 0.08211795973601164,
6
+ "train_runtime": 1433.1905,
7
  "train_samples": 90000,
8
+ "train_samples_per_second": 188.391,
9
+ "train_steps_per_second": 23.549,
10
+ "train_tokens_per_second": 24114.032
11
  }
trainer_state.json CHANGED
@@ -10,486 +10,555 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.044444444444444446,
13
- "grad_norm": 3.210815906524658,
14
  "learning_rate": 4.925925925925926e-05,
15
- "loss": 0.4315,
 
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.08888888888888889,
20
- "grad_norm": 8.126620292663574,
21
  "learning_rate": 4.851851851851852e-05,
22
- "loss": 0.2862,
 
23
  "step": 1000
24
  },
25
  {
26
  "epoch": 0.13333333333333333,
27
- "grad_norm": 4.665143013000488,
28
  "learning_rate": 4.7777777777777784e-05,
29
- "loss": 0.237,
 
30
  "step": 1500
31
  },
32
  {
33
  "epoch": 0.17777777777777778,
34
- "grad_norm": 2.0566887855529785,
35
  "learning_rate": 4.703703703703704e-05,
36
- "loss": 0.2251,
 
37
  "step": 2000
38
  },
39
  {
40
  "epoch": 0.2222222222222222,
41
- "grad_norm": 7.968118667602539,
42
  "learning_rate": 4.62962962962963e-05,
43
- "loss": 0.2239,
 
44
  "step": 2500
45
  },
46
  {
47
  "epoch": 0.26666666666666666,
48
- "grad_norm": 1.2121827602386475,
49
  "learning_rate": 4.555555555555556e-05,
50
- "loss": 0.2203,
 
51
  "step": 3000
52
  },
53
  {
54
  "epoch": 0.3111111111111111,
55
- "grad_norm": 3.395099401473999,
56
  "learning_rate": 4.481481481481482e-05,
57
- "loss": 0.2111,
 
58
  "step": 3500
59
  },
60
  {
61
  "epoch": 0.35555555555555557,
62
- "grad_norm": 1.9526039361953735,
63
  "learning_rate": 4.4074074074074076e-05,
64
- "loss": 0.193,
 
65
  "step": 4000
66
  },
67
  {
68
  "epoch": 0.4,
69
- "grad_norm": 3.2926955223083496,
70
  "learning_rate": 4.3333333333333334e-05,
71
- "loss": 0.1853,
 
72
  "step": 4500
73
  },
74
  {
75
  "epoch": 0.4444444444444444,
76
- "grad_norm": 2.8749849796295166,
77
  "learning_rate": 4.259259259259259e-05,
78
- "loss": 0.1675,
 
79
  "step": 5000
80
  },
81
  {
82
  "epoch": 0.4888888888888889,
83
- "grad_norm": 6.5106706619262695,
84
  "learning_rate": 4.185185185185185e-05,
85
- "loss": 0.1889,
 
86
  "step": 5500
87
  },
88
  {
89
  "epoch": 0.5333333333333333,
90
- "grad_norm": 1.893872618675232,
91
  "learning_rate": 4.111111111111111e-05,
92
- "loss": 0.2002,
 
93
  "step": 6000
94
  },
95
  {
96
  "epoch": 0.5777777777777777,
97
- "grad_norm": 4.822785377502441,
98
  "learning_rate": 4.0370370370370374e-05,
99
- "loss": 0.1802,
 
100
  "step": 6500
101
  },
102
  {
103
  "epoch": 0.6222222222222222,
104
- "grad_norm": 2.410318374633789,
105
  "learning_rate": 3.962962962962963e-05,
106
- "loss": 0.1578,
 
107
  "step": 7000
108
  },
109
  {
110
  "epoch": 0.6666666666666666,
111
- "grad_norm": 7.345259189605713,
112
  "learning_rate": 3.888888888888889e-05,
113
- "loss": 0.1668,
 
114
  "step": 7500
115
  },
116
  {
117
  "epoch": 0.7111111111111111,
118
- "grad_norm": 2.6905734539031982,
119
  "learning_rate": 3.814814814814815e-05,
120
- "loss": 0.1726,
 
121
  "step": 8000
122
  },
123
  {
124
  "epoch": 0.7555555555555555,
125
- "grad_norm": 7.399478912353516,
126
  "learning_rate": 3.740740740740741e-05,
127
- "loss": 0.1586,
 
128
  "step": 8500
129
  },
130
  {
131
  "epoch": 0.8,
132
- "grad_norm": 1.8845775127410889,
133
  "learning_rate": 3.6666666666666666e-05,
134
- "loss": 0.1629,
 
135
  "step": 9000
136
  },
137
  {
138
  "epoch": 0.8444444444444444,
139
- "grad_norm": 3.4357192516326904,
140
  "learning_rate": 3.592592592592593e-05,
141
- "loss": 0.1674,
 
142
  "step": 9500
143
  },
144
  {
145
  "epoch": 0.8888888888888888,
146
- "grad_norm": 1.8075140714645386,
147
  "learning_rate": 3.518518518518519e-05,
148
- "loss": 0.1671,
 
149
  "step": 10000
150
  },
151
  {
152
  "epoch": 0.9333333333333333,
153
- "grad_norm": 2.545607089996338,
154
  "learning_rate": 3.444444444444445e-05,
155
- "loss": 0.1595,
 
156
  "step": 10500
157
  },
158
  {
159
  "epoch": 0.9777777777777777,
160
- "grad_norm": 2.33050799369812,
161
  "learning_rate": 3.3703703703703706e-05,
162
- "loss": 0.1655,
 
163
  "step": 11000
164
  },
165
  {
166
  "epoch": 1.0222222222222221,
167
- "grad_norm": 1.9340065717697144,
168
  "learning_rate": 3.2962962962962964e-05,
169
- "loss": 0.1349,
 
170
  "step": 11500
171
  },
172
  {
173
  "epoch": 1.0666666666666667,
174
- "grad_norm": 1.3230476379394531,
175
  "learning_rate": 3.222222222222223e-05,
176
- "loss": 0.1069,
 
177
  "step": 12000
178
  },
179
  {
180
  "epoch": 1.1111111111111112,
181
- "grad_norm": 4.054622650146484,
182
  "learning_rate": 3.148148148148148e-05,
183
- "loss": 0.1098,
 
184
  "step": 12500
185
  },
186
  {
187
  "epoch": 1.1555555555555554,
188
- "grad_norm": 4.885370254516602,
189
  "learning_rate": 3.074074074074074e-05,
190
- "loss": 0.1162,
 
191
  "step": 13000
192
  },
193
  {
194
  "epoch": 1.2,
195
- "grad_norm": 3.5849854946136475,
196
  "learning_rate": 3e-05,
197
- "loss": 0.1077,
 
198
  "step": 13500
199
  },
200
  {
201
  "epoch": 1.2444444444444445,
202
- "grad_norm": 9.765403747558594,
203
  "learning_rate": 2.925925925925926e-05,
204
- "loss": 0.1181,
 
205
  "step": 14000
206
  },
207
  {
208
  "epoch": 1.2888888888888888,
209
- "grad_norm": 3.879992961883545,
210
  "learning_rate": 2.851851851851852e-05,
211
- "loss": 0.1011,
 
212
  "step": 14500
213
  },
214
  {
215
  "epoch": 1.3333333333333333,
216
- "grad_norm": 2.2347288131713867,
217
  "learning_rate": 2.777777777777778e-05,
218
- "loss": 0.1175,
 
219
  "step": 15000
220
  },
221
  {
222
  "epoch": 1.3777777777777778,
223
- "grad_norm": 7.474192142486572,
224
  "learning_rate": 2.7037037037037037e-05,
225
- "loss": 0.1073,
 
226
  "step": 15500
227
  },
228
  {
229
  "epoch": 1.4222222222222223,
230
- "grad_norm": 4.1693291664123535,
231
  "learning_rate": 2.6296296296296296e-05,
232
- "loss": 0.0995,
 
233
  "step": 16000
234
  },
235
  {
236
  "epoch": 1.4666666666666668,
237
- "grad_norm": 1.8383170366287231,
238
  "learning_rate": 2.5555555555555554e-05,
239
- "loss": 0.1145,
 
240
  "step": 16500
241
  },
242
  {
243
  "epoch": 1.511111111111111,
244
- "grad_norm": 0.694262683391571,
245
  "learning_rate": 2.4814814814814816e-05,
246
- "loss": 0.115,
 
247
  "step": 17000
248
  },
249
  {
250
  "epoch": 1.5555555555555556,
251
- "grad_norm": 0.9633584022521973,
252
  "learning_rate": 2.4074074074074074e-05,
253
- "loss": 0.1024,
 
254
  "step": 17500
255
  },
256
  {
257
  "epoch": 1.6,
258
- "grad_norm": 2.0199620723724365,
259
  "learning_rate": 2.3333333333333336e-05,
260
- "loss": 0.1011,
 
261
  "step": 18000
262
  },
263
  {
264
  "epoch": 1.6444444444444444,
265
- "grad_norm": 1.8132920265197754,
266
  "learning_rate": 2.2592592592592594e-05,
267
- "loss": 0.1041,
 
268
  "step": 18500
269
  },
270
  {
271
  "epoch": 1.6888888888888889,
272
- "grad_norm": 8.3416748046875,
273
  "learning_rate": 2.1851851851851852e-05,
274
- "loss": 0.1144,
 
275
  "step": 19000
276
  },
277
  {
278
  "epoch": 1.7333333333333334,
279
- "grad_norm": 1.4581434726715088,
280
  "learning_rate": 2.111111111111111e-05,
281
- "loss": 0.0955,
 
282
  "step": 19500
283
  },
284
  {
285
  "epoch": 1.7777777777777777,
286
- "grad_norm": 3.9014599323272705,
287
  "learning_rate": 2.037037037037037e-05,
288
- "loss": 0.104,
 
289
  "step": 20000
290
  },
291
  {
292
  "epoch": 1.8222222222222222,
293
- "grad_norm": 2.070230007171631,
294
  "learning_rate": 1.962962962962963e-05,
295
- "loss": 0.0995,
 
296
  "step": 20500
297
  },
298
  {
299
  "epoch": 1.8666666666666667,
300
- "grad_norm": 0.8812291026115417,
301
  "learning_rate": 1.888888888888889e-05,
302
- "loss": 0.0982,
 
303
  "step": 21000
304
  },
305
  {
306
  "epoch": 1.911111111111111,
307
- "grad_norm": 1.1036711931228638,
308
  "learning_rate": 1.814814814814815e-05,
309
- "loss": 0.0987,
 
310
  "step": 21500
311
  },
312
  {
313
  "epoch": 1.9555555555555557,
314
- "grad_norm": 0.6822977066040039,
315
  "learning_rate": 1.740740740740741e-05,
316
- "loss": 0.101,
 
317
  "step": 22000
318
  },
319
  {
320
  "epoch": 2.0,
321
- "grad_norm": 4.314443111419678,
322
  "learning_rate": 1.6666666666666667e-05,
323
- "loss": 0.0979,
 
324
  "step": 22500
325
  },
326
  {
327
  "epoch": 2.0444444444444443,
328
- "grad_norm": 3.5247573852539062,
329
  "learning_rate": 1.5925925925925926e-05,
330
- "loss": 0.0733,
 
331
  "step": 23000
332
  },
333
  {
334
  "epoch": 2.088888888888889,
335
- "grad_norm": 0.6801110506057739,
336
  "learning_rate": 1.5185185185185186e-05,
337
- "loss": 0.0651,
 
338
  "step": 23500
339
  },
340
  {
341
  "epoch": 2.1333333333333333,
342
- "grad_norm": 4.516416072845459,
343
  "learning_rate": 1.4444444444444444e-05,
344
- "loss": 0.0765,
 
345
  "step": 24000
346
  },
347
  {
348
  "epoch": 2.1777777777777776,
349
- "grad_norm": 1.9308606386184692,
350
  "learning_rate": 1.3703703703703704e-05,
351
- "loss": 0.0707,
 
352
  "step": 24500
353
  },
354
  {
355
  "epoch": 2.2222222222222223,
356
- "grad_norm": 1.1442885398864746,
357
  "learning_rate": 1.2962962962962962e-05,
358
- "loss": 0.0617,
 
359
  "step": 25000
360
  },
361
  {
362
  "epoch": 2.2666666666666666,
363
- "grad_norm": 5.30832576751709,
364
  "learning_rate": 1.2222222222222222e-05,
365
- "loss": 0.0693,
 
366
  "step": 25500
367
  },
368
  {
369
  "epoch": 2.311111111111111,
370
- "grad_norm": 0.5708422660827637,
371
  "learning_rate": 1.1481481481481482e-05,
372
- "loss": 0.0697,
 
373
  "step": 26000
374
  },
375
  {
376
  "epoch": 2.3555555555555556,
377
- "grad_norm": 0.680268406867981,
378
  "learning_rate": 1.074074074074074e-05,
379
- "loss": 0.066,
 
380
  "step": 26500
381
  },
382
  {
383
  "epoch": 2.4,
384
- "grad_norm": 0.72934889793396,
385
  "learning_rate": 1e-05,
386
- "loss": 0.0644,
 
387
  "step": 27000
388
  },
389
  {
390
  "epoch": 2.4444444444444446,
391
- "grad_norm": 4.663362503051758,
392
  "learning_rate": 9.259259259259259e-06,
393
- "loss": 0.0667,
 
394
  "step": 27500
395
  },
396
  {
397
  "epoch": 2.488888888888889,
398
- "grad_norm": 1.3758256435394287,
399
  "learning_rate": 8.518518518518519e-06,
400
- "loss": 0.0575,
 
401
  "step": 28000
402
  },
403
  {
404
  "epoch": 2.533333333333333,
405
- "grad_norm": 0.8022651672363281,
406
  "learning_rate": 7.777777777777777e-06,
407
- "loss": 0.0621,
 
408
  "step": 28500
409
  },
410
  {
411
  "epoch": 2.5777777777777775,
412
- "grad_norm": 1.576392412185669,
413
  "learning_rate": 7.0370370370370375e-06,
414
- "loss": 0.0586,
 
415
  "step": 29000
416
  },
417
  {
418
  "epoch": 2.6222222222222222,
419
- "grad_norm": 1.166567087173462,
420
  "learning_rate": 6.296296296296296e-06,
421
- "loss": 0.0662,
 
422
  "step": 29500
423
  },
424
  {
425
  "epoch": 2.6666666666666665,
426
- "grad_norm": 2.4423012733459473,
427
  "learning_rate": 5.555555555555556e-06,
428
- "loss": 0.0645,
 
429
  "step": 30000
430
  },
431
  {
432
  "epoch": 2.7111111111111112,
433
- "grad_norm": 3.1122899055480957,
434
  "learning_rate": 4.814814814814815e-06,
435
- "loss": 0.0642,
 
436
  "step": 30500
437
  },
438
  {
439
  "epoch": 2.7555555555555555,
440
- "grad_norm": 3.2840280532836914,
441
  "learning_rate": 4.074074074074075e-06,
442
- "loss": 0.0617,
 
443
  "step": 31000
444
  },
445
  {
446
  "epoch": 2.8,
447
- "grad_norm": 0.6285051703453064,
448
  "learning_rate": 3.3333333333333333e-06,
449
- "loss": 0.0566,
 
450
  "step": 31500
451
  },
452
  {
453
  "epoch": 2.8444444444444446,
454
- "grad_norm": 3.462080240249634,
455
  "learning_rate": 2.5925925925925925e-06,
456
- "loss": 0.0614,
 
457
  "step": 32000
458
  },
459
  {
460
  "epoch": 2.888888888888889,
461
- "grad_norm": 2.2216219902038574,
462
  "learning_rate": 1.8518518518518519e-06,
463
- "loss": 0.0647,
 
464
  "step": 32500
465
  },
466
  {
467
  "epoch": 2.9333333333333336,
468
- "grad_norm": 1.6926839351654053,
469
  "learning_rate": 1.1111111111111112e-06,
470
- "loss": 0.0611,
 
471
  "step": 33000
472
  },
473
  {
474
  "epoch": 2.977777777777778,
475
- "grad_norm": 4.723779678344727,
476
  "learning_rate": 3.703703703703704e-07,
477
- "loss": 0.0597,
 
478
  "step": 33500
479
  },
480
  {
481
  "epoch": 3.0,
 
482
  "step": 33750,
483
  "total_flos": 4446488701440000.0,
484
- "train_loss": 0.12363309427897136,
485
- "train_runtime": 1375.441,
486
- "train_samples_per_second": 196.301,
487
- "train_steps_per_second": 24.538
 
488
  }
489
  ],
490
  "logging_steps": 500,
491
  "max_steps": 33750,
492
- "num_input_tokens_seen": 0,
493
  "num_train_epochs": 3,
494
  "save_steps": 500,
495
  "stateful_callbacks": {
 
10
  "log_history": [
11
  {
12
  "epoch": 0.044444444444444446,
13
+ "grad_norm": 4.640678405761719,
14
  "learning_rate": 4.925925925925926e-05,
15
+ "loss": 0.2464,
16
+ "num_input_tokens_seen": 512000,
17
  "step": 500
18
  },
19
  {
20
  "epoch": 0.08888888888888889,
21
+ "grad_norm": 2.8545427322387695,
22
  "learning_rate": 4.851851851851852e-05,
23
+ "loss": 0.1755,
24
+ "num_input_tokens_seen": 1024000,
25
  "step": 1000
26
  },
27
  {
28
  "epoch": 0.13333333333333333,
29
+ "grad_norm": 1.1847012042999268,
30
  "learning_rate": 4.7777777777777784e-05,
31
+ "loss": 0.1628,
32
+ "num_input_tokens_seen": 1536000,
33
  "step": 1500
34
  },
35
  {
36
  "epoch": 0.17777777777777778,
37
+ "grad_norm": 3.767167568206787,
38
  "learning_rate": 4.703703703703704e-05,
39
+ "loss": 0.1475,
40
+ "num_input_tokens_seen": 2048000,
41
  "step": 2000
42
  },
43
  {
44
  "epoch": 0.2222222222222222,
45
+ "grad_norm": 3.5827574729919434,
46
  "learning_rate": 4.62962962962963e-05,
47
+ "loss": 0.1448,
48
+ "num_input_tokens_seen": 2560000,
49
  "step": 2500
50
  },
51
  {
52
  "epoch": 0.26666666666666666,
53
+ "grad_norm": 2.042477607727051,
54
  "learning_rate": 4.555555555555556e-05,
55
+ "loss": 0.1412,
56
+ "num_input_tokens_seen": 3072000,
57
  "step": 3000
58
  },
59
  {
60
  "epoch": 0.3111111111111111,
61
+ "grad_norm": 1.130966305732727,
62
  "learning_rate": 4.481481481481482e-05,
63
+ "loss": 0.1296,
64
+ "num_input_tokens_seen": 3584000,
65
  "step": 3500
66
  },
67
  {
68
  "epoch": 0.35555555555555557,
69
+ "grad_norm": 3.1181435585021973,
70
  "learning_rate": 4.4074074074074076e-05,
71
+ "loss": 0.131,
72
+ "num_input_tokens_seen": 4096000,
73
  "step": 4000
74
  },
75
  {
76
  "epoch": 0.4,
77
+ "grad_norm": 2.284423351287842,
78
  "learning_rate": 4.3333333333333334e-05,
79
+ "loss": 0.135,
80
+ "num_input_tokens_seen": 4608000,
81
  "step": 4500
82
  },
83
  {
84
  "epoch": 0.4444444444444444,
85
+ "grad_norm": 2.6171817779541016,
86
  "learning_rate": 4.259259259259259e-05,
87
+ "loss": 0.1251,
88
+ "num_input_tokens_seen": 5120000,
89
  "step": 5000
90
  },
91
  {
92
  "epoch": 0.4888888888888889,
93
+ "grad_norm": 2.712770462036133,
94
  "learning_rate": 4.185185185185185e-05,
95
+ "loss": 0.1214,
96
+ "num_input_tokens_seen": 5632000,
97
  "step": 5500
98
  },
99
  {
100
  "epoch": 0.5333333333333333,
101
+ "grad_norm": 1.8071026802062988,
102
  "learning_rate": 4.111111111111111e-05,
103
+ "loss": 0.1178,
104
+ "num_input_tokens_seen": 6144000,
105
  "step": 6000
106
  },
107
  {
108
  "epoch": 0.5777777777777777,
109
+ "grad_norm": 2.124100685119629,
110
  "learning_rate": 4.0370370370370374e-05,
111
+ "loss": 0.1187,
112
+ "num_input_tokens_seen": 6656000,
113
  "step": 6500
114
  },
115
  {
116
  "epoch": 0.6222222222222222,
117
+ "grad_norm": 2.7214131355285645,
118
  "learning_rate": 3.962962962962963e-05,
119
+ "loss": 0.118,
120
+ "num_input_tokens_seen": 7168000,
121
  "step": 7000
122
  },
123
  {
124
  "epoch": 0.6666666666666666,
125
+ "grad_norm": 2.097531318664551,
126
  "learning_rate": 3.888888888888889e-05,
127
+ "loss": 0.1156,
128
+ "num_input_tokens_seen": 7680000,
129
  "step": 7500
130
  },
131
  {
132
  "epoch": 0.7111111111111111,
133
+ "grad_norm": 2.9476003646850586,
134
  "learning_rate": 3.814814814814815e-05,
135
+ "loss": 0.1168,
136
+ "num_input_tokens_seen": 8192000,
137
  "step": 8000
138
  },
139
  {
140
  "epoch": 0.7555555555555555,
141
+ "grad_norm": 2.068228006362915,
142
  "learning_rate": 3.740740740740741e-05,
143
+ "loss": 0.1122,
144
+ "num_input_tokens_seen": 8704000,
145
  "step": 8500
146
  },
147
  {
148
  "epoch": 0.8,
149
+ "grad_norm": 1.8230115175247192,
150
  "learning_rate": 3.6666666666666666e-05,
151
+ "loss": 0.1129,
152
+ "num_input_tokens_seen": 9216000,
153
  "step": 9000
154
  },
155
  {
156
  "epoch": 0.8444444444444444,
157
+ "grad_norm": 2.0530920028686523,
158
  "learning_rate": 3.592592592592593e-05,
159
+ "loss": 0.1074,
160
+ "num_input_tokens_seen": 9728000,
161
  "step": 9500
162
  },
163
  {
164
  "epoch": 0.8888888888888888,
165
+ "grad_norm": 1.2028056383132935,
166
  "learning_rate": 3.518518518518519e-05,
167
+ "loss": 0.1022,
168
+ "num_input_tokens_seen": 10240000,
169
  "step": 10000
170
  },
171
  {
172
  "epoch": 0.9333333333333333,
173
+ "grad_norm": 1.908441185951233,
174
  "learning_rate": 3.444444444444445e-05,
175
+ "loss": 0.102,
176
+ "num_input_tokens_seen": 10752000,
177
  "step": 10500
178
  },
179
  {
180
  "epoch": 0.9777777777777777,
181
+ "grad_norm": 1.4811742305755615,
182
  "learning_rate": 3.3703703703703706e-05,
183
+ "loss": 0.0967,
184
+ "num_input_tokens_seen": 11264000,
185
  "step": 11000
186
  },
187
  {
188
  "epoch": 1.0222222222222221,
189
+ "grad_norm": 2.421898126602173,
190
  "learning_rate": 3.2962962962962964e-05,
191
+ "loss": 0.0908,
192
+ "num_input_tokens_seen": 11776000,
193
  "step": 11500
194
  },
195
  {
196
  "epoch": 1.0666666666666667,
197
+ "grad_norm": 1.024445652961731,
198
  "learning_rate": 3.222222222222223e-05,
199
+ "loss": 0.0729,
200
+ "num_input_tokens_seen": 12288000,
201
  "step": 12000
202
  },
203
  {
204
  "epoch": 1.1111111111111112,
205
+ "grad_norm": 2.516057014465332,
206
  "learning_rate": 3.148148148148148e-05,
207
+ "loss": 0.0739,
208
+ "num_input_tokens_seen": 12800000,
209
  "step": 12500
210
  },
211
  {
212
  "epoch": 1.1555555555555554,
213
+ "grad_norm": 3.101442813873291,
214
  "learning_rate": 3.074074074074074e-05,
215
+ "loss": 0.0728,
216
+ "num_input_tokens_seen": 13312000,
217
  "step": 13000
218
  },
219
  {
220
  "epoch": 1.2,
221
+ "grad_norm": 1.0181483030319214,
222
  "learning_rate": 3e-05,
223
+ "loss": 0.0713,
224
+ "num_input_tokens_seen": 13824000,
225
  "step": 13500
226
  },
227
  {
228
  "epoch": 1.2444444444444445,
229
+ "grad_norm": 2.7126169204711914,
230
  "learning_rate": 2.925925925925926e-05,
231
+ "loss": 0.0739,
232
+ "num_input_tokens_seen": 14336000,
233
  "step": 14000
234
  },
235
  {
236
  "epoch": 1.2888888888888888,
237
+ "grad_norm": 2.1057209968566895,
238
  "learning_rate": 2.851851851851852e-05,
239
+ "loss": 0.0718,
240
+ "num_input_tokens_seen": 14848000,
241
  "step": 14500
242
  },
243
  {
244
  "epoch": 1.3333333333333333,
245
+ "grad_norm": 2.226621150970459,
246
  "learning_rate": 2.777777777777778e-05,
247
+ "loss": 0.081,
248
+ "num_input_tokens_seen": 15360000,
249
  "step": 15000
250
  },
251
  {
252
  "epoch": 1.3777777777777778,
253
+ "grad_norm": 1.1672347784042358,
254
  "learning_rate": 2.7037037037037037e-05,
255
+ "loss": 0.0744,
256
+ "num_input_tokens_seen": 15872000,
257
  "step": 15500
258
  },
259
  {
260
  "epoch": 1.4222222222222223,
261
+ "grad_norm": 1.556462287902832,
262
  "learning_rate": 2.6296296296296296e-05,
263
+ "loss": 0.0727,
264
+ "num_input_tokens_seen": 16384000,
265
  "step": 16000
266
  },
267
  {
268
  "epoch": 1.4666666666666668,
269
+ "grad_norm": 1.4660066366195679,
270
  "learning_rate": 2.5555555555555554e-05,
271
+ "loss": 0.0688,
272
+ "num_input_tokens_seen": 16896000,
273
  "step": 16500
274
  },
275
  {
276
  "epoch": 1.511111111111111,
277
+ "grad_norm": 1.3551133871078491,
278
  "learning_rate": 2.4814814814814816e-05,
279
+ "loss": 0.0664,
280
+ "num_input_tokens_seen": 17408000,
281
  "step": 17000
282
  },
283
  {
284
  "epoch": 1.5555555555555556,
285
+ "grad_norm": 1.1325716972351074,
286
  "learning_rate": 2.4074074074074074e-05,
287
+ "loss": 0.0712,
288
+ "num_input_tokens_seen": 17920000,
289
  "step": 17500
290
  },
291
  {
292
  "epoch": 1.6,
293
+ "grad_norm": 1.0320031642913818,
294
  "learning_rate": 2.3333333333333336e-05,
295
+ "loss": 0.0696,
296
+ "num_input_tokens_seen": 18432000,
297
  "step": 18000
298
  },
299
  {
300
  "epoch": 1.6444444444444444,
301
+ "grad_norm": 2.1350128650665283,
302
  "learning_rate": 2.2592592592592594e-05,
303
+ "loss": 0.0712,
304
+ "num_input_tokens_seen": 18944000,
305
  "step": 18500
306
  },
307
  {
308
  "epoch": 1.6888888888888889,
309
+ "grad_norm": 1.1131097078323364,
310
  "learning_rate": 2.1851851851851852e-05,
311
+ "loss": 0.0728,
312
+ "num_input_tokens_seen": 19456000,
313
  "step": 19000
314
  },
315
  {
316
  "epoch": 1.7333333333333334,
317
+ "grad_norm": 1.8592647314071655,
318
  "learning_rate": 2.111111111111111e-05,
319
+ "loss": 0.07,
320
+ "num_input_tokens_seen": 19968000,
321
  "step": 19500
322
  },
323
  {
324
  "epoch": 1.7777777777777777,
325
+ "grad_norm": 1.7992678880691528,
326
  "learning_rate": 2.037037037037037e-05,
327
+ "loss": 0.0682,
328
+ "num_input_tokens_seen": 20480000,
329
  "step": 20000
330
  },
331
  {
332
  "epoch": 1.8222222222222222,
333
+ "grad_norm": 1.4641705751419067,
334
  "learning_rate": 1.962962962962963e-05,
335
+ "loss": 0.0627,
336
+ "num_input_tokens_seen": 20992000,
337
  "step": 20500
338
  },
339
  {
340
  "epoch": 1.8666666666666667,
341
+ "grad_norm": 1.758195161819458,
342
  "learning_rate": 1.888888888888889e-05,
343
+ "loss": 0.0686,
344
+ "num_input_tokens_seen": 21504000,
345
  "step": 21000
346
  },
347
  {
348
  "epoch": 1.911111111111111,
349
+ "grad_norm": 1.9842095375061035,
350
  "learning_rate": 1.814814814814815e-05,
351
+ "loss": 0.068,
352
+ "num_input_tokens_seen": 22016000,
353
  "step": 21500
354
  },
355
  {
356
  "epoch": 1.9555555555555557,
357
+ "grad_norm": 1.4546846151351929,
358
  "learning_rate": 1.740740740740741e-05,
359
+ "loss": 0.0646,
360
+ "num_input_tokens_seen": 22528000,
361
  "step": 22000
362
  },
363
  {
364
  "epoch": 2.0,
365
+ "grad_norm": 3.412598133087158,
366
  "learning_rate": 1.6666666666666667e-05,
367
+ "loss": 0.068,
368
+ "num_input_tokens_seen": 23040000,
369
  "step": 22500
370
  },
371
  {
372
  "epoch": 2.0444444444444443,
373
+ "grad_norm": 2.1435558795928955,
374
  "learning_rate": 1.5925925925925926e-05,
375
+ "loss": 0.0492,
376
+ "num_input_tokens_seen": 23552000,
377
  "step": 23000
378
  },
379
  {
380
  "epoch": 2.088888888888889,
381
+ "grad_norm": 1.800618290901184,
382
  "learning_rate": 1.5185185185185186e-05,
383
+ "loss": 0.0488,
384
+ "num_input_tokens_seen": 24064000,
385
  "step": 23500
386
  },
387
  {
388
  "epoch": 2.1333333333333333,
389
+ "grad_norm": 1.1772105693817139,
390
  "learning_rate": 1.4444444444444444e-05,
391
+ "loss": 0.0465,
392
+ "num_input_tokens_seen": 24576000,
393
  "step": 24000
394
  },
395
  {
396
  "epoch": 2.1777777777777776,
397
+ "grad_norm": 0.9198475480079651,
398
  "learning_rate": 1.3703703703703704e-05,
399
+ "loss": 0.0458,
400
+ "num_input_tokens_seen": 25088000,
401
  "step": 24500
402
  },
403
  {
404
  "epoch": 2.2222222222222223,
405
+ "grad_norm": 1.3264068365097046,
406
  "learning_rate": 1.2962962962962962e-05,
407
+ "loss": 0.0476,
408
+ "num_input_tokens_seen": 25600000,
409
  "step": 25000
410
  },
411
  {
412
  "epoch": 2.2666666666666666,
413
+ "grad_norm": 2.010927438735962,
414
  "learning_rate": 1.2222222222222222e-05,
415
+ "loss": 0.0485,
416
+ "num_input_tokens_seen": 26112000,
417
  "step": 25500
418
  },
419
  {
420
  "epoch": 2.311111111111111,
421
+ "grad_norm": 1.620568871498108,
422
  "learning_rate": 1.1481481481481482e-05,
423
+ "loss": 0.0443,
424
+ "num_input_tokens_seen": 26624000,
425
  "step": 26000
426
  },
427
  {
428
  "epoch": 2.3555555555555556,
429
+ "grad_norm": 1.1650017499923706,
430
  "learning_rate": 1.074074074074074e-05,
431
+ "loss": 0.0456,
432
+ "num_input_tokens_seen": 27136000,
433
  "step": 26500
434
  },
435
  {
436
  "epoch": 2.4,
437
+ "grad_norm": 1.404801607131958,
438
  "learning_rate": 1e-05,
439
+ "loss": 0.0453,
440
+ "num_input_tokens_seen": 27648000,
441
  "step": 27000
442
  },
443
  {
444
  "epoch": 2.4444444444444446,
445
+ "grad_norm": 1.6808654069900513,
446
  "learning_rate": 9.259259259259259e-06,
447
+ "loss": 0.0447,
448
+ "num_input_tokens_seen": 28160000,
449
  "step": 27500
450
  },
451
  {
452
  "epoch": 2.488888888888889,
453
+ "grad_norm": 1.0057835578918457,
454
  "learning_rate": 8.518518518518519e-06,
455
+ "loss": 0.0494,
456
+ "num_input_tokens_seen": 28672000,
457
  "step": 28000
458
  },
459
  {
460
  "epoch": 2.533333333333333,
461
+ "grad_norm": 1.2674041986465454,
462
  "learning_rate": 7.777777777777777e-06,
463
+ "loss": 0.044,
464
+ "num_input_tokens_seen": 29184000,
465
  "step": 28500
466
  },
467
  {
468
  "epoch": 2.5777777777777775,
469
+ "grad_norm": 1.236249327659607,
470
  "learning_rate": 7.0370370370370375e-06,
471
+ "loss": 0.0478,
472
+ "num_input_tokens_seen": 29696000,
473
  "step": 29000
474
  },
475
  {
476
  "epoch": 2.6222222222222222,
477
+ "grad_norm": 1.3907105922698975,
478
  "learning_rate": 6.296296296296296e-06,
479
+ "loss": 0.0428,
480
+ "num_input_tokens_seen": 30208000,
481
  "step": 29500
482
  },
483
  {
484
  "epoch": 2.6666666666666665,
485
+ "grad_norm": 1.8549304008483887,
486
  "learning_rate": 5.555555555555556e-06,
487
+ "loss": 0.0451,
488
+ "num_input_tokens_seen": 30720000,
489
  "step": 30000
490
  },
491
  {
492
  "epoch": 2.7111111111111112,
493
+ "grad_norm": 1.3427213430404663,
494
  "learning_rate": 4.814814814814815e-06,
495
+ "loss": 0.0438,
496
+ "num_input_tokens_seen": 31232000,
497
  "step": 30500
498
  },
499
  {
500
  "epoch": 2.7555555555555555,
501
+ "grad_norm": 1.9282052516937256,
502
  "learning_rate": 4.074074074074075e-06,
503
+ "loss": 0.046,
504
+ "num_input_tokens_seen": 31744000,
505
  "step": 31000
506
  },
507
  {
508
  "epoch": 2.8,
509
+ "grad_norm": 1.3202067613601685,
510
  "learning_rate": 3.3333333333333333e-06,
511
+ "loss": 0.0394,
512
+ "num_input_tokens_seen": 32256000,
513
  "step": 31500
514
  },
515
  {
516
  "epoch": 2.8444444444444446,
517
+ "grad_norm": 0.9947272539138794,
518
  "learning_rate": 2.5925925925925925e-06,
519
+ "loss": 0.0428,
520
+ "num_input_tokens_seen": 32768000,
521
  "step": 32000
522
  },
523
  {
524
  "epoch": 2.888888888888889,
525
+ "grad_norm": 1.1175397634506226,
526
  "learning_rate": 1.8518518518518519e-06,
527
+ "loss": 0.0411,
528
+ "num_input_tokens_seen": 33280000,
529
  "step": 32500
530
  },
531
  {
532
  "epoch": 2.9333333333333336,
533
+ "grad_norm": 1.4308379888534546,
534
  "learning_rate": 1.1111111111111112e-06,
535
+ "loss": 0.0422,
536
+ "num_input_tokens_seen": 33792000,
537
  "step": 33000
538
  },
539
  {
540
  "epoch": 2.977777777777778,
541
+ "grad_norm": 1.3511176109313965,
542
  "learning_rate": 3.703703703703704e-07,
543
+ "loss": 0.0428,
544
+ "num_input_tokens_seen": 34304000,
545
  "step": 33500
546
  },
547
  {
548
  "epoch": 3.0,
549
+ "num_input_tokens_seen": 34560000,
550
  "step": 33750,
551
  "total_flos": 4446488701440000.0,
552
+ "train_loss": 0.08211795973601164,
553
+ "train_runtime": 1433.1905,
554
+ "train_samples_per_second": 188.391,
555
+ "train_steps_per_second": 23.549,
556
+ "train_tokens_per_second": 24114.032
557
  }
558
  ],
559
  "logging_steps": 500,
560
  "max_steps": 33750,
561
+ "num_input_tokens_seen": 34560000,
562
  "num_train_epochs": 3,
563
  "save_steps": 500,
564
  "stateful_callbacks": {