pszemraj committed on
Commit
958e619
1 Parent(s): f08d76d

End of training

Browse files
Files changed (5) hide show
  1. README.md +5 -3
  2. all_results.json +15 -0
  3. eval_results.json +10 -0
  4. train_results.json +8 -0
  5. trainer_state.json +786 -0
README.md CHANGED
@@ -1,4 +1,6 @@
1
  ---
 
 
2
  tags:
3
  - generated_from_trainer
4
  metrics:
@@ -13,10 +15,10 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # bert-plus-embedderForMLM-goodwiki-deduped-split_4096-usecache
15
 
16
- This model was trained from scratch on the None dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 1.9893
19
- - Accuracy: 0.6152
20
 
21
  ## Model description
22
 
 
1
  ---
2
+ language:
3
+ - en
4
  tags:
5
  - generated_from_trainer
6
  metrics:
 
15
 
16
  # bert-plus-embedderForMLM-goodwiki-deduped-split_4096-usecache
17
 
18
+ This model was trained from scratch on the BEE-spoke-data/goodwiki-deduped-split dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 1.9835
21
+ - Accuracy: 0.6159
22
 
23
  ## Model description
24
 
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_accuracy": 0.6158778269993005,
4
+ "eval_loss": 1.9834641218185425,
5
+ "eval_runtime": 40.5481,
6
+ "eval_samples": 300,
7
+ "eval_samples_per_second": 7.399,
8
+ "eval_steps_per_second": 1.85,
9
+ "perplexity": 7.267876236350372,
10
+ "train_loss": 2.32403786500295,
11
+ "train_runtime": 14945.8713,
12
+ "train_samples": 38441,
13
+ "train_samples_per_second": 2.572,
14
+ "train_steps_per_second": 0.04
15
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_accuracy": 0.6158778269993005,
4
+ "eval_loss": 1.9834641218185425,
5
+ "eval_runtime": 40.5481,
6
+ "eval_samples": 300,
7
+ "eval_samples_per_second": 7.399,
8
+ "eval_steps_per_second": 1.85,
9
+ "perplexity": 7.267876236350372
10
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "train_loss": 2.32403786500295,
4
+ "train_runtime": 14945.8713,
5
+ "train_samples": 38441,
6
+ "train_samples_per_second": 2.572,
7
+ "train_steps_per_second": 0.04
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,786 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9988554780980127,
5
+ "eval_steps": 150,
6
+ "global_step": 600,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "learning_rate": 5e-06,
14
+ "loss": 8.3066,
15
+ "step": 5
16
+ },
17
+ {
18
+ "epoch": 0.02,
19
+ "learning_rate": 1e-05,
20
+ "loss": 7.9763,
21
+ "step": 10
22
+ },
23
+ {
24
+ "epoch": 0.02,
25
+ "learning_rate": 1.5e-05,
26
+ "loss": 7.2999,
27
+ "step": 15
28
+ },
29
+ {
30
+ "epoch": 0.03,
31
+ "learning_rate": 2e-05,
32
+ "loss": 6.3145,
33
+ "step": 20
34
+ },
35
+ {
36
+ "epoch": 0.04,
37
+ "learning_rate": 2.5e-05,
38
+ "loss": 4.8549,
39
+ "step": 25
40
+ },
41
+ {
42
+ "epoch": 0.05,
43
+ "learning_rate": 3e-05,
44
+ "loss": 3.7485,
45
+ "step": 30
46
+ },
47
+ {
48
+ "epoch": 0.06,
49
+ "learning_rate": 3.5e-05,
50
+ "loss": 3.0173,
51
+ "step": 35
52
+ },
53
+ {
54
+ "epoch": 0.07,
55
+ "learning_rate": 4e-05,
56
+ "loss": 2.6364,
57
+ "step": 40
58
+ },
59
+ {
60
+ "epoch": 0.07,
61
+ "learning_rate": 4.5e-05,
62
+ "loss": 2.4726,
63
+ "step": 45
64
+ },
65
+ {
66
+ "epoch": 0.08,
67
+ "learning_rate": 5e-05,
68
+ "loss": 2.3895,
69
+ "step": 50
70
+ },
71
+ {
72
+ "epoch": 0.09,
73
+ "learning_rate": 5.500000000000001e-05,
74
+ "loss": 2.2992,
75
+ "step": 55
76
+ },
77
+ {
78
+ "epoch": 0.1,
79
+ "learning_rate": 6e-05,
80
+ "loss": 2.291,
81
+ "step": 60
82
+ },
83
+ {
84
+ "epoch": 0.11,
85
+ "learning_rate": 6.500000000000001e-05,
86
+ "loss": 2.2597,
87
+ "step": 65
88
+ },
89
+ {
90
+ "epoch": 0.12,
91
+ "learning_rate": 7e-05,
92
+ "loss": 2.2245,
93
+ "step": 70
94
+ },
95
+ {
96
+ "epoch": 0.12,
97
+ "learning_rate": 7.500000000000001e-05,
98
+ "loss": 2.2076,
99
+ "step": 75
100
+ },
101
+ {
102
+ "epoch": 0.13,
103
+ "learning_rate": 8e-05,
104
+ "loss": 2.2132,
105
+ "step": 80
106
+ },
107
+ {
108
+ "epoch": 0.14,
109
+ "learning_rate": 8.5e-05,
110
+ "loss": 2.1925,
111
+ "step": 85
112
+ },
113
+ {
114
+ "epoch": 0.15,
115
+ "learning_rate": 9e-05,
116
+ "loss": 2.1868,
117
+ "step": 90
118
+ },
119
+ {
120
+ "epoch": 0.16,
121
+ "learning_rate": 9.5e-05,
122
+ "loss": 2.1916,
123
+ "step": 95
124
+ },
125
+ {
126
+ "epoch": 0.17,
127
+ "learning_rate": 0.0001,
128
+ "loss": 2.1774,
129
+ "step": 100
130
+ },
131
+ {
132
+ "epoch": 0.17,
133
+ "learning_rate": 9.900000000000001e-05,
134
+ "loss": 2.183,
135
+ "step": 105
136
+ },
137
+ {
138
+ "epoch": 0.18,
139
+ "learning_rate": 9.8e-05,
140
+ "loss": 2.1712,
141
+ "step": 110
142
+ },
143
+ {
144
+ "epoch": 0.19,
145
+ "learning_rate": 9.7e-05,
146
+ "loss": 2.1713,
147
+ "step": 115
148
+ },
149
+ {
150
+ "epoch": 0.2,
151
+ "learning_rate": 9.6e-05,
152
+ "loss": 2.1731,
153
+ "step": 120
154
+ },
155
+ {
156
+ "epoch": 0.21,
157
+ "learning_rate": 9.5e-05,
158
+ "loss": 2.1565,
159
+ "step": 125
160
+ },
161
+ {
162
+ "epoch": 0.22,
163
+ "learning_rate": 9.4e-05,
164
+ "loss": 2.1575,
165
+ "step": 130
166
+ },
167
+ {
168
+ "epoch": 0.22,
169
+ "learning_rate": 9.300000000000001e-05,
170
+ "loss": 2.1414,
171
+ "step": 135
172
+ },
173
+ {
174
+ "epoch": 0.23,
175
+ "learning_rate": 9.200000000000001e-05,
176
+ "loss": 2.1384,
177
+ "step": 140
178
+ },
179
+ {
180
+ "epoch": 0.24,
181
+ "learning_rate": 9.1e-05,
182
+ "loss": 2.1487,
183
+ "step": 145
184
+ },
185
+ {
186
+ "epoch": 0.25,
187
+ "learning_rate": 9e-05,
188
+ "loss": 2.1283,
189
+ "step": 150
190
+ },
191
+ {
192
+ "epoch": 0.25,
193
+ "eval_accuracy": 0.6017748783939872,
194
+ "eval_loss": 2.089164972305298,
195
+ "eval_runtime": 58.1804,
196
+ "eval_samples_per_second": 5.156,
197
+ "eval_steps_per_second": 1.289,
198
+ "step": 150
199
+ },
200
+ {
201
+ "epoch": 0.26,
202
+ "learning_rate": 8.900000000000001e-05,
203
+ "loss": 2.1224,
204
+ "step": 155
205
+ },
206
+ {
207
+ "epoch": 0.27,
208
+ "learning_rate": 8.800000000000001e-05,
209
+ "loss": 2.144,
210
+ "step": 160
211
+ },
212
+ {
213
+ "epoch": 0.27,
214
+ "learning_rate": 8.7e-05,
215
+ "loss": 2.1293,
216
+ "step": 165
217
+ },
218
+ {
219
+ "epoch": 0.28,
220
+ "learning_rate": 8.6e-05,
221
+ "loss": 2.1417,
222
+ "step": 170
223
+ },
224
+ {
225
+ "epoch": 0.29,
226
+ "learning_rate": 8.5e-05,
227
+ "loss": 2.1282,
228
+ "step": 175
229
+ },
230
+ {
231
+ "epoch": 0.3,
232
+ "learning_rate": 8.4e-05,
233
+ "loss": 2.1008,
234
+ "step": 180
235
+ },
236
+ {
237
+ "epoch": 0.31,
238
+ "learning_rate": 8.3e-05,
239
+ "loss": 2.1218,
240
+ "step": 185
241
+ },
242
+ {
243
+ "epoch": 0.32,
244
+ "learning_rate": 8.2e-05,
245
+ "loss": 2.1292,
246
+ "step": 190
247
+ },
248
+ {
249
+ "epoch": 0.32,
250
+ "learning_rate": 8.1e-05,
251
+ "loss": 2.114,
252
+ "step": 195
253
+ },
254
+ {
255
+ "epoch": 0.33,
256
+ "learning_rate": 8e-05,
257
+ "loss": 2.1135,
258
+ "step": 200
259
+ },
260
+ {
261
+ "epoch": 0.34,
262
+ "learning_rate": 7.900000000000001e-05,
263
+ "loss": 2.118,
264
+ "step": 205
265
+ },
266
+ {
267
+ "epoch": 0.35,
268
+ "learning_rate": 7.800000000000001e-05,
269
+ "loss": 2.1231,
270
+ "step": 210
271
+ },
272
+ {
273
+ "epoch": 0.36,
274
+ "learning_rate": 7.7e-05,
275
+ "loss": 2.0705,
276
+ "step": 215
277
+ },
278
+ {
279
+ "epoch": 0.37,
280
+ "learning_rate": 7.6e-05,
281
+ "loss": 2.11,
282
+ "step": 220
283
+ },
284
+ {
285
+ "epoch": 0.37,
286
+ "learning_rate": 7.500000000000001e-05,
287
+ "loss": 2.0866,
288
+ "step": 225
289
+ },
290
+ {
291
+ "epoch": 0.38,
292
+ "learning_rate": 7.4e-05,
293
+ "loss": 2.1115,
294
+ "step": 230
295
+ },
296
+ {
297
+ "epoch": 0.39,
298
+ "learning_rate": 7.3e-05,
299
+ "loss": 2.1069,
300
+ "step": 235
301
+ },
302
+ {
303
+ "epoch": 0.4,
304
+ "learning_rate": 7.2e-05,
305
+ "loss": 2.1083,
306
+ "step": 240
307
+ },
308
+ {
309
+ "epoch": 0.41,
310
+ "learning_rate": 7.1e-05,
311
+ "loss": 2.1014,
312
+ "step": 245
313
+ },
314
+ {
315
+ "epoch": 0.42,
316
+ "learning_rate": 7e-05,
317
+ "loss": 2.1029,
318
+ "step": 250
319
+ },
320
+ {
321
+ "epoch": 0.42,
322
+ "learning_rate": 6.9e-05,
323
+ "loss": 2.0846,
324
+ "step": 255
325
+ },
326
+ {
327
+ "epoch": 0.43,
328
+ "learning_rate": 6.800000000000001e-05,
329
+ "loss": 2.1059,
330
+ "step": 260
331
+ },
332
+ {
333
+ "epoch": 0.44,
334
+ "learning_rate": 6.7e-05,
335
+ "loss": 2.0875,
336
+ "step": 265
337
+ },
338
+ {
339
+ "epoch": 0.45,
340
+ "learning_rate": 6.6e-05,
341
+ "loss": 2.0973,
342
+ "step": 270
343
+ },
344
+ {
345
+ "epoch": 0.46,
346
+ "learning_rate": 6.500000000000001e-05,
347
+ "loss": 2.0932,
348
+ "step": 275
349
+ },
350
+ {
351
+ "epoch": 0.47,
352
+ "learning_rate": 6.400000000000001e-05,
353
+ "loss": 2.0721,
354
+ "step": 280
355
+ },
356
+ {
357
+ "epoch": 0.47,
358
+ "learning_rate": 6.3e-05,
359
+ "loss": 2.0892,
360
+ "step": 285
361
+ },
362
+ {
363
+ "epoch": 0.48,
364
+ "learning_rate": 6.2e-05,
365
+ "loss": 2.1002,
366
+ "step": 290
367
+ },
368
+ {
369
+ "epoch": 0.49,
370
+ "learning_rate": 6.1e-05,
371
+ "loss": 2.0804,
372
+ "step": 295
373
+ },
374
+ {
375
+ "epoch": 0.5,
376
+ "learning_rate": 6e-05,
377
+ "loss": 2.0999,
378
+ "step": 300
379
+ },
380
+ {
381
+ "epoch": 0.5,
382
+ "eval_accuracy": 0.6084190234842455,
383
+ "eval_loss": 2.038738965988159,
384
+ "eval_runtime": 68.8578,
385
+ "eval_samples_per_second": 4.357,
386
+ "eval_steps_per_second": 1.089,
387
+ "step": 300
388
+ },
389
+ {
390
+ "epoch": 0.51,
391
+ "learning_rate": 5.9e-05,
392
+ "loss": 2.0905,
393
+ "step": 305
394
+ },
395
+ {
396
+ "epoch": 0.52,
397
+ "learning_rate": 5.8e-05,
398
+ "loss": 2.046,
399
+ "step": 310
400
+ },
401
+ {
402
+ "epoch": 0.52,
403
+ "learning_rate": 5.6999999999999996e-05,
404
+ "loss": 2.0794,
405
+ "step": 315
406
+ },
407
+ {
408
+ "epoch": 0.53,
409
+ "learning_rate": 5.6000000000000006e-05,
410
+ "loss": 2.0714,
411
+ "step": 320
412
+ },
413
+ {
414
+ "epoch": 0.54,
415
+ "learning_rate": 5.500000000000001e-05,
416
+ "loss": 2.0711,
417
+ "step": 325
418
+ },
419
+ {
420
+ "epoch": 0.55,
421
+ "learning_rate": 5.4000000000000005e-05,
422
+ "loss": 2.0525,
423
+ "step": 330
424
+ },
425
+ {
426
+ "epoch": 0.56,
427
+ "learning_rate": 5.300000000000001e-05,
428
+ "loss": 2.0689,
429
+ "step": 335
430
+ },
431
+ {
432
+ "epoch": 0.57,
433
+ "learning_rate": 5.2000000000000004e-05,
434
+ "loss": 2.0936,
435
+ "step": 340
436
+ },
437
+ {
438
+ "epoch": 0.57,
439
+ "learning_rate": 5.1000000000000006e-05,
440
+ "loss": 2.062,
441
+ "step": 345
442
+ },
443
+ {
444
+ "epoch": 0.58,
445
+ "learning_rate": 5e-05,
446
+ "loss": 2.0621,
447
+ "step": 350
448
+ },
449
+ {
450
+ "epoch": 0.59,
451
+ "learning_rate": 4.9e-05,
452
+ "loss": 2.0662,
453
+ "step": 355
454
+ },
455
+ {
456
+ "epoch": 0.6,
457
+ "learning_rate": 4.8e-05,
458
+ "loss": 2.0779,
459
+ "step": 360
460
+ },
461
+ {
462
+ "epoch": 0.61,
463
+ "learning_rate": 4.7e-05,
464
+ "loss": 2.0773,
465
+ "step": 365
466
+ },
467
+ {
468
+ "epoch": 0.62,
469
+ "learning_rate": 4.600000000000001e-05,
470
+ "loss": 2.0407,
471
+ "step": 370
472
+ },
473
+ {
474
+ "epoch": 0.62,
475
+ "learning_rate": 4.5e-05,
476
+ "loss": 2.0603,
477
+ "step": 375
478
+ },
479
+ {
480
+ "epoch": 0.63,
481
+ "learning_rate": 4.4000000000000006e-05,
482
+ "loss": 2.0556,
483
+ "step": 380
484
+ },
485
+ {
486
+ "epoch": 0.64,
487
+ "learning_rate": 4.3e-05,
488
+ "loss": 2.0742,
489
+ "step": 385
490
+ },
491
+ {
492
+ "epoch": 0.65,
493
+ "learning_rate": 4.2e-05,
494
+ "loss": 2.0444,
495
+ "step": 390
496
+ },
497
+ {
498
+ "epoch": 0.66,
499
+ "learning_rate": 4.1e-05,
500
+ "loss": 2.0409,
501
+ "step": 395
502
+ },
503
+ {
504
+ "epoch": 0.67,
505
+ "learning_rate": 4e-05,
506
+ "loss": 2.0539,
507
+ "step": 400
508
+ },
509
+ {
510
+ "epoch": 0.67,
511
+ "learning_rate": 3.9000000000000006e-05,
512
+ "loss": 2.0696,
513
+ "step": 405
514
+ },
515
+ {
516
+ "epoch": 0.68,
517
+ "learning_rate": 3.8e-05,
518
+ "loss": 2.0614,
519
+ "step": 410
520
+ },
521
+ {
522
+ "epoch": 0.69,
523
+ "learning_rate": 3.7e-05,
524
+ "loss": 2.0604,
525
+ "step": 415
526
+ },
527
+ {
528
+ "epoch": 0.7,
529
+ "learning_rate": 3.6e-05,
530
+ "loss": 2.0608,
531
+ "step": 420
532
+ },
533
+ {
534
+ "epoch": 0.71,
535
+ "learning_rate": 3.5e-05,
536
+ "loss": 2.0412,
537
+ "step": 425
538
+ },
539
+ {
540
+ "epoch": 0.72,
541
+ "learning_rate": 3.4000000000000007e-05,
542
+ "loss": 2.0558,
543
+ "step": 430
544
+ },
545
+ {
546
+ "epoch": 0.72,
547
+ "learning_rate": 3.3e-05,
548
+ "loss": 2.0338,
549
+ "step": 435
550
+ },
551
+ {
552
+ "epoch": 0.73,
553
+ "learning_rate": 3.2000000000000005e-05,
554
+ "loss": 2.0322,
555
+ "step": 440
556
+ },
557
+ {
558
+ "epoch": 0.74,
559
+ "learning_rate": 3.1e-05,
560
+ "loss": 2.0511,
561
+ "step": 445
562
+ },
563
+ {
564
+ "epoch": 0.75,
565
+ "learning_rate": 3e-05,
566
+ "loss": 2.0595,
567
+ "step": 450
568
+ },
569
+ {
570
+ "epoch": 0.75,
571
+ "eval_accuracy": 0.6142522292024847,
572
+ "eval_loss": 1.9970886707305908,
573
+ "eval_runtime": 61.0863,
574
+ "eval_samples_per_second": 4.911,
575
+ "eval_steps_per_second": 1.228,
576
+ "step": 450
577
+ },
578
+ {
579
+ "epoch": 0.76,
580
+ "learning_rate": 2.9e-05,
581
+ "loss": 2.032,
582
+ "step": 455
583
+ },
584
+ {
585
+ "epoch": 0.77,
586
+ "learning_rate": 2.8000000000000003e-05,
587
+ "loss": 2.0421,
588
+ "step": 460
589
+ },
590
+ {
591
+ "epoch": 0.77,
592
+ "learning_rate": 2.7000000000000002e-05,
593
+ "loss": 2.0255,
594
+ "step": 465
595
+ },
596
+ {
597
+ "epoch": 0.78,
598
+ "learning_rate": 2.6000000000000002e-05,
599
+ "loss": 2.0259,
600
+ "step": 470
601
+ },
602
+ {
603
+ "epoch": 0.79,
604
+ "learning_rate": 2.5e-05,
605
+ "loss": 2.0361,
606
+ "step": 475
607
+ },
608
+ {
609
+ "epoch": 0.8,
610
+ "learning_rate": 2.4e-05,
611
+ "loss": 2.0584,
612
+ "step": 480
613
+ },
614
+ {
615
+ "epoch": 0.81,
616
+ "learning_rate": 2.3000000000000003e-05,
617
+ "loss": 2.0589,
618
+ "step": 485
619
+ },
620
+ {
621
+ "epoch": 0.82,
622
+ "learning_rate": 2.2000000000000003e-05,
623
+ "loss": 2.0555,
624
+ "step": 490
625
+ },
626
+ {
627
+ "epoch": 0.82,
628
+ "learning_rate": 2.1e-05,
629
+ "loss": 2.0413,
630
+ "step": 495
631
+ },
632
+ {
633
+ "epoch": 0.83,
634
+ "learning_rate": 2e-05,
635
+ "loss": 2.0097,
636
+ "step": 500
637
+ },
638
+ {
639
+ "epoch": 0.84,
640
+ "learning_rate": 1.9e-05,
641
+ "loss": 2.0466,
642
+ "step": 505
643
+ },
644
+ {
645
+ "epoch": 0.85,
646
+ "learning_rate": 1.8e-05,
647
+ "loss": 2.0572,
648
+ "step": 510
649
+ },
650
+ {
651
+ "epoch": 0.86,
652
+ "learning_rate": 1.7000000000000003e-05,
653
+ "loss": 2.0376,
654
+ "step": 515
655
+ },
656
+ {
657
+ "epoch": 0.87,
658
+ "learning_rate": 1.6000000000000003e-05,
659
+ "loss": 2.0202,
660
+ "step": 520
661
+ },
662
+ {
663
+ "epoch": 0.87,
664
+ "learning_rate": 1.5e-05,
665
+ "loss": 2.0643,
666
+ "step": 525
667
+ },
668
+ {
669
+ "epoch": 0.88,
670
+ "learning_rate": 1.4000000000000001e-05,
671
+ "loss": 2.0201,
672
+ "step": 530
673
+ },
674
+ {
675
+ "epoch": 0.89,
676
+ "learning_rate": 1.3000000000000001e-05,
677
+ "loss": 2.0356,
678
+ "step": 535
679
+ },
680
+ {
681
+ "epoch": 0.9,
682
+ "learning_rate": 1.2e-05,
683
+ "loss": 2.0255,
684
+ "step": 540
685
+ },
686
+ {
687
+ "epoch": 0.91,
688
+ "learning_rate": 1.1000000000000001e-05,
689
+ "loss": 2.0374,
690
+ "step": 545
691
+ },
692
+ {
693
+ "epoch": 0.92,
694
+ "learning_rate": 1e-05,
695
+ "loss": 2.0385,
696
+ "step": 550
697
+ },
698
+ {
699
+ "epoch": 0.92,
700
+ "learning_rate": 9e-06,
701
+ "loss": 2.024,
702
+ "step": 555
703
+ },
704
+ {
705
+ "epoch": 0.93,
706
+ "learning_rate": 8.000000000000001e-06,
707
+ "loss": 2.0144,
708
+ "step": 560
709
+ },
710
+ {
711
+ "epoch": 0.94,
712
+ "learning_rate": 7.000000000000001e-06,
713
+ "loss": 2.04,
714
+ "step": 565
715
+ },
716
+ {
717
+ "epoch": 0.95,
718
+ "learning_rate": 6e-06,
719
+ "loss": 2.0372,
720
+ "step": 570
721
+ },
722
+ {
723
+ "epoch": 0.96,
724
+ "learning_rate": 5e-06,
725
+ "loss": 2.0122,
726
+ "step": 575
727
+ },
728
+ {
729
+ "epoch": 0.97,
730
+ "learning_rate": 4.000000000000001e-06,
731
+ "loss": 2.0182,
732
+ "step": 580
733
+ },
734
+ {
735
+ "epoch": 0.97,
736
+ "learning_rate": 3e-06,
737
+ "loss": 2.0323,
738
+ "step": 585
739
+ },
740
+ {
741
+ "epoch": 0.98,
742
+ "learning_rate": 2.0000000000000003e-06,
743
+ "loss": 2.0298,
744
+ "step": 590
745
+ },
746
+ {
747
+ "epoch": 0.99,
748
+ "learning_rate": 1.0000000000000002e-06,
749
+ "loss": 2.0212,
750
+ "step": 595
751
+ },
752
+ {
753
+ "epoch": 1.0,
754
+ "learning_rate": 0.0,
755
+ "loss": 2.0481,
756
+ "step": 600
757
+ },
758
+ {
759
+ "epoch": 1.0,
760
+ "eval_accuracy": 0.6152243190218628,
761
+ "eval_loss": 1.9892692565917969,
762
+ "eval_runtime": 64.3793,
763
+ "eval_samples_per_second": 4.66,
764
+ "eval_steps_per_second": 1.165,
765
+ "step": 600
766
+ },
767
+ {
768
+ "epoch": 1.0,
769
+ "step": 600,
770
+ "total_flos": 5.41006975991808e+16,
771
+ "train_loss": 2.32403786500295,
772
+ "train_runtime": 14945.8713,
773
+ "train_samples_per_second": 2.572,
774
+ "train_steps_per_second": 0.04
775
+ }
776
+ ],
777
+ "logging_steps": 5,
778
+ "max_steps": 600,
779
+ "num_input_tokens_seen": 0,
780
+ "num_train_epochs": 1,
781
+ "save_steps": 100,
782
+ "total_flos": 5.41006975991808e+16,
783
+ "train_batch_size": 4,
784
+ "trial_name": null,
785
+ "trial_params": null
786
+ }