Joshua Lochner committed on
Commit
6d40f24
1 Parent(s): f42ed7c

Next training iteration (2.03m)

Browse files
added_tokens.json CHANGED
@@ -1 +1 @@
1
- {"NO_SEGMENT_TOKEN": 32112, "BETWEEN_SEGMENTS_TOKEN": 32111, "NUMBER_TOKEN": 32103, "START_SPONSOR_TOKEN": 32113, "START_SELFPROMO_TOKEN": 32115, "END_SPONSOR_TOKEN": 32114, "END_INTERACTION_TOKEN": 32118, "[Applause]": 32107, "HYPHENATED_URL_TOKEN": 32101, "EXTRACT_SEGMENTS: ": 32110, "SHORT_HYPHENATED_TOKEN": 32104, "END_SELFPROMO_TOKEN": 32116, "NUMBER_PERCENTAGE_TOKEN": 32102, "PROFANITY_TOKEN": 32109, "[Music]": 32106, "[Laughter]": 32108, "LONG_WORD_TOKEN": 32105, "URL_TOKEN": 32100, "START_INTERACTION_TOKEN": 32117}
 
1
+ {"PROFANITY_TOKEN": 32109, "END_INTERACTION_TOKEN": 32118, "URL_TOKEN": 32100, "EXTRACT_SEGMENTS: ": 32110, "NUMBER_TOKEN": 32103, "NUMBER_PERCENTAGE_TOKEN": 32102, "LONG_WORD_TOKEN": 32105, "END_SPONSOR_TOKEN": 32114, "START_SELFPROMO_TOKEN": 32115, "END_SELFPROMO_TOKEN": 32116, "NO_SEGMENT_TOKEN": 32112, "HYPHENATED_URL_TOKEN": 32101, "START_SPONSOR_TOKEN": 32113, "[Music]": 32106, "[Laughter]": 32108, "BETWEEN_SEGMENTS_TOKEN": 32111, "[Applause]": 32107, "START_INTERACTION_TOKEN": 32117, "SHORT_HYPHENATED_TOKEN": 32104}
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "models/small-120k/",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
 
1
  {
2
+ "_name_or_path": "models/small-1712500/",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d8a0af26f7ff16bae5ef0583063c69ee47700be6a4793c994a3ef6e83d117b8
3
  size 307892869
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49525bef46229b9dd14d256aab3b299a9abc1fe2f92316ca7465eda817803842
3
  size 307892869
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8191c03018e0f166e6e407d71defd3cbfe5dcae86d404ff6a5903f7b765d8ac
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6019e275337380401ff115e5b151ec62b90d64182c9e491d19f52523b075b522
3
  size 14503
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:257198b9da6abece4de57c5b798cb130a628ec21c36fad1093e79d27e917992e
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeafd2e8b1e45dc5c6795cc90be9dfd1344b28195e3f803fd812afbd480b4467
3
  size 623
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "name_or_path": "models/small-120k/", "special_tokens_map_file": 
"C:\\Users\\joshu/.cache\\huggingface\\transformers\\3ad6f8335c1b1ef8966245899d47dcf735abd134d21fd7d26f621fe45ac01184.c94798918c92ded6aeef2d2f0e666d2cc4145eca1aa6e1336fde07f2e13e2f46", "sp_model_kwargs": {}, "tokenizer_class": "T5Tokenizer"}
 
1
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "name_or_path": "models/small-1712500/", "special_tokens_map_file": 
"C:\\Users\\joshu/.cache\\huggingface\\transformers\\3ad6f8335c1b1ef8966245899d47dcf735abd134d21fd7d26f621fe45ac01184.c94798918c92ded6aeef2d2f0e666d2cc4145eca1aa6e1336fde07f2e13e2f46", "sp_model_kwargs": {}, "max_length": 512, "tokenizer_class": "T5Tokenizer"}
trainer_state.json CHANGED
@@ -1,1116 +1,236 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.9405478794478936,
5
- "global_step": 1100000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.04,
12
- "learning_rate": 4.8208841872978234e-05,
13
- "loss": 0.0755,
14
- "step": 10000
15
  },
16
  {
17
- "epoch": 0.07,
18
- "learning_rate": 4.6417683745956466e-05,
19
- "loss": 0.0721,
20
- "step": 20000
21
- },
22
- {
23
- "epoch": 0.07,
24
- "eval_loss": 0.06494459509849548,
25
- "eval_runtime": 754.7473,
26
- "eval_samples_per_second": 82.19,
27
- "eval_steps_per_second": 20.549,
28
- "step": 20000
29
  },
30
  {
31
  "epoch": 0.11,
32
- "learning_rate": 4.462652561893469e-05,
33
- "loss": 0.0704,
34
- "step": 30000
 
 
35
  },
36
  {
37
- "epoch": 0.14,
38
- "learning_rate": 4.2835367491912923e-05,
39
- "loss": 0.0701,
40
- "step": 40000
41
  },
42
  {
43
- "epoch": 0.14,
44
- "eval_loss": 0.06424280256032944,
45
- "eval_runtime": 676.9024,
46
- "eval_samples_per_second": 91.642,
47
- "eval_steps_per_second": 22.912,
48
- "step": 40000
49
  },
50
  {
51
- "epoch": 0.18,
52
- "learning_rate": 4.104420936489115e-05,
53
- "loss": 0.0713,
54
- "step": 50000
 
 
55
  },
56
  {
57
- "epoch": 0.21,
58
- "learning_rate": 3.925305123786938e-05,
59
- "loss": 0.0712,
60
- "step": 60000
61
  },
62
  {
63
- "epoch": 0.21,
64
- "eval_loss": 0.0624563954770565,
65
- "eval_runtime": 678.8069,
66
- "eval_samples_per_second": 91.385,
67
- "eval_steps_per_second": 22.847,
68
- "step": 60000
69
  },
70
  {
71
- "epoch": 0.25,
72
- "learning_rate": 3.746189311084761e-05,
73
- "loss": 0.0698,
74
- "step": 70000
 
 
75
  },
76
  {
77
- "epoch": 0.29,
78
- "learning_rate": 3.5670734983825844e-05,
79
- "loss": 0.0667,
80
- "step": 80000
81
  },
82
  {
83
- "epoch": 0.29,
84
- "eval_loss": 0.06348983943462372,
85
- "eval_runtime": 675.6836,
86
- "eval_samples_per_second": 91.808,
87
- "eval_steps_per_second": 22.953,
88
- "step": 80000
89
  },
90
  {
91
- "epoch": 0.32,
92
- "learning_rate": 3.3879576856804076e-05,
93
- "loss": 0.0657,
94
- "step": 90000
 
 
95
  },
96
  {
97
- "epoch": 0.36,
98
- "learning_rate": 3.20884187297823e-05,
99
- "loss": 0.0665,
100
- "step": 100000
101
  },
102
  {
103
- "epoch": 0.36,
104
- "eval_loss": 0.06186612322926521,
105
- "eval_runtime": 676.1915,
106
- "eval_samples_per_second": 91.739,
107
- "eval_steps_per_second": 22.936,
108
- "step": 100000
109
  },
110
  {
111
- "epoch": 0.39,
112
- "learning_rate": 3.0297260602760537e-05,
113
- "loss": 0.0657,
114
- "step": 110000
 
 
115
  },
116
  {
117
- "epoch": 0.43,
118
- "learning_rate": 2.8506102475738766e-05,
119
- "loss": 0.0653,
120
- "step": 120000
121
  },
122
  {
123
- "epoch": 0.43,
124
- "eval_loss": 0.061661478132009506,
125
- "eval_runtime": 677.4648,
126
- "eval_samples_per_second": 91.566,
127
- "eval_steps_per_second": 22.893,
128
  "step": 120000
129
  },
130
  {
131
- "epoch": 0.47,
132
- "learning_rate": 2.6714944348716997e-05,
133
- "loss": 0.0658,
134
- "step": 130000
135
- },
136
- {
137
- "epoch": 0.5,
138
- "learning_rate": 2.4923786221695226e-05,
139
- "loss": 0.0656,
140
- "step": 140000
141
  },
142
  {
143
- "epoch": 0.5,
144
- "eval_loss": 0.06098250672221184,
145
- "eval_runtime": 677.3485,
146
- "eval_samples_per_second": 91.582,
147
- "eval_steps_per_second": 22.897,
148
  "step": 140000
149
  },
150
  {
151
- "epoch": 0.54,
152
- "learning_rate": 2.3132628094673455e-05,
153
- "loss": 0.0649,
154
- "step": 150000
155
- },
156
- {
157
- "epoch": 0.57,
158
- "learning_rate": 2.1341469967651687e-05,
159
- "loss": 0.0648,
160
  "step": 160000
161
  },
162
  {
163
- "epoch": 0.57,
164
- "eval_loss": 0.06077565252780914,
165
- "eval_runtime": 677.1708,
166
- "eval_samples_per_second": 91.606,
167
- "eval_steps_per_second": 22.903,
168
  "step": 160000
169
  },
170
  {
171
- "epoch": 0.61,
172
- "learning_rate": 1.9550311840629915e-05,
173
- "loss": 0.0642,
174
- "step": 170000
175
- },
176
- {
177
- "epoch": 0.64,
178
- "learning_rate": 1.7759153713608147e-05,
179
- "loss": 0.0649,
180
- "step": 180000
181
- },
182
- {
183
- "epoch": 0.64,
184
- "eval_loss": 0.05941811203956604,
185
- "eval_runtime": 677.9109,
186
- "eval_samples_per_second": 91.506,
187
- "eval_steps_per_second": 22.878,
188
  "step": 180000
189
  },
190
  {
191
- "epoch": 0.68,
192
- "learning_rate": 1.5967995586586376e-05,
193
- "loss": 0.0652,
194
- "step": 190000
195
- },
196
- {
197
- "epoch": 0.72,
198
- "learning_rate": 1.4176837459564608e-05,
199
- "loss": 0.0638,
200
  "step": 200000
201
  },
202
  {
203
- "epoch": 0.72,
204
- "eval_loss": 0.06042506918311119,
205
- "eval_runtime": 678.1922,
206
- "eval_samples_per_second": 91.468,
207
- "eval_steps_per_second": 22.868,
208
  "step": 200000
209
  },
210
  {
211
- "epoch": 0.75,
212
- "learning_rate": 1.2385679332542836e-05,
213
- "loss": 0.0621,
214
- "step": 210000
215
- },
216
- {
217
- "epoch": 0.79,
218
- "learning_rate": 1.0594521205521066e-05,
219
- "loss": 0.0643,
220
- "step": 220000
221
- },
222
- {
223
- "epoch": 0.79,
224
- "eval_loss": 0.059569716453552246,
225
- "eval_runtime": 688.4187,
226
- "eval_samples_per_second": 90.109,
227
- "eval_steps_per_second": 22.528,
228
  "step": 220000
229
  },
230
  {
231
- "epoch": 0.82,
232
- "learning_rate": 8.803363078499297e-06,
233
- "loss": 0.0623,
234
- "step": 230000
235
- },
236
- {
237
- "epoch": 0.86,
238
- "learning_rate": 7.012204951477527e-06,
239
- "loss": 0.0646,
240
  "step": 240000
241
  },
242
  {
243
- "epoch": 0.86,
244
- "eval_loss": 0.059120796620845795,
245
- "eval_runtime": 715.8659,
246
- "eval_samples_per_second": 86.654,
247
- "eval_steps_per_second": 21.665,
248
  "step": 240000
249
  },
250
  {
251
- "epoch": 0.9,
252
- "learning_rate": 4.5522104682445576e-05,
253
- "loss": 0.0634,
254
- "step": 250000
255
- },
256
- {
257
- "epoch": 0.93,
258
- "learning_rate": 4.53429888697434e-05,
259
- "loss": 0.0648,
260
  "step": 260000
261
  },
262
  {
263
- "epoch": 0.93,
264
- "eval_loss": 0.06051425263285637,
265
- "eval_runtime": 694.96,
266
- "eval_samples_per_second": 89.261,
267
- "eval_steps_per_second": 22.316,
268
- "step": 260000
269
- },
270
- {
271
- "epoch": 0.97,
272
- "learning_rate": 4.516387305704122e-05,
273
- "loss": 0.0644,
274
- "step": 270000
275
- },
276
- {
277
- "epoch": 1.0,
278
- "learning_rate": 4.4984757244339046e-05,
279
- "loss": 0.0655,
280
  "step": 280000
281
  },
282
  {
283
- "epoch": 1.0,
284
- "eval_loss": 0.06069951504468918,
285
- "eval_runtime": 679.0968,
286
- "eval_samples_per_second": 91.346,
287
- "eval_steps_per_second": 22.838,
288
  "step": 280000
289
  },
290
  {
291
- "epoch": 1.04,
292
- "learning_rate": 4.4805641431636866e-05,
293
- "loss": 0.0652,
294
- "step": 290000
295
- },
296
- {
297
- "epoch": 1.07,
298
- "learning_rate": 4.462652561893469e-05,
299
- "loss": 0.0631,
300
- "step": 300000
301
- },
302
- {
303
- "epoch": 1.07,
304
- "eval_loss": 0.06102127209305763,
305
- "eval_runtime": 705.9715,
306
- "eval_samples_per_second": 87.869,
307
- "eval_steps_per_second": 21.968,
308
  "step": 300000
309
  },
310
  {
311
- "epoch": 1.11,
312
- "learning_rate": 4.444740980623252e-05,
313
- "loss": 0.064,
314
- "step": 310000
315
- },
316
- {
317
- "epoch": 1.15,
318
- "learning_rate": 4.426829399353034e-05,
319
- "loss": 0.0643,
320
  "step": 320000
321
  },
322
  {
323
- "epoch": 1.15,
324
- "eval_loss": 0.05971848592162132,
325
- "eval_runtime": 698.4495,
326
- "eval_samples_per_second": 88.815,
327
- "eval_steps_per_second": 22.205,
328
  "step": 320000
329
- },
330
- {
331
- "epoch": 1.18,
332
- "learning_rate": 4.408917818082816e-05,
333
- "loss": 0.0638,
334
- "step": 330000
335
- },
336
- {
337
- "epoch": 1.22,
338
- "learning_rate": 4.391006236812599e-05,
339
- "loss": 0.064,
340
- "step": 340000
341
- },
342
- {
343
- "epoch": 1.22,
344
- "eval_loss": 0.05945688858628273,
345
- "eval_runtime": 700.9514,
346
- "eval_samples_per_second": 88.498,
347
- "eval_steps_per_second": 22.126,
348
- "step": 340000
349
- },
350
- {
351
- "epoch": 1.25,
352
- "learning_rate": 4.373094655542381e-05,
353
- "loss": 0.0613,
354
- "step": 350000
355
- },
356
- {
357
- "epoch": 1.29,
358
- "learning_rate": 4.355183074272163e-05,
359
- "loss": 0.0635,
360
- "step": 360000
361
- },
362
- {
363
- "epoch": 1.29,
364
- "eval_loss": 0.06002034246921539,
365
- "eval_runtime": 697.4531,
366
- "eval_samples_per_second": 88.942,
367
- "eval_steps_per_second": 22.237,
368
- "step": 360000
369
- },
370
- {
371
- "epoch": 1.33,
372
- "learning_rate": 4.337271493001945e-05,
373
- "loss": 0.0635,
374
- "step": 370000
375
- },
376
- {
377
- "epoch": 1.36,
378
- "learning_rate": 4.319359911731728e-05,
379
- "loss": 0.0627,
380
- "step": 380000
381
- },
382
- {
383
- "epoch": 1.36,
384
- "eval_loss": 0.0593414306640625,
385
- "eval_runtime": 720.1404,
386
- "eval_samples_per_second": 86.14,
387
- "eval_steps_per_second": 21.536,
388
- "step": 380000
389
- },
390
- {
391
- "epoch": 1.4,
392
- "learning_rate": 4.30144833046151e-05,
393
- "loss": 0.0629,
394
- "step": 390000
395
- },
396
- {
397
- "epoch": 1.43,
398
- "learning_rate": 4.2835367491912923e-05,
399
- "loss": 0.0642,
400
- "step": 400000
401
- },
402
- {
403
- "epoch": 1.43,
404
- "eval_loss": 0.059706129133701324,
405
- "eval_runtime": 706.6838,
406
- "eval_samples_per_second": 87.78,
407
- "eval_steps_per_second": 21.946,
408
- "step": 400000
409
- },
410
- {
411
- "epoch": 1.47,
412
- "learning_rate": 4.265625167921074e-05,
413
- "loss": 0.062,
414
- "step": 410000
415
- },
416
- {
417
- "epoch": 1.5,
418
- "learning_rate": 4.247713586650857e-05,
419
- "loss": 0.0616,
420
- "step": 420000
421
- },
422
- {
423
- "epoch": 1.5,
424
- "eval_loss": 0.058674536645412445,
425
- "eval_runtime": 740.0142,
426
- "eval_samples_per_second": 83.827,
427
- "eval_steps_per_second": 20.958,
428
- "step": 420000
429
- },
430
- {
431
- "epoch": 1.54,
432
- "learning_rate": 4.2298020053806394e-05,
433
- "loss": 0.0611,
434
- "step": 430000
435
- },
436
- {
437
- "epoch": 1.58,
438
- "learning_rate": 4.211890424110422e-05,
439
- "loss": 0.0618,
440
- "step": 440000
441
- },
442
- {
443
- "epoch": 1.58,
444
- "eval_loss": 0.05948682501912117,
445
- "eval_runtime": 698.4731,
446
- "eval_samples_per_second": 88.812,
447
- "eval_steps_per_second": 22.204,
448
- "step": 440000
449
- },
450
- {
451
- "epoch": 1.61,
452
- "learning_rate": 4.193978842840204e-05,
453
- "loss": 0.062,
454
- "step": 450000
455
- },
456
- {
457
- "epoch": 1.65,
458
- "learning_rate": 4.176067261569986e-05,
459
- "loss": 0.0603,
460
- "step": 460000
461
- },
462
- {
463
- "epoch": 1.65,
464
- "eval_loss": 0.059506043791770935,
465
- "eval_runtime": 699.0665,
466
- "eval_samples_per_second": 88.737,
467
- "eval_steps_per_second": 22.185,
468
- "step": 460000
469
- },
470
- {
471
- "epoch": 1.68,
472
- "learning_rate": 4.1581556802997684e-05,
473
- "loss": 0.0606,
474
- "step": 470000
475
- },
476
- {
477
- "epoch": 1.72,
478
- "learning_rate": 4.1402440990295504e-05,
479
- "loss": 0.0596,
480
- "step": 480000
481
- },
482
- {
483
- "epoch": 1.72,
484
- "eval_loss": 0.06113377958536148,
485
- "eval_runtime": 687.5695,
486
- "eval_samples_per_second": 90.221,
487
- "eval_steps_per_second": 22.556,
488
- "step": 480000
489
- },
490
- {
491
- "epoch": 1.76,
492
- "learning_rate": 4.122332517759333e-05,
493
- "loss": 0.0604,
494
- "step": 490000
495
- },
496
- {
497
- "epoch": 1.79,
498
- "learning_rate": 4.104420936489115e-05,
499
- "loss": 0.0602,
500
- "step": 500000
501
- },
502
- {
503
- "epoch": 1.79,
504
- "eval_loss": 0.058628179132938385,
505
- "eval_runtime": 685.1917,
506
- "eval_samples_per_second": 90.534,
507
- "eval_steps_per_second": 22.635,
508
- "step": 500000
509
- },
510
- {
511
- "epoch": 1.83,
512
- "learning_rate": 4.0865093552188975e-05,
513
- "loss": 0.0588,
514
- "step": 510000
515
- },
516
- {
517
- "epoch": 1.86,
518
- "learning_rate": 4.0685977739486794e-05,
519
- "loss": 0.0603,
520
- "step": 520000
521
- },
522
- {
523
- "epoch": 1.86,
524
- "eval_loss": 0.058195825666189194,
525
- "eval_runtime": 684.8426,
526
- "eval_samples_per_second": 90.58,
527
- "eval_steps_per_second": 22.646,
528
- "step": 520000
529
- },
530
- {
531
- "epoch": 1.9,
532
- "learning_rate": 4.050686192678462e-05,
533
- "loss": 0.0606,
534
- "step": 530000
535
- },
536
- {
537
- "epoch": 1.93,
538
- "learning_rate": 4.0327746114082445e-05,
539
- "loss": 0.0603,
540
- "step": 540000
541
- },
542
- {
543
- "epoch": 1.93,
544
- "eval_loss": 0.059178441762924194,
545
- "eval_runtime": 686.71,
546
- "eval_samples_per_second": 90.334,
547
- "eval_steps_per_second": 22.584,
548
- "step": 540000
549
- },
550
- {
551
- "epoch": 1.97,
552
- "learning_rate": 4.014863030138027e-05,
553
- "loss": 0.0614,
554
- "step": 550000
555
- },
556
- {
557
- "epoch": 2.01,
558
- "learning_rate": 3.996951448867809e-05,
559
- "loss": 0.0596,
560
- "step": 560000
561
- },
562
- {
563
- "epoch": 2.01,
564
- "eval_loss": 0.05809653550386429,
565
- "eval_runtime": 686.1393,
566
- "eval_samples_per_second": 90.409,
567
- "eval_steps_per_second": 22.603,
568
- "step": 560000
569
- },
570
- {
571
- "epoch": 2.04,
572
- "learning_rate": 3.9790398675975916e-05,
573
- "loss": 0.0587,
574
- "step": 570000
575
- },
576
- {
577
- "epoch": 2.08,
578
- "learning_rate": 3.9611282863273736e-05,
579
- "loss": 0.0584,
580
- "step": 580000
581
- },
582
- {
583
- "epoch": 2.08,
584
- "eval_loss": 0.05719467252492905,
585
- "eval_runtime": 686.9168,
586
- "eval_samples_per_second": 90.306,
587
- "eval_steps_per_second": 22.578,
588
- "step": 580000
589
- },
590
- {
591
- "epoch": 2.11,
592
- "learning_rate": 3.943216705057156e-05,
593
- "loss": 0.0592,
594
- "step": 590000
595
- },
596
- {
597
- "epoch": 2.15,
598
- "learning_rate": 3.925305123786938e-05,
599
- "loss": 0.0592,
600
- "step": 600000
601
- },
602
- {
603
- "epoch": 2.15,
604
- "eval_loss": 0.05839799717068672,
605
- "eval_runtime": 691.5693,
606
- "eval_samples_per_second": 89.699,
607
- "eval_steps_per_second": 22.426,
608
- "step": 600000
609
- },
610
- {
611
- "epoch": 2.19,
612
- "learning_rate": 3.9073935425167206e-05,
613
- "loss": 0.0583,
614
- "step": 610000
615
- },
616
- {
617
- "epoch": 2.22,
618
- "learning_rate": 3.8894819612465026e-05,
619
- "loss": 0.0598,
620
- "step": 620000
621
- },
622
- {
623
- "epoch": 2.22,
624
- "eval_loss": 0.05699307098984718,
625
- "eval_runtime": 703.0844,
626
- "eval_samples_per_second": 88.23,
627
- "eval_steps_per_second": 22.059,
628
- "step": 620000
629
- },
630
- {
631
- "epoch": 2.26,
632
- "learning_rate": 3.871570379976285e-05,
633
- "loss": 0.0587,
634
- "step": 630000
635
- },
636
- {
637
- "epoch": 2.29,
638
- "learning_rate": 3.853658798706067e-05,
639
- "loss": 0.058,
640
- "step": 640000
641
- },
642
- {
643
- "epoch": 2.29,
644
- "eval_loss": 0.05786525830626488,
645
- "eval_runtime": 719.4112,
646
- "eval_samples_per_second": 86.227,
647
- "eval_steps_per_second": 21.558,
648
- "step": 640000
649
- },
650
- {
651
- "epoch": 2.33,
652
- "learning_rate": 3.8357472174358497e-05,
653
- "loss": 0.0579,
654
- "step": 650000
655
- },
656
- {
657
- "epoch": 2.36,
658
- "learning_rate": 3.817835636165632e-05,
659
- "loss": 0.0576,
660
- "step": 660000
661
- },
662
- {
663
- "epoch": 2.36,
664
- "eval_loss": 0.05708477646112442,
665
- "eval_runtime": 740.1092,
666
- "eval_samples_per_second": 83.816,
667
- "eval_steps_per_second": 20.955,
668
- "step": 660000
669
- },
670
- {
671
- "epoch": 2.4,
672
- "learning_rate": 3.799924054895415e-05,
673
- "loss": 0.0578,
674
- "step": 670000
675
- },
676
- {
677
- "epoch": 2.44,
678
- "learning_rate": 3.782012473625197e-05,
679
- "loss": 0.0586,
680
- "step": 680000
681
- },
682
- {
683
- "epoch": 2.44,
684
- "eval_loss": 0.05821482464671135,
685
- "eval_runtime": 697.0334,
686
- "eval_samples_per_second": 88.996,
687
- "eval_steps_per_second": 22.25,
688
- "step": 680000
689
- },
690
- {
691
- "epoch": 2.47,
692
- "learning_rate": 3.764100892354979e-05,
693
- "loss": 0.0579,
694
- "step": 690000
695
- },
696
- {
697
- "epoch": 2.51,
698
- "learning_rate": 3.746189311084761e-05,
699
- "loss": 0.0579,
700
- "step": 700000
701
- },
702
- {
703
- "epoch": 2.51,
704
- "eval_loss": 0.057255882769823074,
705
- "eval_runtime": 693.1254,
706
- "eval_samples_per_second": 89.498,
707
- "eval_steps_per_second": 22.375,
708
- "step": 700000
709
- },
710
- {
711
- "epoch": 2.54,
712
- "learning_rate": 3.728277729814544e-05,
713
- "loss": 0.0586,
714
- "step": 710000
715
- },
716
- {
717
- "epoch": 2.58,
718
- "learning_rate": 3.710366148544326e-05,
719
- "loss": 0.0583,
720
- "step": 720000
721
- },
722
- {
723
- "epoch": 2.58,
724
- "eval_loss": 0.0572069026529789,
725
- "eval_runtime": 691.2275,
726
- "eval_samples_per_second": 89.743,
727
- "eval_steps_per_second": 22.437,
728
- "step": 720000
729
- },
730
- {
731
- "epoch": 2.62,
732
- "learning_rate": 3.6924545672741083e-05,
733
- "loss": 0.0591,
734
- "step": 730000
735
- },
736
- {
737
- "epoch": 2.65,
738
- "learning_rate": 3.67454298600389e-05,
739
- "loss": 0.0573,
740
- "step": 740000
741
- },
742
- {
743
- "epoch": 2.65,
744
- "eval_loss": 0.05713275447487831,
745
- "eval_runtime": 696.2667,
746
- "eval_samples_per_second": 89.094,
747
- "eval_steps_per_second": 22.275,
748
- "step": 740000
749
- },
750
- {
751
- "epoch": 2.69,
752
- "learning_rate": 3.656631404733673e-05,
753
- "loss": 0.0559,
754
- "step": 750000
755
- },
756
- {
757
- "epoch": 2.72,
758
- "learning_rate": 3.638719823463455e-05,
759
- "loss": 0.0571,
760
- "step": 760000
761
- },
762
- {
763
- "epoch": 2.72,
764
- "eval_loss": 0.056393299251794815,
765
- "eval_runtime": 683.9753,
766
- "eval_samples_per_second": 90.695,
767
- "eval_steps_per_second": 22.675,
768
- "step": 760000
769
- },
770
- {
771
- "epoch": 2.76,
772
- "learning_rate": 3.6208082421932374e-05,
773
- "loss": 0.0572,
774
- "step": 770000
775
- },
776
- {
777
- "epoch": 2.79,
778
- "learning_rate": 3.60289666092302e-05,
779
- "loss": 0.0571,
780
- "step": 780000
781
- },
782
- {
783
- "epoch": 2.79,
784
- "eval_loss": 0.057615023106336594,
785
- "eval_runtime": 683.0895,
786
- "eval_samples_per_second": 90.812,
787
- "eval_steps_per_second": 22.704,
788
- "step": 780000
789
- },
790
- {
791
- "epoch": 2.83,
792
- "learning_rate": 3.5849850796528025e-05,
793
- "loss": 0.0575,
794
- "step": 790000
795
- },
796
- {
797
- "epoch": 2.87,
798
- "learning_rate": 3.5670734983825844e-05,
799
- "loss": 0.0556,
800
- "step": 800000
801
- },
802
- {
803
- "epoch": 2.87,
804
- "eval_loss": 0.055679138749837875,
805
- "eval_runtime": 682.7397,
806
- "eval_samples_per_second": 90.859,
807
- "eval_steps_per_second": 22.716,
808
- "step": 800000
809
- },
810
- {
811
- "epoch": 2.9,
812
- "learning_rate": 3.549161917112367e-05,
813
- "loss": 0.057,
814
- "step": 810000
815
- },
816
- {
817
- "epoch": 2.94,
818
- "learning_rate": 3.531250335842149e-05,
819
- "loss": 0.057,
820
- "step": 820000
821
- },
822
- {
823
- "epoch": 2.94,
824
- "eval_loss": 0.056380968540906906,
825
- "eval_runtime": 682.9026,
826
- "eval_samples_per_second": 90.837,
827
- "eval_steps_per_second": 22.71,
828
- "step": 820000
829
- },
830
- {
831
- "epoch": 2.97,
832
- "learning_rate": 3.5133387545719315e-05,
833
- "loss": 0.0558,
834
- "step": 830000
835
- },
836
- {
837
- "epoch": 3.01,
838
- "learning_rate": 3.4954271733017135e-05,
839
- "loss": 0.0561,
840
- "step": 840000
841
- },
842
- {
843
- "epoch": 3.01,
844
- "eval_loss": 0.05610540881752968,
845
- "eval_runtime": 684.8005,
846
- "eval_samples_per_second": 90.586,
847
- "eval_steps_per_second": 22.647,
848
- "step": 840000
849
- },
850
- {
851
- "epoch": 3.04,
852
- "learning_rate": 3.477515592031496e-05,
853
- "loss": 0.0558,
854
- "step": 850000
855
- },
856
- {
857
- "epoch": 3.08,
858
- "learning_rate": 3.459604010761278e-05,
859
- "loss": 0.0552,
860
- "step": 860000
861
- },
862
- {
863
- "epoch": 3.08,
864
- "eval_loss": 0.05571727082133293,
865
- "eval_runtime": 684.9047,
866
- "eval_samples_per_second": 90.572,
867
- "eval_steps_per_second": 22.644,
868
- "step": 860000
869
- },
870
- {
871
- "epoch": 3.12,
872
- "learning_rate": 3.4416924294910605e-05,
873
- "loss": 0.0566,
874
- "step": 870000
875
- },
876
- {
877
- "epoch": 3.15,
878
- "learning_rate": 3.4237808482208425e-05,
879
- "loss": 0.0572,
880
- "step": 880000
881
- },
882
- {
883
- "epoch": 3.15,
884
- "eval_loss": 0.05569184198975563,
885
- "eval_runtime": 685.6335,
886
- "eval_samples_per_second": 90.475,
887
- "eval_steps_per_second": 22.62,
888
- "step": 880000
889
- },
890
- {
891
- "epoch": 3.19,
892
- "learning_rate": 3.405869266950625e-05,
893
- "loss": 0.0558,
894
- "step": 890000
895
- },
896
- {
897
- "epoch": 3.22,
898
- "learning_rate": 3.3879576856804076e-05,
899
- "loss": 0.0559,
900
- "step": 900000
901
- },
902
- {
903
- "epoch": 3.22,
904
- "eval_loss": 0.055385466665029526,
905
- "eval_runtime": 689.1891,
906
- "eval_samples_per_second": 90.009,
907
- "eval_steps_per_second": 22.503,
908
- "step": 900000
909
- },
910
- {
911
- "epoch": 3.26,
912
- "learning_rate": 3.37004610441019e-05,
913
- "loss": 0.0557,
914
- "step": 910000
915
- },
916
- {
917
- "epoch": 3.3,
918
- "learning_rate": 3.352134523139972e-05,
919
- "loss": 0.0548,
920
- "step": 920000
921
- },
922
- {
923
- "epoch": 3.3,
924
- "eval_loss": 0.05598163977265358,
925
- "eval_runtime": 684.1191,
926
- "eval_samples_per_second": 90.676,
927
- "eval_steps_per_second": 22.67,
928
- "step": 920000
929
- },
930
- {
931
- "epoch": 3.33,
932
- "learning_rate": 3.334222941869754e-05,
933
- "loss": 0.0552,
934
- "step": 930000
935
- },
936
- {
937
- "epoch": 3.37,
938
- "learning_rate": 3.3163113605995366e-05,
939
- "loss": 0.0565,
940
- "step": 940000
941
- },
942
- {
943
- "epoch": 3.37,
944
- "eval_loss": 0.055193424224853516,
945
- "eval_runtime": 732.5988,
946
- "eval_samples_per_second": 84.675,
947
- "eval_steps_per_second": 21.17,
948
- "step": 940000
949
- },
950
- {
951
- "epoch": 3.4,
952
- "learning_rate": 3.2983997793293186e-05,
953
- "loss": 0.0557,
954
- "step": 950000
955
- },
956
- {
957
- "epoch": 3.44,
958
- "learning_rate": 3.280488198059101e-05,
959
- "loss": 0.055,
960
- "step": 960000
961
- },
962
- {
963
- "epoch": 3.44,
964
- "eval_loss": 0.05571094900369644,
965
- "eval_runtime": 724.8884,
966
- "eval_samples_per_second": 85.576,
967
- "eval_steps_per_second": 21.395,
968
- "step": 960000
969
- },
970
- {
971
- "epoch": 3.47,
972
- "learning_rate": 3.262576616788883e-05,
973
- "loss": 0.0556,
974
- "step": 970000
975
- },
976
- {
977
- "epoch": 3.51,
978
- "learning_rate": 3.2446650355186657e-05,
979
- "loss": 0.0563,
980
- "step": 980000
981
- },
982
- {
983
- "epoch": 3.51,
984
- "eval_loss": 0.05553479492664337,
985
- "eval_runtime": 714.2985,
986
- "eval_samples_per_second": 86.845,
987
- "eval_steps_per_second": 21.712,
988
- "step": 980000
989
- },
990
- {
991
- "epoch": 3.55,
992
- "learning_rate": 3.2267534542484476e-05,
993
- "loss": 0.0552,
994
- "step": 990000
995
- },
996
- {
997
- "epoch": 3.58,
998
- "learning_rate": 3.20884187297823e-05,
999
- "loss": 0.0553,
1000
- "step": 1000000
1001
- },
1002
- {
1003
- "epoch": 3.58,
1004
- "eval_loss": 0.05587638169527054,
1005
- "eval_runtime": 709.3024,
1006
- "eval_samples_per_second": 87.456,
1007
- "eval_steps_per_second": 21.865,
1008
- "step": 1000000
1009
- },
1010
- {
1011
- "epoch": 3.62,
1012
- "learning_rate": 3.190930291708013e-05,
1013
- "loss": 0.0544,
1014
- "step": 1010000
1015
- },
1016
- {
1017
- "epoch": 3.65,
1018
- "learning_rate": 3.1730187104377953e-05,
1019
- "loss": 0.0545,
1020
- "step": 1020000
1021
- },
1022
- {
1023
- "epoch": 3.65,
1024
- "eval_loss": 0.05536632239818573,
1025
- "eval_runtime": 696.888,
1026
- "eval_samples_per_second": 89.014,
1027
- "eval_steps_per_second": 22.255,
1028
- "step": 1020000
1029
- },
1030
- {
1031
- "epoch": 3.69,
1032
- "learning_rate": 3.155107129167577e-05,
1033
- "loss": 0.0543,
1034
- "step": 1030000
1035
- },
1036
- {
1037
- "epoch": 3.73,
1038
- "learning_rate": 3.13719554789736e-05,
1039
- "loss": 0.0544,
1040
- "step": 1040000
1041
- },
1042
- {
1043
- "epoch": 3.73,
1044
- "eval_loss": 0.05515935644507408,
1045
- "eval_runtime": 706.1745,
1046
- "eval_samples_per_second": 87.844,
1047
- "eval_steps_per_second": 21.962,
1048
- "step": 1040000
1049
- },
1050
- {
1051
- "epoch": 3.76,
1052
- "learning_rate": 3.119283966627142e-05,
1053
- "loss": 0.0534,
1054
- "step": 1050000
1055
- },
1056
- {
1057
- "epoch": 3.8,
1058
- "learning_rate": 3.1013723853569243e-05,
1059
- "loss": 0.0552,
1060
- "step": 1060000
1061
- },
1062
- {
1063
- "epoch": 3.8,
1064
- "eval_loss": 0.054923560470342636,
1065
- "eval_runtime": 716.4597,
1066
- "eval_samples_per_second": 86.583,
1067
- "eval_steps_per_second": 21.647,
1068
- "step": 1060000
1069
- },
1070
- {
1071
- "epoch": 3.83,
1072
- "learning_rate": 3.083460804086706e-05,
1073
- "loss": 0.055,
1074
- "step": 1070000
1075
- },
1076
- {
1077
- "epoch": 3.87,
1078
- "learning_rate": 3.065549222816489e-05,
1079
- "loss": 0.0547,
1080
- "step": 1080000
1081
- },
1082
- {
1083
- "epoch": 3.87,
1084
- "eval_loss": 0.05532824248075485,
1085
- "eval_runtime": 713.1552,
1086
- "eval_samples_per_second": 86.984,
1087
- "eval_steps_per_second": 21.747,
1088
- "step": 1080000
1089
- },
1090
- {
1091
- "epoch": 3.9,
1092
- "learning_rate": 3.047637641546271e-05,
1093
- "loss": 0.0526,
1094
- "step": 1090000
1095
- },
1096
- {
1097
- "epoch": 3.94,
1098
- "learning_rate": 3.0297260602760537e-05,
1099
- "loss": 0.0536,
1100
- "step": 1100000
1101
- },
1102
- {
1103
- "epoch": 3.94,
1104
- "eval_loss": 0.05594659596681595,
1105
- "eval_runtime": 765.7749,
1106
- "eval_samples_per_second": 81.007,
1107
- "eval_steps_per_second": 20.253,
1108
- "step": 1100000
1109
  }
1110
  ],
1111
- "max_steps": 2791490,
1112
  "num_train_epochs": 10,
1113
- "total_flos": 7.350678561491497e+17,
1114
  "trial_name": null,
1115
  "trial_params": null
1116
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.678837487353996,
5
+ "global_step": 320000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.06,
12
+ "learning_rate": 4.971259082130047e-05,
13
+ "loss": 0.0475,
14
+ "step": 5000
15
  },
16
  {
17
+ "epoch": 0.11,
18
+ "learning_rate": 4.9425181642600946e-05,
19
+ "loss": 0.0494,
20
+ "step": 10000
 
 
 
 
 
 
 
 
21
  },
22
  {
23
  "epoch": 0.11,
24
+ "eval_loss": 0.04334285110235214,
25
+ "eval_runtime": 233.2791,
26
+ "eval_samples_per_second": 82.862,
27
+ "eval_steps_per_second": 20.718,
28
+ "step": 10000
29
  },
30
  {
31
+ "epoch": 0.17,
32
+ "learning_rate": 4.913777246390141e-05,
33
+ "loss": 0.049,
34
+ "step": 15000
35
  },
36
  {
37
+ "epoch": 0.23,
38
+ "learning_rate": 4.8850363285201876e-05,
39
+ "loss": 0.0477,
40
+ "step": 20000
 
 
41
  },
42
  {
43
+ "epoch": 0.23,
44
+ "eval_loss": 0.04320817440748215,
45
+ "eval_runtime": 230.0333,
46
+ "eval_samples_per_second": 84.031,
47
+ "eval_steps_per_second": 21.01,
48
+ "step": 20000
49
  },
50
  {
51
+ "epoch": 0.29,
52
+ "learning_rate": 4.856295410650235e-05,
53
+ "loss": 0.0494,
54
+ "step": 25000
55
  },
56
  {
57
+ "epoch": 0.34,
58
+ "learning_rate": 4.827554492780282e-05,
59
+ "loss": 0.0464,
60
+ "step": 30000
 
 
61
  },
62
  {
63
+ "epoch": 0.34,
64
+ "eval_loss": 0.04396720230579376,
65
+ "eval_runtime": 235.2064,
66
+ "eval_samples_per_second": 82.183,
67
+ "eval_steps_per_second": 20.548,
68
+ "step": 30000
69
  },
70
  {
71
+ "epoch": 0.4,
72
+ "learning_rate": 4.7988135749103285e-05,
73
+ "loss": 0.0491,
74
+ "step": 35000
75
  },
76
  {
77
+ "epoch": 0.46,
78
+ "learning_rate": 4.770072657040375e-05,
79
+ "loss": 0.0472,
80
+ "step": 40000
 
 
81
  },
82
  {
83
+ "epoch": 0.46,
84
+ "eval_loss": 0.04489925131201744,
85
+ "eval_runtime": 225.0238,
86
+ "eval_samples_per_second": 85.902,
87
+ "eval_steps_per_second": 21.478,
88
+ "step": 40000
89
  },
90
  {
91
+ "epoch": 0.69,
92
+ "learning_rate": 4.655108985560563e-05,
93
+ "loss": 0.0484,
94
+ "step": 60000
95
  },
96
  {
97
+ "epoch": 0.92,
98
+ "learning_rate": 4.540145314080751e-05,
99
+ "loss": 0.0481,
100
+ "step": 80000
 
 
101
  },
102
  {
103
+ "epoch": 0.92,
104
+ "eval_loss": 0.044077709317207336,
105
+ "eval_runtime": 226.2289,
106
+ "eval_samples_per_second": 85.444,
107
+ "eval_steps_per_second": 21.363,
108
+ "step": 80000
109
  },
110
  {
111
+ "epoch": 1.15,
112
+ "learning_rate": 4.4251816426009386e-05,
113
+ "loss": 0.0483,
114
+ "step": 100000
115
  },
116
  {
117
+ "epoch": 1.38,
118
+ "learning_rate": 4.310217971121126e-05,
119
+ "loss": 0.0463,
 
 
120
  "step": 120000
121
  },
122
  {
123
+ "epoch": 1.38,
124
+ "eval_loss": 0.04454144090414047,
125
+ "eval_runtime": 228.877,
126
+ "eval_samples_per_second": 84.456,
127
+ "eval_steps_per_second": 21.116,
128
+ "step": 120000
 
 
 
 
129
  },
130
  {
131
+ "epoch": 1.61,
132
+ "learning_rate": 4.1952542996413133e-05,
133
+ "loss": 0.0467,
 
 
134
  "step": 140000
135
  },
136
  {
137
+ "epoch": 1.84,
138
+ "learning_rate": 4.0802906281615014e-05,
139
+ "loss": 0.0465,
 
 
 
 
 
 
140
  "step": 160000
141
  },
142
  {
143
+ "epoch": 1.84,
144
+ "eval_loss": 0.04413667321205139,
145
+ "eval_runtime": 228.9059,
146
+ "eval_samples_per_second": 84.445,
147
+ "eval_steps_per_second": 21.113,
148
  "step": 160000
149
  },
150
  {
151
+ "epoch": 2.07,
152
+ "learning_rate": 3.965326956681689e-05,
153
+ "loss": 0.0469,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  "step": 180000
155
  },
156
  {
157
+ "epoch": 2.3,
158
+ "learning_rate": 3.850363285201876e-05,
159
+ "loss": 0.0447,
 
 
 
 
 
 
160
  "step": 200000
161
  },
162
  {
163
+ "epoch": 2.3,
164
+ "eval_loss": 0.04485001042485237,
165
+ "eval_runtime": 237.9271,
166
+ "eval_samples_per_second": 81.243,
167
+ "eval_steps_per_second": 20.313,
168
  "step": 200000
169
  },
170
  {
171
+ "epoch": 2.53,
172
+ "learning_rate": 3.735399613722064e-05,
173
+ "loss": 0.0445,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  "step": 220000
175
  },
176
  {
177
+ "epoch": 2.76,
178
+ "learning_rate": 3.620435942242252e-05,
179
+ "loss": 0.0435,
 
 
 
 
 
 
180
  "step": 240000
181
  },
182
  {
183
+ "epoch": 2.76,
184
+ "eval_loss": 0.04417626932263374,
185
+ "eval_runtime": 239.7656,
186
+ "eval_samples_per_second": 80.62,
187
+ "eval_steps_per_second": 20.157,
188
  "step": 240000
189
  },
190
  {
191
+ "epoch": 2.99,
192
+ "learning_rate": 3.505472270762439e-05,
193
+ "loss": 0.0443,
 
 
 
 
 
 
194
  "step": 260000
195
  },
196
  {
197
+ "epoch": 3.22,
198
+ "learning_rate": 3.390508599282627e-05,
199
+ "loss": 0.0425,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  "step": 280000
201
  },
202
  {
203
+ "epoch": 3.22,
204
+ "eval_loss": 0.04555518552660942,
205
+ "eval_runtime": 239.0043,
206
+ "eval_samples_per_second": 80.877,
207
+ "eval_steps_per_second": 20.221,
208
  "step": 280000
209
  },
210
  {
211
+ "epoch": 3.45,
212
+ "learning_rate": 3.2755449278028145e-05,
213
+ "loss": 0.0435,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  "step": 300000
215
  },
216
  {
217
+ "epoch": 3.68,
218
+ "learning_rate": 3.160581256323002e-05,
219
+ "loss": 0.0437,
 
 
 
 
 
 
220
  "step": 320000
221
  },
222
  {
223
+ "epoch": 3.68,
224
+ "eval_loss": 0.04474752023816109,
225
+ "eval_runtime": 291.4231,
226
+ "eval_samples_per_second": 66.33,
227
+ "eval_steps_per_second": 16.584,
228
  "step": 320000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  }
230
  ],
231
+ "max_steps": 869840,
232
  "num_train_epochs": 10,
233
+ "total_flos": 2.2518483734502605e+17,
234
  "trial_name": null,
235
  "trial_params": null
236
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01e951f41d1c838fb03d74c0997ade49e243ec0d07cb18ff2559ff234bae126c
3
  size 3119
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:981e074dd006f0b82f1adae94afef8c0b53f496530dbead237c5195367cb5167
3
  size 3119