davanstrien HF staff commited on
Commit
e230b5f
1 Parent(s): 0fec038

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +10 -5
  2. eval_results.json +6 -5
  3. train_results.json +7 -0
  4. trainer_state.json +1207 -0
all_results.json CHANGED
@@ -1,7 +1,12 @@
1
  {
2
- "eval_f1": 0.0,
3
- "eval_loss": 9.742283821105957,
4
- "eval_runtime": 68.5247,
5
- "eval_samples_per_second": 223.89,
6
- "eval_steps_per_second": 3.502
 
 
 
 
 
7
  }
1
  {
2
+ "epoch": 30.0,
3
+ "eval_f1": 0.003679096298311292,
4
+ "eval_loss": 5.585614204406738,
5
+ "eval_runtime": 105.002,
6
+ "eval_samples_per_second": 219.158,
7
+ "eval_steps_per_second": 3.429,
8
+ "train_loss": 2.558345020757144,
9
+ "train_runtime": 41575.9952,
10
+ "train_samples_per_second": 94.093,
11
+ "train_steps_per_second": 1.471
12
  }
eval_results.json CHANGED
@@ -1,7 +1,8 @@
1
  {
2
- "eval_f1": 0.0,
3
- "eval_loss": 9.742283821105957,
4
- "eval_runtime": 68.5247,
5
- "eval_samples_per_second": 223.89,
6
- "eval_steps_per_second": 3.502
 
7
  }
1
  {
2
+ "epoch": 30.0,
3
+ "eval_f1": 0.003679096298311292,
4
+ "eval_loss": 5.585614204406738,
5
+ "eval_runtime": 105.002,
6
+ "eval_samples_per_second": 219.158,
7
+ "eval_steps_per_second": 3.429
8
  }
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "train_loss": 2.558345020757144,
4
+ "train_runtime": 41575.9952,
5
+ "train_samples_per_second": 94.093,
6
+ "train_steps_per_second": 1.471
7
+ }
trainer_state.json ADDED
@@ -0,0 +1,1207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 5.585614204406738,
3
+ "best_model_checkpoint": "./iiif_manuscripts_label_ge_50_convnext/checkpoint-12228",
4
+ "epoch": 30.0,
5
+ "global_step": 61140,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.2,
12
+ "learning_rate": 0.00019869152764147858,
13
+ "loss": 7.9686,
14
+ "step": 400
15
+ },
16
+ {
17
+ "epoch": 0.39,
18
+ "learning_rate": 0.00019738632646385348,
19
+ "loss": 7.2475,
20
+ "step": 800
21
+ },
22
+ {
23
+ "epoch": 0.59,
24
+ "learning_rate": 0.00019607785410533205,
25
+ "loss": 6.9462,
26
+ "step": 1200
27
+ },
28
+ {
29
+ "epoch": 0.79,
30
+ "learning_rate": 0.0001947693817468106,
31
+ "loss": 6.7615,
32
+ "step": 1600
33
+ },
34
+ {
35
+ "epoch": 0.98,
36
+ "learning_rate": 0.0001934641805691855,
37
+ "loss": 6.5753,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 1.0,
42
+ "eval_f1": 0.0015526657039264577,
43
+ "eval_loss": 6.412076950073242,
44
+ "eval_runtime": 102.5706,
45
+ "eval_samples_per_second": 224.353,
46
+ "eval_steps_per_second": 3.51,
47
+ "step": 2038
48
+ },
49
+ {
50
+ "epoch": 1.18,
51
+ "learning_rate": 0.00019215570821066406,
52
+ "loss": 6.324,
53
+ "step": 2400
54
+ },
55
+ {
56
+ "epoch": 1.37,
57
+ "learning_rate": 0.00019084723585214262,
58
+ "loss": 6.2462,
59
+ "step": 2800
60
+ },
61
+ {
62
+ "epoch": 1.57,
63
+ "learning_rate": 0.00018953876349362122,
64
+ "loss": 6.1389,
65
+ "step": 3200
66
+ },
67
+ {
68
+ "epoch": 1.77,
69
+ "learning_rate": 0.00018823029113509978,
70
+ "loss": 6.0919,
71
+ "step": 3600
72
+ },
73
+ {
74
+ "epoch": 1.96,
75
+ "learning_rate": 0.00018692508995747466,
76
+ "loss": 5.9865,
77
+ "step": 4000
78
+ },
79
+ {
80
+ "epoch": 2.0,
81
+ "eval_f1": 0.002120415247137495,
82
+ "eval_loss": 5.946579456329346,
83
+ "eval_runtime": 101.947,
84
+ "eval_samples_per_second": 225.725,
85
+ "eval_steps_per_second": 3.531,
86
+ "step": 4076
87
+ },
88
+ {
89
+ "epoch": 2.16,
90
+ "learning_rate": 0.00018561661759895323,
91
+ "loss": 5.7381,
92
+ "step": 4400
93
+ },
94
+ {
95
+ "epoch": 2.36,
96
+ "learning_rate": 0.0001843114164213281,
97
+ "loss": 5.6812,
98
+ "step": 4800
99
+ },
100
+ {
101
+ "epoch": 2.55,
102
+ "learning_rate": 0.0001830029440628067,
103
+ "loss": 5.6661,
104
+ "step": 5200
105
+ },
106
+ {
107
+ "epoch": 2.75,
108
+ "learning_rate": 0.00018169447170428526,
109
+ "loss": 5.6508,
110
+ "step": 5600
111
+ },
112
+ {
113
+ "epoch": 2.94,
114
+ "learning_rate": 0.00018038599934576383,
115
+ "loss": 5.6521,
116
+ "step": 6000
117
+ },
118
+ {
119
+ "epoch": 3.0,
120
+ "eval_f1": 0.002850390938715168,
121
+ "eval_loss": 5.764512062072754,
122
+ "eval_runtime": 99.5338,
123
+ "eval_samples_per_second": 231.198,
124
+ "eval_steps_per_second": 3.617,
125
+ "step": 6114
126
+ },
127
+ {
128
+ "epoch": 3.14,
129
+ "learning_rate": 0.0001790775269872424,
130
+ "loss": 5.3744,
131
+ "step": 6400
132
+ },
133
+ {
134
+ "epoch": 3.34,
135
+ "learning_rate": 0.00017776905462872097,
136
+ "loss": 5.3159,
137
+ "step": 6800
138
+ },
139
+ {
140
+ "epoch": 3.53,
141
+ "learning_rate": 0.00017646058227019956,
142
+ "loss": 5.3162,
143
+ "step": 7200
144
+ },
145
+ {
146
+ "epoch": 3.73,
147
+ "learning_rate": 0.00017515210991167813,
148
+ "loss": 5.3139,
149
+ "step": 7600
150
+ },
151
+ {
152
+ "epoch": 3.93,
153
+ "learning_rate": 0.0001738436375531567,
154
+ "loss": 5.3123,
155
+ "step": 8000
156
+ },
157
+ {
158
+ "epoch": 4.0,
159
+ "eval_f1": 0.0032535394402472113,
160
+ "eval_loss": 5.688981056213379,
161
+ "eval_runtime": 98.5752,
162
+ "eval_samples_per_second": 233.446,
163
+ "eval_steps_per_second": 3.652,
164
+ "step": 8152
165
+ },
166
+ {
167
+ "epoch": 4.12,
168
+ "learning_rate": 0.00017253516519463526,
169
+ "loss": 5.0622,
170
+ "step": 8400
171
+ },
172
+ {
173
+ "epoch": 4.32,
174
+ "learning_rate": 0.00017122996401701014,
175
+ "loss": 4.9616,
176
+ "step": 8800
177
+ },
178
+ {
179
+ "epoch": 4.51,
180
+ "learning_rate": 0.00016992476283938504,
181
+ "loss": 5.0117,
182
+ "step": 9200
183
+ },
184
+ {
185
+ "epoch": 4.71,
186
+ "learning_rate": 0.0001686162904808636,
187
+ "loss": 5.0168,
188
+ "step": 9600
189
+ },
190
+ {
191
+ "epoch": 4.91,
192
+ "learning_rate": 0.00016730781812234217,
193
+ "loss": 5.0337,
194
+ "step": 10000
195
+ },
196
+ {
197
+ "epoch": 5.0,
198
+ "eval_f1": 0.003394306441476293,
199
+ "eval_loss": 5.6691813468933105,
200
+ "eval_runtime": 106.5631,
201
+ "eval_samples_per_second": 215.947,
202
+ "eval_steps_per_second": 3.378,
203
+ "step": 10190
204
+ },
205
+ {
206
+ "epoch": 5.1,
207
+ "learning_rate": 0.00016599934576382074,
208
+ "loss": 4.8182,
209
+ "step": 10400
210
+ },
211
+ {
212
+ "epoch": 5.3,
213
+ "learning_rate": 0.0001646908734052993,
214
+ "loss": 4.6457,
215
+ "step": 10800
216
+ },
217
+ {
218
+ "epoch": 5.5,
219
+ "learning_rate": 0.0001633824010467779,
220
+ "loss": 4.6918,
221
+ "step": 11200
222
+ },
223
+ {
224
+ "epoch": 5.69,
225
+ "learning_rate": 0.00016207719986915278,
226
+ "loss": 4.7441,
227
+ "step": 11600
228
+ },
229
+ {
230
+ "epoch": 5.89,
231
+ "learning_rate": 0.00016076872751063134,
232
+ "loss": 4.743,
233
+ "step": 12000
234
+ },
235
+ {
236
+ "epoch": 6.0,
237
+ "eval_f1": 0.003679096298311292,
238
+ "eval_loss": 5.585614204406738,
239
+ "eval_runtime": 103.5743,
240
+ "eval_samples_per_second": 222.179,
241
+ "eval_steps_per_second": 3.476,
242
+ "step": 12228
243
+ },
244
+ {
245
+ "epoch": 6.08,
246
+ "learning_rate": 0.0001594602551521099,
247
+ "loss": 4.5617,
248
+ "step": 12400
249
+ },
250
+ {
251
+ "epoch": 6.28,
252
+ "learning_rate": 0.00015815178279358848,
253
+ "loss": 4.3335,
254
+ "step": 12800
255
+ },
256
+ {
257
+ "epoch": 6.48,
258
+ "learning_rate": 0.00015684331043506705,
259
+ "loss": 4.3856,
260
+ "step": 13200
261
+ },
262
+ {
263
+ "epoch": 6.67,
264
+ "learning_rate": 0.00015553810925744195,
265
+ "loss": 4.3955,
266
+ "step": 13600
267
+ },
268
+ {
269
+ "epoch": 6.87,
270
+ "learning_rate": 0.00015422963689892052,
271
+ "loss": 4.4387,
272
+ "step": 14000
273
+ },
274
+ {
275
+ "epoch": 7.0,
276
+ "eval_f1": 0.004160028924400371,
277
+ "eval_loss": 5.596897125244141,
278
+ "eval_runtime": 104.1264,
279
+ "eval_samples_per_second": 221.001,
280
+ "eval_steps_per_second": 3.457,
281
+ "step": 14266
282
+ },
283
+ {
284
+ "epoch": 7.07,
285
+ "learning_rate": 0.00015292116454039908,
286
+ "loss": 4.3069,
287
+ "step": 14400
288
+ },
289
+ {
290
+ "epoch": 7.26,
291
+ "learning_rate": 0.00015161269218187765,
292
+ "loss": 3.9931,
293
+ "step": 14800
294
+ },
295
+ {
296
+ "epoch": 7.46,
297
+ "learning_rate": 0.00015030421982335622,
298
+ "loss": 4.0845,
299
+ "step": 15200
300
+ },
301
+ {
302
+ "epoch": 7.65,
303
+ "learning_rate": 0.00014899901864573112,
304
+ "loss": 4.1085,
305
+ "step": 15600
306
+ },
307
+ {
308
+ "epoch": 7.85,
309
+ "learning_rate": 0.00014769054628720969,
310
+ "loss": 4.1422,
311
+ "step": 16000
312
+ },
313
+ {
314
+ "epoch": 8.0,
315
+ "eval_f1": 0.004266402669461609,
316
+ "eval_loss": 5.671103477478027,
317
+ "eval_runtime": 107.1495,
318
+ "eval_samples_per_second": 214.765,
319
+ "eval_steps_per_second": 3.36,
320
+ "step": 16304
321
+ },
322
+ {
323
+ "epoch": 8.05,
324
+ "learning_rate": 0.00014638207392868825,
325
+ "loss": 4.0586,
326
+ "step": 16400
327
+ },
328
+ {
329
+ "epoch": 8.24,
330
+ "learning_rate": 0.00014507360157016682,
331
+ "loss": 3.6644,
332
+ "step": 16800
333
+ },
334
+ {
335
+ "epoch": 8.44,
336
+ "learning_rate": 0.00014376512921164541,
337
+ "loss": 3.7197,
338
+ "step": 17200
339
+ },
340
+ {
341
+ "epoch": 8.64,
342
+ "learning_rate": 0.0001424599280340203,
343
+ "loss": 3.796,
344
+ "step": 17600
345
+ },
346
+ {
347
+ "epoch": 8.83,
348
+ "learning_rate": 0.00014115145567549886,
349
+ "loss": 3.8372,
350
+ "step": 18000
351
+ },
352
+ {
353
+ "epoch": 9.0,
354
+ "eval_f1": 0.004420450860447925,
355
+ "eval_loss": 5.676065921783447,
356
+ "eval_runtime": 104.689,
357
+ "eval_samples_per_second": 219.813,
358
+ "eval_steps_per_second": 3.439,
359
+ "step": 18342
360
+ },
361
+ {
362
+ "epoch": 9.03,
363
+ "learning_rate": 0.00013984298331697742,
364
+ "loss": 3.7798,
365
+ "step": 18400
366
+ },
367
+ {
368
+ "epoch": 9.22,
369
+ "learning_rate": 0.000138534510958456,
370
+ "loss": 3.3063,
371
+ "step": 18800
372
+ },
373
+ {
374
+ "epoch": 9.42,
375
+ "learning_rate": 0.00013722603859993459,
376
+ "loss": 3.389,
377
+ "step": 19200
378
+ },
379
+ {
380
+ "epoch": 9.62,
381
+ "learning_rate": 0.00013592083742230946,
382
+ "loss": 3.4438,
383
+ "step": 19600
384
+ },
385
+ {
386
+ "epoch": 9.81,
387
+ "learning_rate": 0.00013461236506378803,
388
+ "loss": 3.5244,
389
+ "step": 20000
390
+ },
391
+ {
392
+ "epoch": 10.0,
393
+ "eval_f1": 0.004167666888342888,
394
+ "eval_loss": 5.8468732833862305,
395
+ "eval_runtime": 105.8207,
396
+ "eval_samples_per_second": 217.462,
397
+ "eval_steps_per_second": 3.402,
398
+ "step": 20380
399
+ },
400
+ {
401
+ "epoch": 10.01,
402
+ "learning_rate": 0.0001333038927052666,
403
+ "loss": 3.542,
404
+ "step": 20400
405
+ },
406
+ {
407
+ "epoch": 10.21,
408
+ "learning_rate": 0.0001319954203467452,
409
+ "loss": 3.023,
410
+ "step": 20800
411
+ },
412
+ {
413
+ "epoch": 10.4,
414
+ "learning_rate": 0.00013068694798822376,
415
+ "loss": 3.0507,
416
+ "step": 21200
417
+ },
418
+ {
419
+ "epoch": 10.6,
420
+ "learning_rate": 0.00012938174681059863,
421
+ "loss": 3.132,
422
+ "step": 21600
423
+ },
424
+ {
425
+ "epoch": 10.79,
426
+ "learning_rate": 0.0001280732744520772,
427
+ "loss": 3.1857,
428
+ "step": 22000
429
+ },
430
+ {
431
+ "epoch": 10.99,
432
+ "learning_rate": 0.0001267648020935558,
433
+ "loss": 3.2321,
434
+ "step": 22400
435
+ },
436
+ {
437
+ "epoch": 11.0,
438
+ "eval_f1": 0.004533528112732692,
439
+ "eval_loss": 5.87736177444458,
440
+ "eval_runtime": 105.6895,
441
+ "eval_samples_per_second": 217.732,
442
+ "eval_steps_per_second": 3.406,
443
+ "step": 22418
444
+ },
445
+ {
446
+ "epoch": 11.19,
447
+ "learning_rate": 0.00012545632973503436,
448
+ "loss": 2.705,
449
+ "step": 22800
450
+ },
451
+ {
452
+ "epoch": 11.38,
453
+ "learning_rate": 0.00012414785737651293,
454
+ "loss": 2.7688,
455
+ "step": 23200
456
+ },
457
+ {
458
+ "epoch": 11.58,
459
+ "learning_rate": 0.0001228393850179915,
460
+ "loss": 2.8223,
461
+ "step": 23600
462
+ },
463
+ {
464
+ "epoch": 11.78,
465
+ "learning_rate": 0.00012153745502126268,
466
+ "loss": 2.8647,
467
+ "step": 24000
468
+ },
469
+ {
470
+ "epoch": 11.97,
471
+ "learning_rate": 0.00012022898266274125,
472
+ "loss": 2.9004,
473
+ "step": 24400
474
+ },
475
+ {
476
+ "epoch": 12.0,
477
+ "eval_f1": 0.004656155608992903,
478
+ "eval_loss": 6.118570804595947,
479
+ "eval_runtime": 103.8763,
480
+ "eval_samples_per_second": 221.533,
481
+ "eval_steps_per_second": 3.466,
482
+ "step": 24456
483
+ },
484
+ {
485
+ "epoch": 12.17,
486
+ "learning_rate": 0.00011892051030421983,
487
+ "loss": 2.4632,
488
+ "step": 24800
489
+ },
490
+ {
491
+ "epoch": 12.37,
492
+ "learning_rate": 0.0001176120379456984,
493
+ "loss": 2.4821,
494
+ "step": 25200
495
+ },
496
+ {
497
+ "epoch": 12.56,
498
+ "learning_rate": 0.00011630356558717696,
499
+ "loss": 2.5336,
500
+ "step": 25600
501
+ },
502
+ {
503
+ "epoch": 12.76,
504
+ "learning_rate": 0.00011499509322865556,
505
+ "loss": 2.5646,
506
+ "step": 26000
507
+ },
508
+ {
509
+ "epoch": 12.95,
510
+ "learning_rate": 0.00011368662087013414,
511
+ "loss": 2.5937,
512
+ "step": 26400
513
+ },
514
+ {
515
+ "epoch": 13.0,
516
+ "eval_f1": 0.004601475826989957,
517
+ "eval_loss": 6.239825248718262,
518
+ "eval_runtime": 107.5857,
519
+ "eval_samples_per_second": 213.895,
520
+ "eval_steps_per_second": 3.346,
521
+ "step": 26494
522
+ },
523
+ {
524
+ "epoch": 13.15,
525
+ "learning_rate": 0.0001123781485116127,
526
+ "loss": 2.231,
527
+ "step": 26800
528
+ },
529
+ {
530
+ "epoch": 13.35,
531
+ "learning_rate": 0.00011106967615309127,
532
+ "loss": 2.149,
533
+ "step": 27200
534
+ },
535
+ {
536
+ "epoch": 13.54,
537
+ "learning_rate": 0.00010976120379456985,
538
+ "loss": 2.2258,
539
+ "step": 27600
540
+ },
541
+ {
542
+ "epoch": 13.74,
543
+ "learning_rate": 0.00010845600261694473,
544
+ "loss": 2.2778,
545
+ "step": 28000
546
+ },
547
+ {
548
+ "epoch": 13.94,
549
+ "learning_rate": 0.00010715080143931959,
550
+ "loss": 2.2983,
551
+ "step": 28400
552
+ },
553
+ {
554
+ "epoch": 14.0,
555
+ "eval_f1": 0.004861690958274386,
556
+ "eval_loss": 6.3732194900512695,
557
+ "eval_runtime": 106.2123,
558
+ "eval_samples_per_second": 216.66,
559
+ "eval_steps_per_second": 3.389,
560
+ "step": 28532
561
+ },
562
+ {
563
+ "epoch": 14.13,
564
+ "learning_rate": 0.00010584232908079817,
565
+ "loss": 2.0303,
566
+ "step": 28800
567
+ },
568
+ {
569
+ "epoch": 14.33,
570
+ "learning_rate": 0.00010453385672227676,
571
+ "loss": 1.9106,
572
+ "step": 29200
573
+ },
574
+ {
575
+ "epoch": 14.52,
576
+ "learning_rate": 0.00010322538436375533,
577
+ "loss": 1.9854,
578
+ "step": 29600
579
+ },
580
+ {
581
+ "epoch": 14.72,
582
+ "learning_rate": 0.0001019169120052339,
583
+ "loss": 2.0189,
584
+ "step": 30000
585
+ },
586
+ {
587
+ "epoch": 14.92,
588
+ "learning_rate": 0.00010060843964671246,
589
+ "loss": 2.0611,
590
+ "step": 30400
591
+ },
592
+ {
593
+ "epoch": 15.0,
594
+ "eval_f1": 0.004546370201449235,
595
+ "eval_loss": 6.502394199371338,
596
+ "eval_runtime": 106.7026,
597
+ "eval_samples_per_second": 215.665,
598
+ "eval_steps_per_second": 3.374,
599
+ "step": 30570
600
+ },
601
+ {
602
+ "epoch": 15.11,
603
+ "learning_rate": 9.929996728819105e-05,
604
+ "loss": 1.8034,
605
+ "step": 30800
606
+ },
607
+ {
608
+ "epoch": 15.31,
609
+ "learning_rate": 9.799476611056592e-05,
610
+ "loss": 1.7043,
611
+ "step": 31200
612
+ },
613
+ {
614
+ "epoch": 15.51,
615
+ "learning_rate": 9.668629375204449e-05,
616
+ "loss": 1.731,
617
+ "step": 31600
618
+ },
619
+ {
620
+ "epoch": 15.7,
621
+ "learning_rate": 9.537782139352307e-05,
622
+ "loss": 1.7611,
623
+ "step": 32000
624
+ },
625
+ {
626
+ "epoch": 15.9,
627
+ "learning_rate": 9.406934903500164e-05,
628
+ "loss": 1.8153,
629
+ "step": 32400
630
+ },
631
+ {
632
+ "epoch": 16.0,
633
+ "eval_f1": 0.004668311047328734,
634
+ "eval_loss": 6.658481121063232,
635
+ "eval_runtime": 106.8749,
636
+ "eval_samples_per_second": 215.317,
637
+ "eval_steps_per_second": 3.368,
638
+ "step": 32608
639
+ },
640
+ {
641
+ "epoch": 16.09,
642
+ "learning_rate": 9.276087667648022e-05,
643
+ "loss": 1.6449,
644
+ "step": 32800
645
+ },
646
+ {
647
+ "epoch": 16.29,
648
+ "learning_rate": 9.145567549885509e-05,
649
+ "loss": 1.4815,
650
+ "step": 33200
651
+ },
652
+ {
653
+ "epoch": 16.49,
654
+ "learning_rate": 9.014720314033367e-05,
655
+ "loss": 1.5311,
656
+ "step": 33600
657
+ },
658
+ {
659
+ "epoch": 16.68,
660
+ "learning_rate": 8.883873078181224e-05,
661
+ "loss": 1.5722,
662
+ "step": 34000
663
+ },
664
+ {
665
+ "epoch": 16.88,
666
+ "learning_rate": 8.75302584232908e-05,
667
+ "loss": 1.6075,
668
+ "step": 34400
669
+ },
670
+ {
671
+ "epoch": 17.0,
672
+ "eval_f1": 0.004309847995944622,
673
+ "eval_loss": 6.833281993865967,
674
+ "eval_runtime": 104.6363,
675
+ "eval_samples_per_second": 219.924,
676
+ "eval_steps_per_second": 3.44,
677
+ "step": 34646
678
+ },
679
+ {
680
+ "epoch": 17.08,
681
+ "learning_rate": 8.622178606476939e-05,
682
+ "loss": 1.4882,
683
+ "step": 34800
684
+ },
685
+ {
686
+ "epoch": 17.27,
687
+ "learning_rate": 8.491331370624795e-05,
688
+ "loss": 1.3091,
689
+ "step": 35200
690
+ },
691
+ {
692
+ "epoch": 17.47,
693
+ "learning_rate": 8.360811252862284e-05,
694
+ "loss": 1.3476,
695
+ "step": 35600
696
+ },
697
+ {
698
+ "epoch": 17.66,
699
+ "learning_rate": 8.229964017010141e-05,
700
+ "loss": 1.3937,
701
+ "step": 36000
702
+ },
703
+ {
704
+ "epoch": 17.86,
705
+ "learning_rate": 8.099116781157998e-05,
706
+ "loss": 1.4342,
707
+ "step": 36400
708
+ },
709
+ {
710
+ "epoch": 18.0,
711
+ "eval_f1": 0.004381425028314271,
712
+ "eval_loss": 6.952894687652588,
713
+ "eval_runtime": 106.0924,
714
+ "eval_samples_per_second": 216.905,
715
+ "eval_steps_per_second": 3.393,
716
+ "step": 36684
717
+ },
718
+ {
719
+ "epoch": 18.06,
720
+ "learning_rate": 7.968269545305856e-05,
721
+ "loss": 1.3502,
722
+ "step": 36800
723
+ },
724
+ {
725
+ "epoch": 18.25,
726
+ "learning_rate": 7.837422309453714e-05,
727
+ "loss": 1.1797,
728
+ "step": 37200
729
+ },
730
+ {
731
+ "epoch": 18.45,
732
+ "learning_rate": 7.70657507360157e-05,
733
+ "loss": 1.234,
734
+ "step": 37600
735
+ },
736
+ {
737
+ "epoch": 18.65,
738
+ "learning_rate": 7.575727837749427e-05,
739
+ "loss": 1.2631,
740
+ "step": 38000
741
+ },
742
+ {
743
+ "epoch": 18.84,
744
+ "learning_rate": 7.445207719986915e-05,
745
+ "loss": 1.2614,
746
+ "step": 38400
747
+ },
748
+ {
749
+ "epoch": 19.0,
750
+ "eval_f1": 0.0045730417031692346,
751
+ "eval_loss": 7.112914085388184,
752
+ "eval_runtime": 106.9736,
753
+ "eval_samples_per_second": 215.118,
754
+ "eval_steps_per_second": 3.365,
755
+ "step": 38722
756
+ },
757
+ {
758
+ "epoch": 19.04,
759
+ "learning_rate": 7.314360484134773e-05,
760
+ "loss": 1.2353,
761
+ "step": 38800
762
+ },
763
+ {
764
+ "epoch": 19.23,
765
+ "learning_rate": 7.183513248282631e-05,
766
+ "loss": 1.0694,
767
+ "step": 39200
768
+ },
769
+ {
770
+ "epoch": 19.43,
771
+ "learning_rate": 7.052666012430488e-05,
772
+ "loss": 1.0988,
773
+ "step": 39600
774
+ },
775
+ {
776
+ "epoch": 19.63,
777
+ "learning_rate": 6.921818776578344e-05,
778
+ "loss": 1.125,
779
+ "step": 40000
780
+ },
781
+ {
782
+ "epoch": 19.82,
783
+ "learning_rate": 6.791298658815833e-05,
784
+ "loss": 1.1463,
785
+ "step": 40400
786
+ },
787
+ {
788
+ "epoch": 20.0,
789
+ "eval_f1": 0.003947589716911441,
790
+ "eval_loss": 7.197678089141846,
791
+ "eval_runtime": 105.623,
792
+ "eval_samples_per_second": 217.869,
793
+ "eval_steps_per_second": 3.408,
794
+ "step": 40760
795
+ },
796
+ {
797
+ "epoch": 20.02,
798
+ "learning_rate": 6.66045142296369e-05,
799
+ "loss": 1.1429,
800
+ "step": 40800
801
+ },
802
+ {
803
+ "epoch": 20.22,
804
+ "learning_rate": 6.529604187111548e-05,
805
+ "loss": 0.9563,
806
+ "step": 41200
807
+ },
808
+ {
809
+ "epoch": 20.41,
810
+ "learning_rate": 6.398756951259405e-05,
811
+ "loss": 1.0085,
812
+ "step": 41600
813
+ },
814
+ {
815
+ "epoch": 20.61,
816
+ "learning_rate": 6.267909715407262e-05,
817
+ "loss": 1.0361,
818
+ "step": 42000
819
+ },
820
+ {
821
+ "epoch": 20.8,
822
+ "learning_rate": 6.13738959764475e-05,
823
+ "loss": 1.0387,
824
+ "step": 42400
825
+ },
826
+ {
827
+ "epoch": 21.0,
828
+ "eval_f1": 0.004382334218618053,
829
+ "eval_loss": 7.270018100738525,
830
+ "eval_runtime": 104.2085,
831
+ "eval_samples_per_second": 220.827,
832
+ "eval_steps_per_second": 3.455,
833
+ "step": 42798
834
+ },
835
+ {
836
+ "epoch": 21.0,
837
+ "learning_rate": 6.006542361792608e-05,
838
+ "loss": 1.0315,
839
+ "step": 42800
840
+ },
841
+ {
842
+ "epoch": 21.2,
843
+ "learning_rate": 5.8756951259404646e-05,
844
+ "loss": 0.8991,
845
+ "step": 43200
846
+ },
847
+ {
848
+ "epoch": 21.39,
849
+ "learning_rate": 5.744847890088322e-05,
850
+ "loss": 0.907,
851
+ "step": 43600
852
+ },
853
+ {
854
+ "epoch": 21.59,
855
+ "learning_rate": 5.61400065423618e-05,
856
+ "loss": 0.9289,
857
+ "step": 44000
858
+ },
859
+ {
860
+ "epoch": 21.79,
861
+ "learning_rate": 5.4831534183840375e-05,
862
+ "loss": 0.9542,
863
+ "step": 44400
864
+ },
865
+ {
866
+ "epoch": 21.98,
867
+ "learning_rate": 5.352306182531894e-05,
868
+ "loss": 0.9635,
869
+ "step": 44800
870
+ },
871
+ {
872
+ "epoch": 22.0,
873
+ "eval_f1": 0.0039901118782942465,
874
+ "eval_loss": 7.337534427642822,
875
+ "eval_runtime": 105.9291,
876
+ "eval_samples_per_second": 217.24,
877
+ "eval_steps_per_second": 3.399,
878
+ "step": 44836
879
+ },
880
+ {
881
+ "epoch": 22.18,
882
+ "learning_rate": 5.2214589466797516e-05,
883
+ "loss": 0.8345,
884
+ "step": 45200
885
+ },
886
+ {
887
+ "epoch": 22.37,
888
+ "learning_rate": 5.090938828917239e-05,
889
+ "loss": 0.8363,
890
+ "step": 45600
891
+ },
892
+ {
893
+ "epoch": 22.57,
894
+ "learning_rate": 4.9600915930650965e-05,
895
+ "loss": 0.8663,
896
+ "step": 46000
897
+ },
898
+ {
899
+ "epoch": 22.77,
900
+ "learning_rate": 4.829244357212954e-05,
901
+ "loss": 0.8618,
902
+ "step": 46400
903
+ },
904
+ {
905
+ "epoch": 22.96,
906
+ "learning_rate": 4.698397121360811e-05,
907
+ "loss": 0.8872,
908
+ "step": 46800
909
+ },
910
+ {
911
+ "epoch": 23.0,
912
+ "eval_f1": 0.003935013281715019,
913
+ "eval_loss": 7.400303363800049,
914
+ "eval_runtime": 115.223,
915
+ "eval_samples_per_second": 199.717,
916
+ "eval_steps_per_second": 3.124,
917
+ "step": 46874
918
+ },
919
+ {
920
+ "epoch": 23.16,
921
+ "learning_rate": 4.5675498855086694e-05,
922
+ "loss": 0.7942,
923
+ "step": 47200
924
+ },
925
+ {
926
+ "epoch": 23.36,
927
+ "learning_rate": 4.437029767746156e-05,
928
+ "loss": 0.803,
929
+ "step": 47600
930
+ },
931
+ {
932
+ "epoch": 23.55,
933
+ "learning_rate": 4.306182531894014e-05,
934
+ "loss": 0.7919,
935
+ "step": 48000
936
+ },
937
+ {
938
+ "epoch": 23.75,
939
+ "learning_rate": 4.175335296041871e-05,
940
+ "loss": 0.8018,
941
+ "step": 48400
942
+ },
943
+ {
944
+ "epoch": 23.95,
945
+ "learning_rate": 4.044488060189729e-05,
946
+ "loss": 0.8156,
947
+ "step": 48800
948
+ },
949
+ {
950
+ "epoch": 24.0,
951
+ "eval_f1": 0.003947052519025861,
952
+ "eval_loss": 7.488423824310303,
953
+ "eval_runtime": 106.3546,
954
+ "eval_samples_per_second": 216.371,
955
+ "eval_steps_per_second": 3.385,
956
+ "step": 48912
957
+ },
958
+ {
959
+ "epoch": 24.14,
960
+ "learning_rate": 3.913640824337586e-05,
961
+ "loss": 0.7462,
962
+ "step": 49200
963
+ },
964
+ {
965
+ "epoch": 24.34,
966
+ "learning_rate": 3.782793588485443e-05,
967
+ "loss": 0.7192,
968
+ "step": 49600
969
+ },
970
+ {
971
+ "epoch": 24.53,
972
+ "learning_rate": 3.6519463526333006e-05,
973
+ "loss": 0.7476,
974
+ "step": 50000
975
+ },
976
+ {
977
+ "epoch": 24.73,
978
+ "learning_rate": 3.521099116781158e-05,
979
+ "loss": 0.7602,
980
+ "step": 50400
981
+ },
982
+ {
983
+ "epoch": 24.93,
984
+ "learning_rate": 3.3902518809290154e-05,
985
+ "loss": 0.7544,
986
+ "step": 50800
987
+ },
988
+ {
989
+ "epoch": 25.0,
990
+ "eval_f1": 0.003877386201510839,
991
+ "eval_loss": 7.476434707641602,
992
+ "eval_runtime": 105.8268,
993
+ "eval_samples_per_second": 217.45,
994
+ "eval_steps_per_second": 3.402,
995
+ "step": 50950
996
+ },
997
+ {
998
+ "epoch": 25.12,
999
+ "learning_rate": 3.259404645076873e-05,
1000
+ "loss": 0.7114,
1001
+ "step": 51200
1002
+ },
1003
+ {
1004
+ "epoch": 25.32,
1005
+ "learning_rate": 3.12855740922473e-05,
1006
+ "loss": 0.683,
1007
+ "step": 51600
1008
+ },
1009
+ {
1010
+ "epoch": 25.52,
1011
+ "learning_rate": 2.998037291462218e-05,
1012
+ "loss": 0.6972,
1013
+ "step": 52000
1014
+ },
1015
+ {
1016
+ "epoch": 25.71,
1017
+ "learning_rate": 2.8671900556100754e-05,
1018
+ "loss": 0.7087,
1019
+ "step": 52400
1020
+ },
1021
+ {
1022
+ "epoch": 25.91,
1023
+ "learning_rate": 2.736669937847563e-05,
1024
+ "loss": 0.6893,
1025
+ "step": 52800
1026
+ },
1027
+ {
1028
+ "epoch": 26.0,
1029
+ "eval_f1": 0.004237152566679073,
1030
+ "eval_loss": 7.515334606170654,
1031
+ "eval_runtime": 104.8855,
1032
+ "eval_samples_per_second": 219.401,
1033
+ "eval_steps_per_second": 3.432,
1034
+ "step": 52988
1035
+ },
1036
+ {
1037
+ "epoch": 26.1,
1038
+ "learning_rate": 2.6058227019954207e-05,
1039
+ "loss": 0.6589,
1040
+ "step": 53200
1041
+ },
1042
+ {
1043
+ "epoch": 26.3,
1044
+ "learning_rate": 2.4749754661432777e-05,
1045
+ "loss": 0.6609,
1046
+ "step": 53600
1047
+ },
1048
+ {
1049
+ "epoch": 26.5,
1050
+ "learning_rate": 2.344128230291135e-05,
1051
+ "loss": 0.6485,
1052
+ "step": 54000
1053
+ },
1054
+ {
1055
+ "epoch": 26.69,
1056
+ "learning_rate": 2.2132809944389925e-05,
1057
+ "loss": 0.667,
1058
+ "step": 54400
1059
+ },
1060
+ {
1061
+ "epoch": 26.89,
1062
+ "learning_rate": 2.08243375858685e-05,
1063
+ "loss": 0.6767,
1064
+ "step": 54800
1065
+ },
1066
+ {
1067
+ "epoch": 27.0,
1068
+ "eval_f1": 0.004296555657055152,
1069
+ "eval_loss": 7.542654514312744,
1070
+ "eval_runtime": 104.6361,
1071
+ "eval_samples_per_second": 219.924,
1072
+ "eval_steps_per_second": 3.44,
1073
+ "step": 55026
1074
+ },
1075
+ {
1076
+ "epoch": 27.09,
1077
+ "learning_rate": 1.9515865227347073e-05,
1078
+ "loss": 0.639,
1079
+ "step": 55200
1080
+ },
1081
+ {
1082
+ "epoch": 27.28,
1083
+ "learning_rate": 1.8207392868825644e-05,
1084
+ "loss": 0.6216,
1085
+ "step": 55600
1086
+ },
1087
+ {
1088
+ "epoch": 27.48,
1089
+ "learning_rate": 1.689892051030422e-05,
1090
+ "loss": 0.627,
1091
+ "step": 56000
1092
+ },
1093
+ {
1094
+ "epoch": 27.67,
1095
+ "learning_rate": 1.5590448151782795e-05,
1096
+ "loss": 0.613,
1097
+ "step": 56400
1098
+ },
1099
+ {
1100
+ "epoch": 27.87,
1101
+ "learning_rate": 1.428524697415767e-05,
1102
+ "loss": 0.6098,
1103
+ "step": 56800
1104
+ },
1105
+ {
1106
+ "epoch": 28.0,
1107
+ "eval_f1": 0.004164219476454643,
1108
+ "eval_loss": 7.554748058319092,
1109
+ "eval_runtime": 103.9745,
1110
+ "eval_samples_per_second": 221.324,
1111
+ "eval_steps_per_second": 3.462,
1112
+ "step": 57064
1113
+ },
1114
+ {
1115
+ "epoch": 28.07,
1116
+ "learning_rate": 1.2976774615636244e-05,
1117
+ "loss": 0.6228,
1118
+ "step": 57200
1119
+ },
1120
+ {
1121
+ "epoch": 28.26,
1122
+ "learning_rate": 1.1668302257114818e-05,
1123
+ "loss": 0.5921,
1124
+ "step": 57600
1125
+ },
1126
+ {
1127
+ "epoch": 28.46,
1128
+ "learning_rate": 1.0359829898593392e-05,
1129
+ "loss": 0.6023,
1130
+ "step": 58000
1131
+ },
1132
+ {
1133
+ "epoch": 28.66,
1134
+ "learning_rate": 9.051357540071966e-06,
1135
+ "loss": 0.5876,
1136
+ "step": 58400
1137
+ },
1138
+ {
1139
+ "epoch": 28.85,
1140
+ "learning_rate": 7.746156362446843e-06,
1141
+ "loss": 0.5871,
1142
+ "step": 58800
1143
+ },
1144
+ {
1145
+ "epoch": 29.0,
1146
+ "eval_f1": 0.00413622990961682,
1147
+ "eval_loss": 7.553284645080566,
1148
+ "eval_runtime": 104.0213,
1149
+ "eval_samples_per_second": 221.224,
1150
+ "eval_steps_per_second": 3.461,
1151
+ "step": 59102
1152
+ },
1153
+ {
1154
+ "epoch": 29.05,
1155
+ "learning_rate": 6.437684003925418e-06,
1156
+ "loss": 0.5787,
1157
+ "step": 59200
1158
+ },
1159
+ {
1160
+ "epoch": 29.24,
1161
+ "learning_rate": 5.129211645403991e-06,
1162
+ "loss": 0.5671,
1163
+ "step": 59600
1164
+ },
1165
+ {
1166
+ "epoch": 29.44,
1167
+ "learning_rate": 3.820739286882564e-06,
1168
+ "loss": 0.5596,
1169
+ "step": 60000
1170
+ },
1171
+ {
1172
+ "epoch": 29.64,
1173
+ "learning_rate": 2.5122669283611384e-06,
1174
+ "loss": 0.5595,
1175
+ "step": 60400
1176
+ },
1177
+ {
1178
+ "epoch": 29.83,
1179
+ "learning_rate": 1.2070657507360158e-06,
1180
+ "loss": 0.5696,
1181
+ "step": 60800
1182
+ },
1183
+ {
1184
+ "epoch": 30.0,
1185
+ "eval_f1": 0.004123942255601062,
1186
+ "eval_loss": 7.559465408325195,
1187
+ "eval_runtime": 98.4712,
1188
+ "eval_samples_per_second": 233.693,
1189
+ "eval_steps_per_second": 3.656,
1190
+ "step": 61140
1191
+ },
1192
+ {
1193
+ "epoch": 30.0,
1194
+ "step": 61140,
1195
+ "total_flos": 3.687496208074506e+20,
1196
+ "train_loss": 2.558345020757144,
1197
+ "train_runtime": 41575.9952,
1198
+ "train_samples_per_second": 94.093,
1199
+ "train_steps_per_second": 1.471
1200
+ }
1201
+ ],
1202
+ "max_steps": 61140,
1203
+ "num_train_epochs": 30,
1204
+ "total_flos": 3.687496208074506e+20,
1205
+ "trial_name": null,
1206
+ "trial_params": null
1207
+ }