crncskn committed on
Commit
124dffa
1 Parent(s): 79ae8b2

End of training

Browse files
README.md CHANGED
@@ -1,5 +1,6 @@
1
  ---
2
  tags:
 
3
  - generated_from_trainer
4
  datasets:
5
  - imagefolder
@@ -13,7 +14,9 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # radiovers16v
15
 
16
- This model is a fine-tuned version of [](https://huggingface.co/) on the imagefolder dataset.
 
 
17
 
18
  ## Model description
19
 
 
1
  ---
2
  tags:
3
+ - masked-auto-encoding
4
  - generated_from_trainer
5
  datasets:
6
  - imagefolder
 
14
 
15
  # radiovers16v
16
 
17
+ This model is a fine-tuned version of [](https://huggingface.co/) on the /kaggle/radioai/radiology_ai dataset.
18
+ It achieves the following results on the evaluation set:
19
+ - Loss: 0.4036
20
 
21
  ## Model description
22
 
all_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "eval_loss": 0.40356436371803284,
4
+ "eval_runtime": 706.3909,
5
+ "eval_samples_per_second": 172.196,
6
+ "eval_steps_per_second": 21.525,
7
+ "train_loss": 0.47529568801970173,
8
+ "train_runtime": 7092.3542,
9
+ "train_samples_per_second": 76.223,
10
+ "train_steps_per_second": 9.531
11
+ }
eval_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "eval_loss": 0.40356436371803284,
4
+ "eval_runtime": 706.3909,
5
+ "eval_samples_per_second": 172.196,
6
+ "eval_steps_per_second": 21.525
7
+ }
runs/Mar11_17-03-21_bf1b508326f5/events.out.tfevents.1710184514.bf1b508326f5.24615.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e6994461c78758cd528cdb289c24c28035aae7bfb44bac60a9f7808347e8712
3
+ size 364
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "train_loss": 0.47529568801970173,
4
+ "train_runtime": 7092.3542,
5
+ "train_samples_per_second": 76.223,
6
+ "train_steps_per_second": 9.531
7
+ }
trainer_state.json ADDED
@@ -0,0 +1,975 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 40.0,
5
+ "eval_steps": 500,
6
+ "global_step": 67600,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.3,
13
+ "grad_norm": 0.0696972981095314,
14
+ "learning_rate": 3.1018860946745563e-05,
15
+ "loss": 0.785,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.59,
20
+ "grad_norm": 0.1446572095155716,
21
+ "learning_rate": 3.0787721893491126e-05,
22
+ "loss": 0.7706,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.89,
27
+ "grad_norm": 0.2664627134799957,
28
+ "learning_rate": 3.055658284023669e-05,
29
+ "loss": 0.7614,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 1.18,
34
+ "grad_norm": 0.2499350905418396,
35
+ "learning_rate": 3.032544378698225e-05,
36
+ "loss": 0.7472,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 1.48,
41
+ "grad_norm": 0.37448567152023315,
42
+ "learning_rate": 3.009430473372781e-05,
43
+ "loss": 0.7371,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 1.78,
48
+ "grad_norm": 0.5009666085243225,
49
+ "learning_rate": 2.9863165680473374e-05,
50
+ "loss": 0.7284,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 2.07,
55
+ "grad_norm": 0.4846726357936859,
56
+ "learning_rate": 2.9632026627218937e-05,
57
+ "loss": 0.707,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 2.37,
62
+ "grad_norm": 0.5636312365531921,
63
+ "learning_rate": 2.94008875739645e-05,
64
+ "loss": 0.692,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 2.66,
69
+ "grad_norm": 0.9609221816062927,
70
+ "learning_rate": 2.9169748520710063e-05,
71
+ "loss": 0.6832,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 2.96,
76
+ "grad_norm": 0.6074559092521667,
77
+ "learning_rate": 2.893860946745562e-05,
78
+ "loss": 0.6713,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 3.25,
83
+ "grad_norm": 0.6442582011222839,
84
+ "learning_rate": 2.8707470414201182e-05,
85
+ "loss": 0.6624,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 3.55,
90
+ "grad_norm": 0.52489173412323,
91
+ "learning_rate": 2.8476331360946745e-05,
92
+ "loss": 0.6533,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 3.85,
97
+ "grad_norm": 0.6586915254592896,
98
+ "learning_rate": 2.8245192307692307e-05,
99
+ "loss": 0.6414,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 4.14,
104
+ "grad_norm": 0.7074326276779175,
105
+ "learning_rate": 2.801405325443787e-05,
106
+ "loss": 0.632,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 4.44,
111
+ "grad_norm": 0.7869051694869995,
112
+ "learning_rate": 2.7782914201183433e-05,
113
+ "loss": 0.625,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 4.73,
118
+ "grad_norm": 0.7031483054161072,
119
+ "learning_rate": 2.7551775147928993e-05,
120
+ "loss": 0.6116,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 5.03,
125
+ "grad_norm": 0.7165437340736389,
126
+ "learning_rate": 2.7320636094674555e-05,
127
+ "loss": 0.6067,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 5.33,
132
+ "grad_norm": 0.6308349967002869,
133
+ "learning_rate": 2.708949704142012e-05,
134
+ "loss": 0.594,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 5.62,
139
+ "grad_norm": 0.7305271625518799,
140
+ "learning_rate": 2.685835798816568e-05,
141
+ "loss": 0.5837,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 5.92,
146
+ "grad_norm": 0.8089825510978699,
147
+ "learning_rate": 2.6627218934911244e-05,
148
+ "loss": 0.5725,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 6.21,
153
+ "grad_norm": 0.7770040035247803,
154
+ "learning_rate": 2.6396079881656807e-05,
155
+ "loss": 0.5674,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 6.51,
160
+ "grad_norm": 0.7730346322059631,
161
+ "learning_rate": 2.6164940828402366e-05,
162
+ "loss": 0.5567,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 6.8,
167
+ "grad_norm": 0.6454223990440369,
168
+ "learning_rate": 2.593380177514793e-05,
169
+ "loss": 0.5486,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 7.1,
174
+ "grad_norm": 0.5882906317710876,
175
+ "learning_rate": 2.5702662721893492e-05,
176
+ "loss": 0.5396,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 7.4,
181
+ "grad_norm": 0.8279200792312622,
182
+ "learning_rate": 2.5471523668639055e-05,
183
+ "loss": 0.5309,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 7.69,
188
+ "grad_norm": 0.8009528517723083,
189
+ "learning_rate": 2.5240384615384618e-05,
190
+ "loss": 0.5288,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 7.99,
195
+ "grad_norm": 0.7412715554237366,
196
+ "learning_rate": 2.500924556213018e-05,
197
+ "loss": 0.5198,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 8.28,
202
+ "grad_norm": 0.9230983853340149,
203
+ "learning_rate": 2.4778106508875743e-05,
204
+ "loss": 0.5163,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 8.58,
209
+ "grad_norm": 0.7999468445777893,
210
+ "learning_rate": 2.45469674556213e-05,
211
+ "loss": 0.5131,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 8.88,
216
+ "grad_norm": 0.7494385838508606,
217
+ "learning_rate": 2.4315828402366862e-05,
218
+ "loss": 0.506,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 9.17,
223
+ "grad_norm": 0.7356762886047363,
224
+ "learning_rate": 2.4084689349112425e-05,
225
+ "loss": 0.5036,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 9.47,
230
+ "grad_norm": 0.8011249303817749,
231
+ "learning_rate": 2.3853550295857988e-05,
232
+ "loss": 0.497,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 9.76,
237
+ "grad_norm": 0.713610827922821,
238
+ "learning_rate": 2.362241124260355e-05,
239
+ "loss": 0.4967,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 10.06,
244
+ "grad_norm": 0.8254227042198181,
245
+ "learning_rate": 2.3391272189349114e-05,
246
+ "loss": 0.4911,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 10.36,
251
+ "grad_norm": 0.7040392756462097,
252
+ "learning_rate": 2.3160133136094673e-05,
253
+ "loss": 0.4859,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 10.65,
258
+ "grad_norm": 0.7733869552612305,
259
+ "learning_rate": 2.2928994082840236e-05,
260
+ "loss": 0.4858,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 10.95,
265
+ "grad_norm": 0.8573015928268433,
266
+ "learning_rate": 2.26978550295858e-05,
267
+ "loss": 0.4829,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 11.24,
272
+ "grad_norm": 0.693286657333374,
273
+ "learning_rate": 2.2466715976331362e-05,
274
+ "loss": 0.4755,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 11.54,
279
+ "grad_norm": 0.7536494135856628,
280
+ "learning_rate": 2.2235576923076925e-05,
281
+ "loss": 0.4768,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 11.83,
286
+ "grad_norm": 0.6219621896743774,
287
+ "learning_rate": 2.2004437869822487e-05,
288
+ "loss": 0.4757,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 12.13,
293
+ "grad_norm": 0.7244569063186646,
294
+ "learning_rate": 2.1773298816568047e-05,
295
+ "loss": 0.473,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 12.43,
300
+ "grad_norm": 0.7847468852996826,
301
+ "learning_rate": 2.154215976331361e-05,
302
+ "loss": 0.4696,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 12.72,
307
+ "grad_norm": 0.7616731524467468,
308
+ "learning_rate": 2.1311020710059173e-05,
309
+ "loss": 0.4696,
310
+ "step": 21500
311
+ },
312
+ {
313
+ "epoch": 13.02,
314
+ "grad_norm": 0.7453758716583252,
315
+ "learning_rate": 2.1079881656804735e-05,
316
+ "loss": 0.4687,
317
+ "step": 22000
318
+ },
319
+ {
320
+ "epoch": 13.31,
321
+ "grad_norm": 0.6706910729408264,
322
+ "learning_rate": 2.0848742603550298e-05,
323
+ "loss": 0.4624,
324
+ "step": 22500
325
+ },
326
+ {
327
+ "epoch": 13.61,
328
+ "grad_norm": 0.819572389125824,
329
+ "learning_rate": 2.061760355029586e-05,
330
+ "loss": 0.4603,
331
+ "step": 23000
332
+ },
333
+ {
334
+ "epoch": 13.91,
335
+ "grad_norm": 0.6898177266120911,
336
+ "learning_rate": 2.0386464497041417e-05,
337
+ "loss": 0.4599,
338
+ "step": 23500
339
+ },
340
+ {
341
+ "epoch": 14.2,
342
+ "grad_norm": 0.6775723099708557,
343
+ "learning_rate": 2.015532544378698e-05,
344
+ "loss": 0.4622,
345
+ "step": 24000
346
+ },
347
+ {
348
+ "epoch": 14.5,
349
+ "grad_norm": 0.7278532385826111,
350
+ "learning_rate": 1.9924186390532543e-05,
351
+ "loss": 0.4573,
352
+ "step": 24500
353
+ },
354
+ {
355
+ "epoch": 14.79,
356
+ "grad_norm": 0.6195204257965088,
357
+ "learning_rate": 1.9693047337278106e-05,
358
+ "loss": 0.4536,
359
+ "step": 25000
360
+ },
361
+ {
362
+ "epoch": 15.09,
363
+ "grad_norm": 0.6975180506706238,
364
+ "learning_rate": 1.946190828402367e-05,
365
+ "loss": 0.4532,
366
+ "step": 25500
367
+ },
368
+ {
369
+ "epoch": 15.38,
370
+ "grad_norm": 0.7116599678993225,
371
+ "learning_rate": 1.923076923076923e-05,
372
+ "loss": 0.4521,
373
+ "step": 26000
374
+ },
375
+ {
376
+ "epoch": 15.68,
377
+ "grad_norm": 0.6533932685852051,
378
+ "learning_rate": 1.8999630177514794e-05,
379
+ "loss": 0.4513,
380
+ "step": 26500
381
+ },
382
+ {
383
+ "epoch": 15.98,
384
+ "grad_norm": 0.580528736114502,
385
+ "learning_rate": 1.8768491124260354e-05,
386
+ "loss": 0.4518,
387
+ "step": 27000
388
+ },
389
+ {
390
+ "epoch": 16.27,
391
+ "grad_norm": 0.8283082842826843,
392
+ "learning_rate": 1.8537352071005917e-05,
393
+ "loss": 0.4473,
394
+ "step": 27500
395
+ },
396
+ {
397
+ "epoch": 16.57,
398
+ "grad_norm": 0.6264183521270752,
399
+ "learning_rate": 1.830621301775148e-05,
400
+ "loss": 0.4466,
401
+ "step": 28000
402
+ },
403
+ {
404
+ "epoch": 16.86,
405
+ "grad_norm": 0.6502621173858643,
406
+ "learning_rate": 1.8075073964497042e-05,
407
+ "loss": 0.446,
408
+ "step": 28500
409
+ },
410
+ {
411
+ "epoch": 17.16,
412
+ "grad_norm": 0.6924391984939575,
413
+ "learning_rate": 1.7843934911242605e-05,
414
+ "loss": 0.4433,
415
+ "step": 29000
416
+ },
417
+ {
418
+ "epoch": 17.46,
419
+ "grad_norm": 0.631476879119873,
420
+ "learning_rate": 1.7612795857988168e-05,
421
+ "loss": 0.4446,
422
+ "step": 29500
423
+ },
424
+ {
425
+ "epoch": 17.75,
426
+ "grad_norm": 0.6945323348045349,
427
+ "learning_rate": 1.7381656804733727e-05,
428
+ "loss": 0.4451,
429
+ "step": 30000
430
+ },
431
+ {
432
+ "epoch": 18.05,
433
+ "grad_norm": 0.6200039386749268,
434
+ "learning_rate": 1.715051775147929e-05,
435
+ "loss": 0.4434,
436
+ "step": 30500
437
+ },
438
+ {
439
+ "epoch": 18.34,
440
+ "grad_norm": 0.6730862259864807,
441
+ "learning_rate": 1.6919378698224853e-05,
442
+ "loss": 0.4426,
443
+ "step": 31000
444
+ },
445
+ {
446
+ "epoch": 18.64,
447
+ "grad_norm": 0.6520936489105225,
448
+ "learning_rate": 1.6688239644970416e-05,
449
+ "loss": 0.4401,
450
+ "step": 31500
451
+ },
452
+ {
453
+ "epoch": 18.93,
454
+ "grad_norm": 0.7381883263587952,
455
+ "learning_rate": 1.645710059171598e-05,
456
+ "loss": 0.4381,
457
+ "step": 32000
458
+ },
459
+ {
460
+ "epoch": 19.23,
461
+ "grad_norm": 0.6962296962738037,
462
+ "learning_rate": 1.622596153846154e-05,
463
+ "loss": 0.4399,
464
+ "step": 32500
465
+ },
466
+ {
467
+ "epoch": 19.53,
468
+ "grad_norm": 0.5711750388145447,
469
+ "learning_rate": 1.5994822485207098e-05,
470
+ "loss": 0.4354,
471
+ "step": 33000
472
+ },
473
+ {
474
+ "epoch": 19.82,
475
+ "grad_norm": 0.6115343570709229,
476
+ "learning_rate": 1.576368343195266e-05,
477
+ "loss": 0.437,
478
+ "step": 33500
479
+ },
480
+ {
481
+ "epoch": 20.12,
482
+ "grad_norm": 0.6140381693840027,
483
+ "learning_rate": 1.5532544378698223e-05,
484
+ "loss": 0.4341,
485
+ "step": 34000
486
+ },
487
+ {
488
+ "epoch": 20.41,
489
+ "grad_norm": 0.648704469203949,
490
+ "learning_rate": 1.5301405325443786e-05,
491
+ "loss": 0.4337,
492
+ "step": 34500
493
+ },
494
+ {
495
+ "epoch": 20.71,
496
+ "grad_norm": 0.6556956171989441,
497
+ "learning_rate": 1.507026627218935e-05,
498
+ "loss": 0.4333,
499
+ "step": 35000
500
+ },
501
+ {
502
+ "epoch": 21.01,
503
+ "grad_norm": 0.7024092674255371,
504
+ "learning_rate": 1.483912721893491e-05,
505
+ "loss": 0.4357,
506
+ "step": 35500
507
+ },
508
+ {
509
+ "epoch": 21.3,
510
+ "grad_norm": 0.5994529128074646,
511
+ "learning_rate": 1.4607988165680473e-05,
512
+ "loss": 0.4311,
513
+ "step": 36000
514
+ },
515
+ {
516
+ "epoch": 21.6,
517
+ "grad_norm": 0.599431037902832,
518
+ "learning_rate": 1.4376849112426036e-05,
519
+ "loss": 0.4334,
520
+ "step": 36500
521
+ },
522
+ {
523
+ "epoch": 21.89,
524
+ "grad_norm": 0.6323761343955994,
525
+ "learning_rate": 1.4145710059171597e-05,
526
+ "loss": 0.4298,
527
+ "step": 37000
528
+ },
529
+ {
530
+ "epoch": 22.19,
531
+ "grad_norm": 0.6665933132171631,
532
+ "learning_rate": 1.391457100591716e-05,
533
+ "loss": 0.4281,
534
+ "step": 37500
535
+ },
536
+ {
537
+ "epoch": 22.49,
538
+ "grad_norm": 0.6103574633598328,
539
+ "learning_rate": 1.3683431952662723e-05,
540
+ "loss": 0.4311,
541
+ "step": 38000
542
+ },
543
+ {
544
+ "epoch": 22.78,
545
+ "grad_norm": 0.5954911708831787,
546
+ "learning_rate": 1.3452292899408284e-05,
547
+ "loss": 0.4277,
548
+ "step": 38500
549
+ },
550
+ {
551
+ "epoch": 23.08,
552
+ "grad_norm": 0.5706931352615356,
553
+ "learning_rate": 1.3221153846153847e-05,
554
+ "loss": 0.4278,
555
+ "step": 39000
556
+ },
557
+ {
558
+ "epoch": 23.37,
559
+ "grad_norm": 0.5817924737930298,
560
+ "learning_rate": 1.299001479289941e-05,
561
+ "loss": 0.4239,
562
+ "step": 39500
563
+ },
564
+ {
565
+ "epoch": 23.67,
566
+ "grad_norm": 0.591736912727356,
567
+ "learning_rate": 1.2758875739644969e-05,
568
+ "loss": 0.428,
569
+ "step": 40000
570
+ },
571
+ {
572
+ "epoch": 23.96,
573
+ "grad_norm": 0.6267042756080627,
574
+ "learning_rate": 1.2527736686390532e-05,
575
+ "loss": 0.4275,
576
+ "step": 40500
577
+ },
578
+ {
579
+ "epoch": 24.26,
580
+ "grad_norm": 0.5819630026817322,
581
+ "learning_rate": 1.2296597633136095e-05,
582
+ "loss": 0.4262,
583
+ "step": 41000
584
+ },
585
+ {
586
+ "epoch": 24.56,
587
+ "grad_norm": 0.615161657333374,
588
+ "learning_rate": 1.2065458579881656e-05,
589
+ "loss": 0.4235,
590
+ "step": 41500
591
+ },
592
+ {
593
+ "epoch": 24.85,
594
+ "grad_norm": 0.7147814631462097,
595
+ "learning_rate": 1.1834319526627219e-05,
596
+ "loss": 0.423,
597
+ "step": 42000
598
+ },
599
+ {
600
+ "epoch": 25.15,
601
+ "grad_norm": 0.7751194834709167,
602
+ "learning_rate": 1.1603180473372782e-05,
603
+ "loss": 0.422,
604
+ "step": 42500
605
+ },
606
+ {
607
+ "epoch": 25.44,
608
+ "grad_norm": 0.674323320388794,
609
+ "learning_rate": 1.1372041420118345e-05,
610
+ "loss": 0.4207,
611
+ "step": 43000
612
+ },
613
+ {
614
+ "epoch": 25.74,
615
+ "grad_norm": 0.6965672969818115,
616
+ "learning_rate": 1.1140902366863906e-05,
617
+ "loss": 0.4244,
618
+ "step": 43500
619
+ },
620
+ {
621
+ "epoch": 26.04,
622
+ "grad_norm": 0.6351442337036133,
623
+ "learning_rate": 1.0909763313609469e-05,
624
+ "loss": 0.4228,
625
+ "step": 44000
626
+ },
627
+ {
628
+ "epoch": 26.33,
629
+ "grad_norm": 0.590655505657196,
630
+ "learning_rate": 1.0678624260355031e-05,
631
+ "loss": 0.4207,
632
+ "step": 44500
633
+ },
634
+ {
635
+ "epoch": 26.63,
636
+ "grad_norm": 0.6553508639335632,
637
+ "learning_rate": 1.044748520710059e-05,
638
+ "loss": 0.422,
639
+ "step": 45000
640
+ },
641
+ {
642
+ "epoch": 26.92,
643
+ "grad_norm": 0.6216753721237183,
644
+ "learning_rate": 1.0216346153846154e-05,
645
+ "loss": 0.4196,
646
+ "step": 45500
647
+ },
648
+ {
649
+ "epoch": 27.22,
650
+ "grad_norm": 0.6628888249397278,
651
+ "learning_rate": 9.985207100591717e-06,
652
+ "loss": 0.4194,
653
+ "step": 46000
654
+ },
655
+ {
656
+ "epoch": 27.51,
657
+ "grad_norm": 0.6111788749694824,
658
+ "learning_rate": 9.754068047337278e-06,
659
+ "loss": 0.4189,
660
+ "step": 46500
661
+ },
662
+ {
663
+ "epoch": 27.81,
664
+ "grad_norm": 0.5751132965087891,
665
+ "learning_rate": 9.52292899408284e-06,
666
+ "loss": 0.4182,
667
+ "step": 47000
668
+ },
669
+ {
670
+ "epoch": 28.11,
671
+ "grad_norm": 0.6333842873573303,
672
+ "learning_rate": 9.291789940828403e-06,
673
+ "loss": 0.4172,
674
+ "step": 47500
675
+ },
676
+ {
677
+ "epoch": 28.4,
678
+ "grad_norm": 0.5846462845802307,
679
+ "learning_rate": 9.060650887573965e-06,
680
+ "loss": 0.417,
681
+ "step": 48000
682
+ },
683
+ {
684
+ "epoch": 28.7,
685
+ "grad_norm": 0.5921066999435425,
686
+ "learning_rate": 8.829511834319527e-06,
687
+ "loss": 0.4178,
688
+ "step": 48500
689
+ },
690
+ {
691
+ "epoch": 28.99,
692
+ "grad_norm": 0.6645215153694153,
693
+ "learning_rate": 8.59837278106509e-06,
694
+ "loss": 0.4166,
695
+ "step": 49000
696
+ },
697
+ {
698
+ "epoch": 29.29,
699
+ "grad_norm": 0.6453720331192017,
700
+ "learning_rate": 8.36723372781065e-06,
701
+ "loss": 0.4142,
702
+ "step": 49500
703
+ },
704
+ {
705
+ "epoch": 29.59,
706
+ "grad_norm": 0.6401262283325195,
707
+ "learning_rate": 8.136094674556213e-06,
708
+ "loss": 0.4152,
709
+ "step": 50000
710
+ },
711
+ {
712
+ "epoch": 29.88,
713
+ "grad_norm": 0.6776517033576965,
714
+ "learning_rate": 7.904955621301775e-06,
715
+ "loss": 0.415,
716
+ "step": 50500
717
+ },
718
+ {
719
+ "epoch": 30.18,
720
+ "grad_norm": 0.6697096228599548,
721
+ "learning_rate": 7.673816568047338e-06,
722
+ "loss": 0.4116,
723
+ "step": 51000
724
+ },
725
+ {
726
+ "epoch": 30.47,
727
+ "grad_norm": 0.6276474595069885,
728
+ "learning_rate": 7.442677514792899e-06,
729
+ "loss": 0.4127,
730
+ "step": 51500
731
+ },
732
+ {
733
+ "epoch": 30.77,
734
+ "grad_norm": 0.7491399049758911,
735
+ "learning_rate": 7.211538461538462e-06,
736
+ "loss": 0.4188,
737
+ "step": 52000
738
+ },
739
+ {
740
+ "epoch": 31.07,
741
+ "grad_norm": 0.7292032837867737,
742
+ "learning_rate": 6.980399408284024e-06,
743
+ "loss": 0.4141,
744
+ "step": 52500
745
+ },
746
+ {
747
+ "epoch": 31.36,
748
+ "grad_norm": 0.6432758569717407,
749
+ "learning_rate": 6.749260355029585e-06,
750
+ "loss": 0.4109,
751
+ "step": 53000
752
+ },
753
+ {
754
+ "epoch": 31.66,
755
+ "grad_norm": 0.7142419815063477,
756
+ "learning_rate": 6.518121301775148e-06,
757
+ "loss": 0.4124,
758
+ "step": 53500
759
+ },
760
+ {
761
+ "epoch": 31.95,
762
+ "grad_norm": 0.737123966217041,
763
+ "learning_rate": 6.28698224852071e-06,
764
+ "loss": 0.4106,
765
+ "step": 54000
766
+ },
767
+ {
768
+ "epoch": 32.25,
769
+ "grad_norm": 0.6416926980018616,
770
+ "learning_rate": 6.055843195266272e-06,
771
+ "loss": 0.4131,
772
+ "step": 54500
773
+ },
774
+ {
775
+ "epoch": 32.54,
776
+ "grad_norm": 0.8014604449272156,
777
+ "learning_rate": 5.824704142011835e-06,
778
+ "loss": 0.4113,
779
+ "step": 55000
780
+ },
781
+ {
782
+ "epoch": 32.84,
783
+ "grad_norm": 0.745812714099884,
784
+ "learning_rate": 5.593565088757396e-06,
785
+ "loss": 0.4119,
786
+ "step": 55500
787
+ },
788
+ {
789
+ "epoch": 33.14,
790
+ "grad_norm": 0.753264307975769,
791
+ "learning_rate": 5.362426035502958e-06,
792
+ "loss": 0.4109,
793
+ "step": 56000
794
+ },
795
+ {
796
+ "epoch": 33.43,
797
+ "grad_norm": 0.7509620189666748,
798
+ "learning_rate": 5.131286982248521e-06,
799
+ "loss": 0.4094,
800
+ "step": 56500
801
+ },
802
+ {
803
+ "epoch": 33.73,
804
+ "grad_norm": 0.6729797720909119,
805
+ "learning_rate": 4.900147928994083e-06,
806
+ "loss": 0.4086,
807
+ "step": 57000
808
+ },
809
+ {
810
+ "epoch": 34.02,
811
+ "grad_norm": 0.6696120500564575,
812
+ "learning_rate": 4.669008875739646e-06,
813
+ "loss": 0.4109,
814
+ "step": 57500
815
+ },
816
+ {
817
+ "epoch": 34.32,
818
+ "grad_norm": 0.591411828994751,
819
+ "learning_rate": 4.437869822485207e-06,
820
+ "loss": 0.4082,
821
+ "step": 58000
822
+ },
823
+ {
824
+ "epoch": 34.62,
825
+ "grad_norm": 0.6425775289535522,
826
+ "learning_rate": 4.206730769230769e-06,
827
+ "loss": 0.4096,
828
+ "step": 58500
829
+ },
830
+ {
831
+ "epoch": 34.91,
832
+ "grad_norm": 0.6289854645729065,
833
+ "learning_rate": 3.975591715976332e-06,
834
+ "loss": 0.4101,
835
+ "step": 59000
836
+ },
837
+ {
838
+ "epoch": 35.21,
839
+ "grad_norm": 0.6215291023254395,
840
+ "learning_rate": 3.7444526627218935e-06,
841
+ "loss": 0.4094,
842
+ "step": 59500
843
+ },
844
+ {
845
+ "epoch": 35.5,
846
+ "grad_norm": 0.7314534783363342,
847
+ "learning_rate": 3.513313609467456e-06,
848
+ "loss": 0.4062,
849
+ "step": 60000
850
+ },
851
+ {
852
+ "epoch": 35.8,
853
+ "grad_norm": 0.6580629348754883,
854
+ "learning_rate": 3.2821745562130175e-06,
855
+ "loss": 0.4078,
856
+ "step": 60500
857
+ },
858
+ {
859
+ "epoch": 36.09,
860
+ "grad_norm": 0.6824979186058044,
861
+ "learning_rate": 3.05103550295858e-06,
862
+ "loss": 0.4098,
863
+ "step": 61000
864
+ },
865
+ {
866
+ "epoch": 36.39,
867
+ "grad_norm": 0.6403664946556091,
868
+ "learning_rate": 2.8198964497041423e-06,
869
+ "loss": 0.4078,
870
+ "step": 61500
871
+ },
872
+ {
873
+ "epoch": 36.69,
874
+ "grad_norm": 0.6590360999107361,
875
+ "learning_rate": 2.5887573964497043e-06,
876
+ "loss": 0.4076,
877
+ "step": 62000
878
+ },
879
+ {
880
+ "epoch": 36.98,
881
+ "grad_norm": 0.8094596862792969,
882
+ "learning_rate": 2.3576183431952663e-06,
883
+ "loss": 0.4059,
884
+ "step": 62500
885
+ },
886
+ {
887
+ "epoch": 37.28,
888
+ "grad_norm": 0.6609135270118713,
889
+ "learning_rate": 2.1264792899408283e-06,
890
+ "loss": 0.4065,
891
+ "step": 63000
892
+ },
893
+ {
894
+ "epoch": 37.57,
895
+ "grad_norm": 0.6913712024688721,
896
+ "learning_rate": 1.8953402366863905e-06,
897
+ "loss": 0.4033,
898
+ "step": 63500
899
+ },
900
+ {
901
+ "epoch": 37.87,
902
+ "grad_norm": 0.6110714077949524,
903
+ "learning_rate": 1.6642011834319528e-06,
904
+ "loss": 0.4067,
905
+ "step": 64000
906
+ },
907
+ {
908
+ "epoch": 38.17,
909
+ "grad_norm": 0.7397092580795288,
910
+ "learning_rate": 1.4330621301775148e-06,
911
+ "loss": 0.4056,
912
+ "step": 64500
913
+ },
914
+ {
915
+ "epoch": 38.46,
916
+ "grad_norm": 0.6656752824783325,
917
+ "learning_rate": 1.201923076923077e-06,
918
+ "loss": 0.403,
919
+ "step": 65000
920
+ },
921
+ {
922
+ "epoch": 38.76,
923
+ "grad_norm": 0.70158451795578,
924
+ "learning_rate": 9.70784023668639e-07,
925
+ "loss": 0.4063,
926
+ "step": 65500
927
+ },
928
+ {
929
+ "epoch": 39.05,
930
+ "grad_norm": 0.7131240367889404,
931
+ "learning_rate": 7.396449704142012e-07,
932
+ "loss": 0.4058,
933
+ "step": 66000
934
+ },
935
+ {
936
+ "epoch": 39.35,
937
+ "grad_norm": 0.7075223922729492,
938
+ "learning_rate": 5.085059171597633e-07,
939
+ "loss": 0.4058,
940
+ "step": 66500
941
+ },
942
+ {
943
+ "epoch": 39.64,
944
+ "grad_norm": 0.7466796040534973,
945
+ "learning_rate": 2.7736686390532544e-07,
946
+ "loss": 0.4043,
947
+ "step": 67000
948
+ },
949
+ {
950
+ "epoch": 39.94,
951
+ "grad_norm": 0.6051456332206726,
952
+ "learning_rate": 4.6227810650887574e-08,
953
+ "loss": 0.4047,
954
+ "step": 67500
955
+ },
956
+ {
957
+ "epoch": 40.0,
958
+ "step": 67600,
959
+ "total_flos": 5.463929616806707e+19,
960
+ "train_loss": 0.47529568801970173,
961
+ "train_runtime": 7092.3542,
962
+ "train_samples_per_second": 76.223,
963
+ "train_steps_per_second": 9.531
964
+ }
965
+ ],
966
+ "logging_steps": 500,
967
+ "max_steps": 67600,
968
+ "num_input_tokens_seen": 0,
969
+ "num_train_epochs": 40,
970
+ "save_steps": 1000000000,
971
+ "total_flos": 5.463929616806707e+19,
972
+ "train_batch_size": 8,
973
+ "trial_name": null,
974
+ "trial_params": null
975
+ }