kalita commited on
Commit
415bc54
1 Parent(s): 917c384

Model save

Browse files
README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [MCG-NJU/videomae-base-finetuned-ssv2](https://huggingface.co/MCG-NJU/videomae-base-finetuned-ssv2) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 3.8114
21
- - Accuracy: 0.375
22
 
23
  ## Model description
24
 
 
17
 
18
  This model is a fine-tuned version of [MCG-NJU/videomae-base-finetuned-ssv2](https://huggingface.co/MCG-NJU/videomae-base-finetuned-ssv2) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 3.1259
21
+ - Accuracy: 0.41
22
 
23
  ## Model description
24
 
all_results.json CHANGED
@@ -1,4 +1,8 @@
1
  {
2
- "eval_accuracy": 1.0,
3
- "eval_loss": 0.026988975703716278
 
 
 
 
4
  }
 
1
  {
2
+ "epoch": 7.12,
3
+ "eval_accuracy": 0.375,
4
+ "eval_loss": 3.811370849609375,
5
+ "eval_runtime": 14.4555,
6
+ "eval_samples_per_second": 1.107,
7
+ "eval_steps_per_second": 0.553
8
  }
runs/Apr04_11-37-03_ab9c0969efb8/events.out.tfevents.1712233586.ab9c0969efb8.34.1 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46e943c72a6b4fe5b7dcf14506d3fb35afd7cfeb836edd0d51131ec190430162
3
- size 411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39ab275d0def41e8a2ce80e3c3037ca98ef5be9fe1d94c3b26fc0dfe97a66bf6
3
+ size 1057
test_results.json CHANGED
@@ -1,4 +1,8 @@
1
  {
2
- "eval_accuracy": 1.0,
3
- "eval_loss": 0.026988975703716278
 
 
 
 
4
  }
 
1
  {
2
+ "epoch": 7.12,
3
+ "eval_accuracy": 0.375,
4
+ "eval_loss": 3.811370849609375,
5
+ "eval_runtime": 14.4555,
6
+ "eval_samples_per_second": 1.107,
7
+ "eval_steps_per_second": 0.553
8
  }
trainer_state.json CHANGED
@@ -1,374 +1,539 @@
1
  {
2
- "best_metric": 1.0,
3
- "best_model_checkpoint": "videomae-base-finetuned-ssv2-finetuned-traffic-dataset-mae/checkpoint-168",
4
- "epoch": 7.011160714285714,
5
  "eval_steps": 500,
6
- "global_step": 397,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
- "grad_norm": 14.671374320983887,
14
- "learning_rate": 1.1111111111111112e-05,
15
- "loss": 0.7193,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.04,
20
- "grad_norm": 7.1641387939453125,
21
- "learning_rate": 2.2222222222222223e-05,
22
- "loss": 0.5298,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.07,
27
- "grad_norm": 3.540851354598999,
28
- "learning_rate": 3.3333333333333335e-05,
29
- "loss": 0.3106,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.09,
34
- "grad_norm": 1.9156169891357422,
35
- "learning_rate": 4.4444444444444447e-05,
36
- "loss": 0.8657,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.11,
41
- "grad_norm": 2.8788669109344482,
42
- "learning_rate": 4.937965260545906e-05,
43
- "loss": 0.6144,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.12,
48
- "eval_accuracy": 0.8571428571428571,
49
- "eval_loss": 0.27961549162864685,
50
- "eval_runtime": 24.2905,
51
- "eval_samples_per_second": 0.576,
52
- "eval_steps_per_second": 0.288,
53
- "step": 56
54
- },
55
- {
56
- "epoch": 1.01,
57
- "grad_norm": 16.718944549560547,
58
- "learning_rate": 4.8138957816377175e-05,
59
- "loss": 0.4883,
60
  "step": 60
61
  },
62
  {
63
- "epoch": 1.03,
64
- "grad_norm": 0.7438091039657593,
65
- "learning_rate": 4.689826302729529e-05,
66
- "loss": 0.236,
67
  "step": 70
68
  },
69
  {
70
- "epoch": 1.05,
71
- "grad_norm": 1.806220293045044,
72
- "learning_rate": 4.56575682382134e-05,
73
- "loss": 0.4257,
 
 
 
 
 
 
 
 
 
74
  "step": 80
75
  },
76
  {
77
- "epoch": 1.08,
78
- "grad_norm": 4.333924293518066,
79
- "learning_rate": 4.441687344913151e-05,
80
- "loss": 0.169,
81
  "step": 90
82
  },
83
  {
84
- "epoch": 1.1,
85
- "grad_norm": 3.9030086994171143,
86
- "learning_rate": 4.317617866004963e-05,
87
- "loss": 0.313,
88
  "step": 100
89
  },
90
  {
91
- "epoch": 1.12,
92
- "grad_norm": 0.010061407461762428,
93
- "learning_rate": 4.1935483870967746e-05,
94
- "loss": 0.0637,
95
  "step": 110
96
  },
97
  {
98
- "epoch": 1.12,
99
- "eval_accuracy": 0.9285714285714286,
100
- "eval_loss": 0.11744929850101471,
101
- "eval_runtime": 24.0323,
102
- "eval_samples_per_second": 0.583,
103
- "eval_steps_per_second": 0.291,
104
- "step": 112
105
- },
106
- {
107
- "epoch": 2.02,
108
- "grad_norm": 0.009479613974690437,
109
- "learning_rate": 4.069478908188586e-05,
110
- "loss": 0.2538,
111
  "step": 120
112
  },
113
  {
114
- "epoch": 2.04,
115
- "grad_norm": 0.21475882828235626,
116
- "learning_rate": 3.945409429280397e-05,
117
- "loss": 0.1613,
118
  "step": 130
119
  },
120
  {
121
- "epoch": 2.06,
122
- "grad_norm": 0.05664811283349991,
123
- "learning_rate": 3.8213399503722084e-05,
124
- "loss": 0.5442,
125
  "step": 140
126
  },
127
  {
128
- "epoch": 2.08,
129
- "grad_norm": 1.2033185958862305,
130
- "learning_rate": 3.69727047146402e-05,
131
- "loss": 0.737,
132
  "step": 150
133
  },
134
  {
135
- "epoch": 2.11,
136
- "grad_norm": 0.01298306230455637,
137
- "learning_rate": 3.573200992555831e-05,
138
- "loss": 0.4911,
 
 
 
 
 
 
 
 
 
139
  "step": 160
140
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  {
142
  "epoch": 2.12,
143
- "eval_accuracy": 1.0,
144
- "eval_loss": 0.028704656288027763,
145
- "eval_runtime": 20.2796,
146
- "eval_samples_per_second": 0.69,
147
- "eval_steps_per_second": 0.345,
148
- "step": 168
149
  },
150
  {
151
  "epoch": 3.0,
152
- "grad_norm": 0.12567199766635895,
153
- "learning_rate": 3.449131513647643e-05,
154
- "loss": 0.267,
155
- "step": 170
156
  },
157
  {
158
- "epoch": 3.03,
159
- "grad_norm": 0.3274080753326416,
160
- "learning_rate": 3.325062034739454e-05,
161
- "loss": 0.2982,
162
- "step": 180
 
 
 
 
 
 
 
163
  },
164
  {
165
  "epoch": 3.05,
166
- "grad_norm": 5.183434963226318,
167
- "learning_rate": 3.200992555831266e-05,
168
- "loss": 0.2744,
169
- "step": 190
170
  },
171
  {
172
  "epoch": 3.07,
173
- "grad_norm": 0.20806659758090973,
174
- "learning_rate": 3.0769230769230774e-05,
175
- "loss": 0.0124,
176
- "step": 200
177
  },
178
  {
179
  "epoch": 3.09,
180
- "grad_norm": 0.04709279537200928,
181
- "learning_rate": 2.9528535980148887e-05,
182
- "loss": 0.0025,
183
- "step": 210
 
 
 
 
 
 
 
184
  },
185
  {
186
  "epoch": 3.12,
187
- "grad_norm": 0.0054036942310631275,
188
- "learning_rate": 2.8287841191067e-05,
189
- "loss": 0.3625,
190
- "step": 220
191
  },
192
  {
193
  "epoch": 3.12,
194
- "eval_accuracy": 1.0,
195
- "eval_loss": 0.08213090896606445,
196
- "eval_runtime": 24.8422,
197
- "eval_samples_per_second": 0.564,
198
- "eval_steps_per_second": 0.282,
199
- "step": 224
200
  },
201
  {
202
  "epoch": 4.01,
203
- "grad_norm": 0.09801805019378662,
204
- "learning_rate": 2.7047146401985113e-05,
205
- "loss": 0.0082,
206
- "step": 230
 
 
 
 
 
 
 
207
  },
208
  {
209
  "epoch": 4.04,
210
- "grad_norm": 0.009371934458613396,
211
- "learning_rate": 2.5806451612903226e-05,
212
- "loss": 0.2331,
213
- "step": 240
214
  },
215
  {
216
  "epoch": 4.06,
217
- "grad_norm": 0.33655962347984314,
218
- "learning_rate": 2.4565756823821338e-05,
219
- "loss": 0.1461,
220
- "step": 250
221
  },
222
  {
223
  "epoch": 4.08,
224
- "grad_norm": 0.009524806402623653,
225
- "learning_rate": 2.3325062034739454e-05,
226
- "loss": 0.3265,
227
- "step": 260
228
  },
229
  {
230
- "epoch": 4.1,
231
- "grad_norm": 0.0037162320222705603,
232
- "learning_rate": 2.208436724565757e-05,
233
- "loss": 0.001,
234
- "step": 270
 
 
 
 
 
 
 
235
  },
236
  {
237
  "epoch": 4.12,
238
- "grad_norm": 0.005654833745211363,
239
- "learning_rate": 2.0843672456575683e-05,
240
- "loss": 0.1472,
241
- "step": 280
242
  },
243
  {
244
  "epoch": 4.12,
245
- "eval_accuracy": 0.8571428571428571,
246
- "eval_loss": 0.27267220616340637,
247
- "eval_runtime": 24.1835,
248
- "eval_samples_per_second": 0.579,
249
- "eval_steps_per_second": 0.289,
250
- "step": 280
251
  },
252
  {
253
  "epoch": 5.02,
254
- "grad_norm": 1.0853424072265625,
255
- "learning_rate": 1.9602977667493796e-05,
256
- "loss": 0.0061,
257
- "step": 290
258
  },
259
  {
260
- "epoch": 5.04,
261
- "grad_norm": 0.10893365740776062,
262
- "learning_rate": 1.8362282878411912e-05,
263
- "loss": 0.0329,
264
- "step": 300
 
 
 
 
 
 
 
265
  },
266
  {
267
  "epoch": 5.07,
268
- "grad_norm": 0.003927825018763542,
269
- "learning_rate": 1.7121588089330025e-05,
270
- "loss": 0.0124,
271
- "step": 310
272
  },
273
  {
274
- "epoch": 5.09,
275
- "grad_norm": 0.03731616213917732,
276
- "learning_rate": 1.588089330024814e-05,
277
- "loss": 0.1577,
278
- "step": 320
279
  },
280
  {
281
- "epoch": 5.11,
282
- "grad_norm": 0.027107276022434235,
283
- "learning_rate": 1.4640198511166252e-05,
284
- "loss": 0.0068,
285
- "step": 330
 
 
 
 
 
 
 
286
  },
287
  {
288
  "epoch": 5.12,
289
- "eval_accuracy": 0.7142857142857143,
290
- "eval_loss": 1.524235486984253,
291
- "eval_runtime": 22.5747,
292
- "eval_samples_per_second": 0.62,
293
- "eval_steps_per_second": 0.31,
294
- "step": 336
295
  },
296
  {
297
  "epoch": 6.01,
298
- "grad_norm": 0.015361560508608818,
299
- "learning_rate": 1.3399503722084367e-05,
300
- "loss": 0.0155,
301
- "step": 340
302
  },
303
  {
304
- "epoch": 6.03,
305
- "grad_norm": 0.009546751156449318,
306
- "learning_rate": 1.2158808933002481e-05,
307
- "loss": 0.2251,
308
- "step": 350
309
  },
310
  {
311
- "epoch": 6.05,
312
- "grad_norm": 0.00286727212369442,
313
- "learning_rate": 1.0918114143920596e-05,
314
- "loss": 0.0003,
315
- "step": 360
316
  },
317
  {
318
- "epoch": 6.08,
319
- "grad_norm": 0.006020919419825077,
320
- "learning_rate": 9.67741935483871e-06,
321
- "loss": 0.0016,
322
- "step": 370
323
  },
324
  {
325
- "epoch": 6.1,
326
- "grad_norm": 0.0052633825689554214,
327
- "learning_rate": 8.436724565756825e-06,
328
- "loss": 0.2976,
329
- "step": 380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  },
331
  {
332
  "epoch": 6.12,
333
- "grad_norm": 0.006017320789396763,
334
- "learning_rate": 7.1960297766749385e-06,
335
- "loss": 0.0009,
336
- "step": 390
337
  },
338
  {
339
  "epoch": 6.12,
340
- "eval_accuracy": 0.8571428571428571,
341
- "eval_loss": 0.7878251671791077,
342
- "eval_runtime": 24.9105,
343
- "eval_samples_per_second": 0.562,
344
- "eval_steps_per_second": 0.281,
345
- "step": 392
346
  },
347
  {
348
  "epoch": 7.01,
349
- "eval_accuracy": 1.0,
350
- "eval_loss": 0.026988975703716278,
351
- "eval_runtime": 5.8255,
352
- "eval_samples_per_second": 0.687,
353
- "eval_steps_per_second": 0.343,
354
- "step": 397
355
  },
356
  {
357
- "epoch": 7.01,
358
- "eval_accuracy": 1.0,
359
- "eval_loss": 0.026988975703716278,
360
- "eval_runtime": 5.8152,
361
- "eval_samples_per_second": 0.688,
362
- "eval_steps_per_second": 0.344,
363
- "step": 397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  }
365
  ],
366
  "logging_steps": 10,
367
- "max_steps": 448,
368
  "num_input_tokens_seen": 0,
369
  "num_train_epochs": 9223372036854775807,
370
  "save_steps": 500,
371
- "total_flos": 9.769146381458473e+17,
372
  "train_batch_size": 2,
373
  "trial_name": null,
374
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.927536231884058,
3
+ "best_model_checkpoint": "videomae-base-finetuned-ssv2-finetuned-traffic-dataset-mae/checkpoint-456",
4
+ "epoch": 7.125,
5
  "eval_steps": 500,
6
+ "global_step": 608,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
+ "grad_norm": 10.86192798614502,
14
+ "learning_rate": 8.196721311475409e-06,
15
+ "loss": 0.6065,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.03,
20
+ "grad_norm": 5.969607830047607,
21
+ "learning_rate": 1.6393442622950818e-05,
22
+ "loss": 0.5224,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.05,
27
+ "grad_norm": 7.2913336753845215,
28
+ "learning_rate": 2.459016393442623e-05,
29
+ "loss": 0.5223,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.07,
34
+ "grad_norm": 1.6693388223648071,
35
+ "learning_rate": 3.2786885245901635e-05,
36
+ "loss": 0.1888,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.08,
41
+ "grad_norm": 1.9622797966003418,
42
+ "learning_rate": 4.098360655737705e-05,
43
+ "loss": 0.2521,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.1,
48
+ "grad_norm": 1.1212589740753174,
49
+ "learning_rate": 4.918032786885246e-05,
50
+ "loss": 0.0138,
 
 
 
 
 
 
 
 
 
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.12,
55
+ "grad_norm": 0.07918030023574829,
56
+ "learning_rate": 4.917733089579525e-05,
57
+ "loss": 0.0309,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.12,
62
+ "eval_accuracy": 0.9130434782608695,
63
+ "eval_loss": 0.30077555775642395,
64
+ "eval_runtime": 121.2516,
65
+ "eval_samples_per_second": 0.569,
66
+ "eval_steps_per_second": 0.289,
67
+ "step": 76
68
+ },
69
+ {
70
+ "epoch": 1.01,
71
+ "grad_norm": 0.09372598677873611,
72
+ "learning_rate": 4.826325411334552e-05,
73
+ "loss": 0.0042,
74
  "step": 80
75
  },
76
  {
77
+ "epoch": 1.02,
78
+ "grad_norm": 0.016796614974737167,
79
+ "learning_rate": 4.73491773308958e-05,
80
+ "loss": 0.0007,
81
  "step": 90
82
  },
83
  {
84
+ "epoch": 1.04,
85
+ "grad_norm": 0.020755194127559662,
86
+ "learning_rate": 4.643510054844607e-05,
87
+ "loss": 0.3019,
88
  "step": 100
89
  },
90
  {
91
+ "epoch": 1.06,
92
+ "grad_norm": 0.011669596657156944,
93
+ "learning_rate": 4.5521023765996346e-05,
94
+ "loss": 0.0003,
95
  "step": 110
96
  },
97
  {
98
+ "epoch": 1.07,
99
+ "grad_norm": 0.003766011679545045,
100
+ "learning_rate": 4.460694698354662e-05,
101
+ "loss": 0.0002,
 
 
 
 
 
 
 
 
 
102
  "step": 120
103
  },
104
  {
105
+ "epoch": 1.09,
106
+ "grad_norm": 0.08908724784851074,
107
+ "learning_rate": 4.369287020109689e-05,
108
+ "loss": 0.0002,
109
  "step": 130
110
  },
111
  {
112
+ "epoch": 1.11,
113
+ "grad_norm": 0.005499520804733038,
114
+ "learning_rate": 4.2778793418647164e-05,
115
+ "loss": 0.0002,
116
  "step": 140
117
  },
118
  {
119
+ "epoch": 1.12,
120
+ "grad_norm": 0.005470686126500368,
121
+ "learning_rate": 4.1864716636197444e-05,
122
+ "loss": 0.0002,
123
  "step": 150
124
  },
125
  {
126
+ "epoch": 1.12,
127
+ "eval_accuracy": 0.6666666666666666,
128
+ "eval_loss": 2.1030023097991943,
129
+ "eval_runtime": 100.6515,
130
+ "eval_samples_per_second": 0.686,
131
+ "eval_steps_per_second": 0.348,
132
+ "step": 152
133
+ },
134
+ {
135
+ "epoch": 2.01,
136
+ "grad_norm": 0.0026209524367004633,
137
+ "learning_rate": 4.095063985374772e-05,
138
+ "loss": 0.0001,
139
  "step": 160
140
  },
141
+ {
142
+ "epoch": 2.03,
143
+ "grad_norm": 0.009242965839803219,
144
+ "learning_rate": 4.003656307129799e-05,
145
+ "loss": 0.0001,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 2.05,
150
+ "grad_norm": 0.0026453358586877584,
151
+ "learning_rate": 3.912248628884826e-05,
152
+ "loss": 0.0001,
153
+ "step": 180
154
+ },
155
+ {
156
+ "epoch": 2.06,
157
+ "grad_norm": 0.0033517158590257168,
158
+ "learning_rate": 3.820840950639854e-05,
159
+ "loss": 0.0001,
160
+ "step": 190
161
+ },
162
+ {
163
+ "epoch": 2.08,
164
+ "grad_norm": 0.003627925645560026,
165
+ "learning_rate": 3.7294332723948815e-05,
166
+ "loss": 0.0001,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 2.1,
171
+ "grad_norm": 0.002879067324101925,
172
+ "learning_rate": 3.638025594149909e-05,
173
+ "loss": 0.0001,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 2.11,
178
+ "grad_norm": 0.0036734691821038723,
179
+ "learning_rate": 3.546617915904936e-05,
180
+ "loss": 0.0001,
181
+ "step": 220
182
+ },
183
  {
184
  "epoch": 2.12,
185
+ "eval_accuracy": 0.7101449275362319,
186
+ "eval_loss": 1.84578275680542,
187
+ "eval_runtime": 105.6436,
188
+ "eval_samples_per_second": 0.653,
189
+ "eval_steps_per_second": 0.331,
190
+ "step": 228
191
  },
192
  {
193
  "epoch": 3.0,
194
+ "grad_norm": 0.0011968453181907535,
195
+ "learning_rate": 3.455210237659964e-05,
196
+ "loss": 0.0001,
197
+ "step": 230
198
  },
199
  {
200
+ "epoch": 3.02,
201
+ "grad_norm": 0.0024677019100636244,
202
+ "learning_rate": 3.3638025594149906e-05,
203
+ "loss": 0.0001,
204
+ "step": 240
205
+ },
206
+ {
207
+ "epoch": 3.04,
208
+ "grad_norm": 0.0011525729205459356,
209
+ "learning_rate": 3.2723948811700186e-05,
210
+ "loss": 0.0001,
211
+ "step": 250
212
  },
213
  {
214
  "epoch": 3.05,
215
+ "grad_norm": 0.003028567647561431,
216
+ "learning_rate": 3.180987202925046e-05,
217
+ "loss": 0.0001,
218
+ "step": 260
219
  },
220
  {
221
  "epoch": 3.07,
222
+ "grad_norm": 0.0027707908302545547,
223
+ "learning_rate": 3.089579524680074e-05,
224
+ "loss": 0.0001,
225
+ "step": 270
226
  },
227
  {
228
  "epoch": 3.09,
229
+ "grad_norm": 0.0017000396037474275,
230
+ "learning_rate": 2.9981718464351004e-05,
231
+ "loss": 0.0001,
232
+ "step": 280
233
+ },
234
+ {
235
+ "epoch": 3.1,
236
+ "grad_norm": 0.002608270151540637,
237
+ "learning_rate": 2.906764168190128e-05,
238
+ "loss": 0.0001,
239
+ "step": 290
240
  },
241
  {
242
  "epoch": 3.12,
243
+ "grad_norm": 0.0012394236400723457,
244
+ "learning_rate": 2.8153564899451557e-05,
245
+ "loss": 0.0,
246
+ "step": 300
247
  },
248
  {
249
  "epoch": 3.12,
250
+ "eval_accuracy": 0.7391304347826086,
251
+ "eval_loss": 1.5200165510177612,
252
+ "eval_runtime": 117.7366,
253
+ "eval_samples_per_second": 0.586,
254
+ "eval_steps_per_second": 0.297,
255
+ "step": 304
256
  },
257
  {
258
  "epoch": 4.01,
259
+ "grad_norm": 0.00229161255992949,
260
+ "learning_rate": 2.7239488117001826e-05,
261
+ "loss": 0.0001,
262
+ "step": 310
263
+ },
264
+ {
265
+ "epoch": 4.03,
266
+ "grad_norm": 0.0013108043931424618,
267
+ "learning_rate": 2.6325411334552102e-05,
268
+ "loss": 0.0001,
269
+ "step": 320
270
  },
271
  {
272
  "epoch": 4.04,
273
+ "grad_norm": 0.0012365940492600203,
274
+ "learning_rate": 2.541133455210238e-05,
275
+ "loss": 0.0,
276
+ "step": 330
277
  },
278
  {
279
  "epoch": 4.06,
280
+ "grad_norm": 0.002082814695313573,
281
+ "learning_rate": 2.449725776965265e-05,
282
+ "loss": 0.0001,
283
+ "step": 340
284
  },
285
  {
286
  "epoch": 4.08,
287
+ "grad_norm": 0.0035416895989328623,
288
+ "learning_rate": 2.3583180987202927e-05,
289
+ "loss": 0.0001,
290
+ "step": 350
291
  },
292
  {
293
+ "epoch": 4.09,
294
+ "grad_norm": 0.0009142422350123525,
295
+ "learning_rate": 2.26691042047532e-05,
296
+ "loss": 0.0001,
297
+ "step": 360
298
+ },
299
+ {
300
+ "epoch": 4.11,
301
+ "grad_norm": 0.0014322904171422124,
302
+ "learning_rate": 2.1755027422303476e-05,
303
+ "loss": 0.0001,
304
+ "step": 370
305
  },
306
  {
307
  "epoch": 4.12,
308
+ "grad_norm": 0.0017242878675460815,
309
+ "learning_rate": 2.084095063985375e-05,
310
+ "loss": 0.0001,
311
+ "step": 380
312
  },
313
  {
314
  "epoch": 4.12,
315
+ "eval_accuracy": 0.7536231884057971,
316
+ "eval_loss": 1.4568772315979004,
317
+ "eval_runtime": 106.8623,
318
+ "eval_samples_per_second": 0.646,
319
+ "eval_steps_per_second": 0.328,
320
+ "step": 380
321
  },
322
  {
323
  "epoch": 5.02,
324
+ "grad_norm": 0.0011396174086257815,
325
+ "learning_rate": 1.9926873857404025e-05,
326
+ "loss": 0.0001,
327
+ "step": 390
328
  },
329
  {
330
+ "epoch": 5.03,
331
+ "grad_norm": 0.0009178342879749835,
332
+ "learning_rate": 1.90127970749543e-05,
333
+ "loss": 0.0,
334
+ "step": 400
335
+ },
336
+ {
337
+ "epoch": 5.05,
338
+ "grad_norm": 0.0034428227227181196,
339
+ "learning_rate": 1.809872029250457e-05,
340
+ "loss": 0.0,
341
+ "step": 410
342
  },
343
  {
344
  "epoch": 5.07,
345
+ "grad_norm": 0.0011987154139205813,
346
+ "learning_rate": 1.7184643510054847e-05,
347
+ "loss": 0.0,
348
+ "step": 420
349
  },
350
  {
351
+ "epoch": 5.08,
352
+ "grad_norm": 0.000779572525061667,
353
+ "learning_rate": 1.627056672760512e-05,
354
+ "loss": 0.0,
355
+ "step": 430
356
  },
357
  {
358
+ "epoch": 5.1,
359
+ "grad_norm": 0.0013000366743654013,
360
+ "learning_rate": 1.5356489945155393e-05,
361
+ "loss": 0.0003,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 5.12,
366
+ "grad_norm": 0.0019034247379750013,
367
+ "learning_rate": 1.4442413162705667e-05,
368
+ "loss": 0.0,
369
+ "step": 450
370
  },
371
  {
372
  "epoch": 5.12,
373
+ "eval_accuracy": 0.927536231884058,
374
+ "eval_loss": 0.39408108592033386,
375
+ "eval_runtime": 103.5083,
376
+ "eval_samples_per_second": 0.667,
377
+ "eval_steps_per_second": 0.338,
378
+ "step": 456
379
  },
380
  {
381
  "epoch": 6.01,
382
+ "grad_norm": 0.0016774114919826388,
383
+ "learning_rate": 1.3528336380255944e-05,
384
+ "loss": 0.0005,
385
+ "step": 460
386
  },
387
  {
388
+ "epoch": 6.02,
389
+ "grad_norm": 0.0013074069283902645,
390
+ "learning_rate": 1.2614259597806216e-05,
391
+ "loss": 0.0001,
392
+ "step": 470
393
  },
394
  {
395
+ "epoch": 6.04,
396
+ "grad_norm": 0.0015878825215622783,
397
+ "learning_rate": 1.1700182815356491e-05,
398
+ "loss": 0.0001,
399
+ "step": 480
400
  },
401
  {
402
+ "epoch": 6.06,
403
+ "grad_norm": 0.0008045001304708421,
404
+ "learning_rate": 1.0786106032906766e-05,
405
+ "loss": 0.0001,
406
+ "step": 490
407
  },
408
  {
409
+ "epoch": 6.07,
410
+ "grad_norm": 0.0012260900111868978,
411
+ "learning_rate": 9.872029250457038e-06,
412
+ "loss": 0.0002,
413
+ "step": 500
414
+ },
415
+ {
416
+ "epoch": 6.09,
417
+ "grad_norm": 0.001507502980530262,
418
+ "learning_rate": 8.957952468007313e-06,
419
+ "loss": 0.0001,
420
+ "step": 510
421
+ },
422
+ {
423
+ "epoch": 6.11,
424
+ "grad_norm": 0.001074848580174148,
425
+ "learning_rate": 8.043875685557587e-06,
426
+ "loss": 0.0001,
427
+ "step": 520
428
  },
429
  {
430
  "epoch": 6.12,
431
+ "grad_norm": 0.0012807522434741259,
432
+ "learning_rate": 7.129798903107861e-06,
433
+ "loss": 0.0001,
434
+ "step": 530
435
  },
436
  {
437
  "epoch": 6.12,
438
+ "eval_accuracy": 0.8695652173913043,
439
+ "eval_loss": 0.9657596945762634,
440
+ "eval_runtime": 105.1521,
441
+ "eval_samples_per_second": 0.656,
442
+ "eval_steps_per_second": 0.333,
443
+ "step": 532
444
  },
445
  {
446
  "epoch": 7.01,
447
+ "grad_norm": 0.0008385140681639314,
448
+ "learning_rate": 6.2157221206581355e-06,
449
+ "loss": 0.0,
450
+ "step": 540
 
 
451
  },
452
  {
453
+ "epoch": 7.03,
454
+ "grad_norm": 0.001427665469236672,
455
+ "learning_rate": 5.301645338208409e-06,
456
+ "loss": 0.0,
457
+ "step": 550
458
+ },
459
+ {
460
+ "epoch": 7.05,
461
+ "grad_norm": 0.001054079388268292,
462
+ "learning_rate": 4.387568555758684e-06,
463
+ "loss": 0.0001,
464
+ "step": 560
465
+ },
466
+ {
467
+ "epoch": 7.06,
468
+ "grad_norm": 0.0016839707968756557,
469
+ "learning_rate": 3.4734917733089582e-06,
470
+ "loss": 0.0001,
471
+ "step": 570
472
+ },
473
+ {
474
+ "epoch": 7.08,
475
+ "grad_norm": 0.0011065697763115168,
476
+ "learning_rate": 2.5594149908592323e-06,
477
+ "loss": 0.0,
478
+ "step": 580
479
+ },
480
+ {
481
+ "epoch": 7.1,
482
+ "grad_norm": 0.0027244570665061474,
483
+ "learning_rate": 1.6453382084095064e-06,
484
+ "loss": 0.0,
485
+ "step": 590
486
+ },
487
+ {
488
+ "epoch": 7.11,
489
+ "grad_norm": 0.002320781582966447,
490
+ "learning_rate": 7.312614259597807e-07,
491
+ "loss": 0.0001,
492
+ "step": 600
493
+ },
494
+ {
495
+ "epoch": 7.12,
496
+ "eval_accuracy": 0.8405797101449275,
497
+ "eval_loss": 0.983595609664917,
498
+ "eval_runtime": 105.1841,
499
+ "eval_samples_per_second": 0.656,
500
+ "eval_steps_per_second": 0.333,
501
+ "step": 608
502
+ },
503
+ {
504
+ "epoch": 7.12,
505
+ "step": 608,
506
+ "total_flos": 1.5152145407976407e+18,
507
+ "train_loss": 0.040272084387700156,
508
+ "train_runtime": 2933.2373,
509
+ "train_samples_per_second": 0.415,
510
+ "train_steps_per_second": 0.207
511
+ },
512
+ {
513
+ "epoch": 7.12,
514
+ "eval_accuracy": 0.375,
515
+ "eval_loss": 3.811370849609375,
516
+ "eval_runtime": 15.0567,
517
+ "eval_samples_per_second": 1.063,
518
+ "eval_steps_per_second": 0.531,
519
+ "step": 608
520
+ },
521
+ {
522
+ "epoch": 7.12,
523
+ "eval_accuracy": 0.375,
524
+ "eval_loss": 3.811370849609375,
525
+ "eval_runtime": 14.4555,
526
+ "eval_samples_per_second": 1.107,
527
+ "eval_steps_per_second": 0.553,
528
+ "step": 608
529
  }
530
  ],
531
  "logging_steps": 10,
532
+ "max_steps": 608,
533
  "num_input_tokens_seen": 0,
534
  "num_train_epochs": 9223372036854775807,
535
  "save_steps": 500,
536
+ "total_flos": 1.5152145407976407e+18,
537
  "train_batch_size": 2,
538
  "trial_name": null,
539
  "trial_params": null