bimabk commited on
Commit
6868ebb
·
verified ·
1 Parent(s): 252f52f

Upload task output 1

Browse files
Files changed (4) hide show
  1. loss.txt +1 -1
  2. model.safetensors +1 -1
  3. trainer_state.json +396 -185
  4. training_args.bin +1 -1
loss.txt CHANGED
@@ -1 +1 @@
1
- 292,0.00010653198114596307
 
1
+ 438,0.00994242262095213
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db049db56e23cb51c422a8f629d2c1a3077b5b3c714d1116de01ccfd8591f159
3
  size 2200119864
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37a84856799837fe6d91a0c0f26fae5a0ad6c183ae5bd17edf654d66771d55a6
3
  size 2200119864
trainer_state.json CHANGED
@@ -2,434 +2,645 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.9863945578231292,
6
  "eval_steps": 500,
7
- "global_step": 292,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.034013605442176874,
14
- "grad_norm": 34.75,
15
- "learning_rate": 6.430830146224649e-06,
16
- "loss": 0.6785,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 0.06802721088435375,
21
- "grad_norm": 110.0,
22
- "learning_rate": 1.4469367829005459e-05,
23
- "loss": 0.15,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.10204081632653061,
28
- "grad_norm": 0.00640869140625,
29
- "learning_rate": 2.2507905511786274e-05,
30
- "loss": 0.0071,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 0.1360544217687075,
35
- "grad_norm": 0.0135498046875,
36
- "learning_rate": 3.054644319456708e-05,
37
- "loss": 0.0692,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 0.17006802721088435,
42
- "grad_norm": 0.15234375,
43
- "learning_rate": 3.858498087734789e-05,
44
- "loss": 0.0616,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.20408163265306123,
49
- "grad_norm": 0.003021240234375,
50
- "learning_rate": 4.6623518560128706e-05,
51
- "loss": 0.0001,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 0.23809523809523808,
56
- "grad_norm": 28.875,
57
- "learning_rate": 5.466205624290951e-05,
58
- "loss": 0.0322,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 0.272108843537415,
63
- "grad_norm": 17.75,
64
- "learning_rate": 5.6259657079961125e-05,
65
- "loss": 0.0862,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 0.30612244897959184,
70
- "grad_norm": 0.283203125,
71
- "learning_rate": 5.621861520545433e-05,
72
- "loss": 0.0157,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 0.3401360544217687,
77
- "grad_norm": 0.0096435546875,
78
- "learning_rate": 5.614606786714469e-05,
79
- "loss": 0.0441,
80
  "step": 50
81
  },
82
  {
83
  "epoch": 0.3741496598639456,
84
- "grad_norm": 0.271484375,
85
- "learning_rate": 5.604212364632182e-05,
86
- "loss": 0.0507,
87
  "step": 55
88
  },
89
  {
90
  "epoch": 0.40816326530612246,
91
- "grad_norm": 0.02734375,
92
- "learning_rate": 5.590693811585455e-05,
93
- "loss": 0.0012,
94
  "step": 60
95
  },
96
  {
97
  "epoch": 0.4421768707482993,
98
- "grad_norm": 0.189453125,
99
- "learning_rate": 5.5740713607345624e-05,
100
- "loss": 0.0152,
101
  "step": 65
102
  },
103
  {
104
  "epoch": 0.47619047619047616,
105
- "grad_norm": 0.05078125,
106
- "learning_rate": 5.5543698908302936e-05,
107
- "loss": 0.0634,
108
  "step": 70
109
  },
110
  {
111
  "epoch": 0.5102040816326531,
112
- "grad_norm": 42.5,
113
- "learning_rate": 5.531618888978025e-05,
114
- "loss": 0.0458,
115
  "step": 75
116
  },
117
  {
118
  "epoch": 0.54421768707483,
119
- "grad_norm": 17.25,
120
- "learning_rate": 5.505852406504504e-05,
121
- "loss": 0.0562,
122
  "step": 80
123
  },
124
  {
125
  "epoch": 0.5782312925170068,
126
- "grad_norm": 10.3125,
127
- "learning_rate": 5.47710900799337e-05,
128
- "loss": 0.1128,
129
  "step": 85
130
  },
131
  {
132
  "epoch": 0.6122448979591837,
133
- "grad_norm": 17.125,
134
- "learning_rate": 5.445431713565707e-05,
135
- "loss": 0.0146,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 0.6462585034013606,
140
- "grad_norm": 42.0,
141
- "learning_rate": 5.4108679344920166e-05,
142
- "loss": 0.1167,
143
  "step": 95
144
  },
145
  {
146
  "epoch": 0.6802721088435374,
147
- "grad_norm": 0.0869140625,
148
- "learning_rate": 5.373469402231971e-05,
149
- "loss": 0.0209,
150
  "step": 100
151
  },
152
  {
153
  "epoch": 0.7142857142857143,
154
- "grad_norm": 0.8359375,
155
- "learning_rate": 5.333292091008159e-05,
156
- "loss": 0.0019,
157
  "step": 105
158
  },
159
  {
160
  "epoch": 0.7482993197278912,
161
- "grad_norm": 0.07373046875,
162
- "learning_rate": 5.290396134029709e-05,
163
- "loss": 0.0157,
164
  "step": 110
165
  },
166
  {
167
  "epoch": 0.782312925170068,
168
- "grad_norm": 0.2578125,
169
- "learning_rate": 5.244845733491173e-05,
170
- "loss": 0.0051,
171
  "step": 115
172
  },
173
  {
174
  "epoch": 0.8163265306122449,
175
- "grad_norm": 67.5,
176
- "learning_rate": 5.196709064481376e-05,
177
- "loss": 0.0181,
178
  "step": 120
179
  },
180
  {
181
  "epoch": 0.8503401360544217,
182
- "grad_norm": 0.0478515625,
183
- "learning_rate": 5.146058172946055e-05,
184
- "loss": 0.1658,
185
  "step": 125
186
  },
187
  {
188
  "epoch": 0.8843537414965986,
189
- "grad_norm": 3.46875,
190
- "learning_rate": 5.092968867857002e-05,
191
- "loss": 0.0155,
192
  "step": 130
193
  },
194
  {
195
  "epoch": 0.9183673469387755,
196
- "grad_norm": 0.040283203125,
197
- "learning_rate": 5.037520607749101e-05,
198
- "loss": 0.0025,
199
  "step": 135
200
  },
201
  {
202
  "epoch": 0.9523809523809523,
203
- "grad_norm": 0.80859375,
204
- "learning_rate": 4.979796381795077e-05,
205
- "loss": 0.0006,
206
  "step": 140
207
  },
208
  {
209
  "epoch": 0.9863945578231292,
210
- "grad_norm": 0.0189208984375,
211
- "learning_rate": 4.919882585595965e-05,
212
- "loss": 0.0738,
213
  "step": 145
214
  },
215
  {
216
  "epoch": 0.9931972789115646,
217
- "eval_loss": 0.05052933469414711,
218
- "eval_runtime": 0.8602,
219
- "eval_samples_per_second": 30.225,
220
- "eval_steps_per_second": 30.225,
221
  "step": 146
222
  },
223
  {
224
  "epoch": 1.0204081632653061,
225
- "grad_norm": 25.0,
226
- "learning_rate": 4.8578688918731776e-05,
227
- "loss": 0.0782,
228
  "step": 150
229
  },
230
  {
231
  "epoch": 1.054421768707483,
232
- "grad_norm": 0.08349609375,
233
- "learning_rate": 4.7938481162557315e-05,
234
- "loss": 0.0015,
235
  "step": 155
236
  },
237
  {
238
  "epoch": 1.08843537414966,
239
- "grad_norm": 0.0025177001953125,
240
- "learning_rate": 4.727916078363493e-05,
241
- "loss": 0.0082,
242
  "step": 160
243
  },
244
  {
245
  "epoch": 1.1224489795918366,
246
- "grad_norm": 0.03515625,
247
- "learning_rate": 4.6601714583943683e-05,
248
- "loss": 0.0,
249
  "step": 165
250
  },
251
  {
252
  "epoch": 1.1564625850340136,
253
- "grad_norm": 0.0101318359375,
254
- "learning_rate": 4.590715649430069e-05,
255
- "loss": 0.0031,
256
  "step": 170
257
  },
258
  {
259
  "epoch": 1.1904761904761905,
260
- "grad_norm": 0.01348876953125,
261
- "learning_rate": 4.519652605681526e-05,
262
- "loss": 0.0001,
263
  "step": 175
264
  },
265
  {
266
  "epoch": 1.2244897959183674,
267
- "grad_norm": 0.0238037109375,
268
- "learning_rate": 4.447088686901071e-05,
269
- "loss": 0.0119,
270
  "step": 180
271
  },
272
  {
273
  "epoch": 1.2585034013605443,
274
- "grad_norm": 2.515625,
275
- "learning_rate": 4.373132499194244e-05,
276
- "loss": 0.0016,
277
  "step": 185
278
  },
279
  {
280
  "epoch": 1.2925170068027212,
281
- "grad_norm": 0.059814453125,
282
- "learning_rate": 4.2978947324695116e-05,
283
- "loss": 0.0283,
284
  "step": 190
285
  },
286
  {
287
  "epoch": 1.3265306122448979,
288
- "grad_norm": 9.1875,
289
- "learning_rate": 4.221487994769145e-05,
290
- "loss": 0.0045,
291
  "step": 195
292
  },
293
  {
294
  "epoch": 1.3605442176870748,
295
- "grad_norm": 0.015625,
296
- "learning_rate": 4.144026643729263e-05,
297
- "loss": 0.005,
298
  "step": 200
299
  },
300
  {
301
  "epoch": 1.3945578231292517,
302
- "grad_norm": 0.0034637451171875,
303
- "learning_rate": 4.06562661542123e-05,
304
- "loss": 0.0001,
305
  "step": 205
306
  },
307
  {
308
  "epoch": 1.4285714285714286,
309
- "grad_norm": 0.016845703125,
310
- "learning_rate": 3.986405250830651e-05,
311
- "loss": 0.0369,
312
  "step": 210
313
  },
314
  {
315
  "epoch": 1.4625850340136055,
316
- "grad_norm": 0.13671875,
317
- "learning_rate": 3.9064811202336157e-05,
318
- "loss": 0.0003,
319
  "step": 215
320
  },
321
  {
322
  "epoch": 1.4965986394557822,
323
- "grad_norm": 5.03125,
324
- "learning_rate": 3.8259738457330774e-05,
325
- "loss": 0.0025,
326
  "step": 220
327
  },
328
  {
329
  "epoch": 1.5306122448979593,
330
- "grad_norm": 0.796875,
331
- "learning_rate": 3.7450039222209636e-05,
332
- "loss": 0.0006,
333
  "step": 225
334
  },
335
  {
336
  "epoch": 1.564625850340136,
337
- "grad_norm": 0.025146484375,
338
- "learning_rate": 3.663692537033991e-05,
339
- "loss": 0.0018,
340
  "step": 230
341
  },
342
  {
343
  "epoch": 1.598639455782313,
344
- "grad_norm": 0.057861328125,
345
- "learning_rate": 3.5821613885731e-05,
346
- "loss": 0.0072,
347
  "step": 235
348
  },
349
  {
350
  "epoch": 1.6326530612244898,
351
- "grad_norm": 0.02783203125,
352
- "learning_rate": 3.500532504157975e-05,
353
- "loss": 0.0003,
354
  "step": 240
355
  },
356
  {
357
  "epoch": 1.6666666666666665,
358
- "grad_norm": 0.037841796875,
359
- "learning_rate": 3.418928057389288e-05,
360
- "loss": 0.0002,
361
  "step": 245
362
  },
363
  {
364
  "epoch": 1.7006802721088436,
365
- "grad_norm": 0.01104736328125,
366
- "learning_rate": 3.337470185291987e-05,
367
- "loss": 0.0002,
368
  "step": 250
369
  },
370
  {
371
  "epoch": 1.7346938775510203,
372
- "grad_norm": 0.044677734375,
373
- "learning_rate": 3.25628080551334e-05,
374
- "loss": 0.0156,
375
  "step": 255
376
  },
377
  {
378
  "epoch": 1.7687074829931972,
379
- "grad_norm": 0.0037841796875,
380
- "learning_rate": 3.175481433849315e-05,
381
- "loss": 0.0002,
382
  "step": 260
383
  },
384
  {
385
  "epoch": 1.8027210884353742,
386
- "grad_norm": 0.376953125,
387
- "learning_rate": 3.0951930023724105e-05,
388
- "loss": 0.001,
389
  "step": 265
390
  },
391
  {
392
  "epoch": 1.836734693877551,
393
- "grad_norm": 0.00799560546875,
394
- "learning_rate": 3.015535678433143e-05,
395
- "loss": 0.0,
396
  "step": 270
397
  },
398
  {
399
  "epoch": 1.870748299319728,
400
- "grad_norm": 0.00244140625,
401
- "learning_rate": 2.936628684806084e-05,
402
- "loss": 0.0001,
403
  "step": 275
404
  },
405
  {
406
  "epoch": 1.9047619047619047,
407
- "grad_norm": 0.00170135498046875,
408
- "learning_rate": 2.858590121249654e-05,
409
- "loss": 0.0,
410
  "step": 280
411
  },
412
  {
413
  "epoch": 1.9387755102040818,
414
- "grad_norm": 0.00762939453125,
415
- "learning_rate": 2.7815367877467164e-05,
416
- "loss": 0.0974,
417
  "step": 285
418
  },
419
  {
420
  "epoch": 1.9727891156462585,
421
- "grad_norm": 0.002166748046875,
422
- "learning_rate": 2.7055840096905433e-05,
423
- "loss": 0.0278,
424
  "step": 290
425
  },
426
  {
427
  "epoch": 1.9863945578231292,
428
- "eval_loss": 0.00010653198114596307,
429
- "eval_runtime": 0.7953,
430
- "eval_samples_per_second": 32.69,
431
- "eval_steps_per_second": 32.69,
432
  "step": 292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  }
434
  ],
435
  "logging_steps": 5,
@@ -449,7 +660,7 @@
449
  "attributes": {}
450
  }
451
  },
452
- "total_flos": 1.1110365064593408e+16,
453
  "train_batch_size": 3,
454
  "trial_name": null,
455
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.979591836734694,
6
  "eval_steps": 500,
7
+ "global_step": 438,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.034013605442176874,
14
+ "grad_norm": 28.75,
15
+ "learning_rate": 1.2405214087609793e-05,
16
+ "loss": 3.4643,
17
  "step": 5
18
  },
19
  {
20
  "epoch": 0.06802721088435375,
21
+ "grad_norm": 38.25,
22
+ "learning_rate": 2.791173169712203e-05,
23
+ "loss": 4.2336,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.10204081632653061,
28
+ "grad_norm": 32.0,
29
+ "learning_rate": 4.341824930663428e-05,
30
+ "loss": 2.0587,
31
  "step": 15
32
  },
33
  {
34
  "epoch": 0.1360544217687075,
35
+ "grad_norm": 0.84765625,
36
+ "learning_rate": 5.8924766916146515e-05,
37
+ "loss": 1.0773,
38
  "step": 20
39
  },
40
  {
41
  "epoch": 0.17006802721088435,
42
+ "grad_norm": 42.75,
43
+ "learning_rate": 7.443128452565876e-05,
44
+ "loss": 0.6339,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.20408163265306123,
49
+ "grad_norm": 0.1826171875,
50
+ "learning_rate": 8.9937802135171e-05,
51
+ "loss": 0.3862,
52
  "step": 30
53
  },
54
  {
55
  "epoch": 0.23809523809523808,
56
+ "grad_norm": 72.5,
57
+ "learning_rate": 0.00010544431974468325,
58
+ "loss": 2.4785,
59
  "step": 35
60
  },
61
  {
62
  "epoch": 0.272108843537415,
63
+ "grad_norm": 35.75,
64
+ "learning_rate": 0.0001085248274824069,
65
+ "loss": 1.4148,
66
  "step": 40
67
  },
68
  {
69
  "epoch": 0.30612244897959184,
70
+ "grad_norm": 74.5,
71
+ "learning_rate": 0.0001084403787497468,
72
+ "loss": 0.8793,
73
  "step": 45
74
  },
75
  {
76
  "epoch": 0.3401360544217687,
77
+ "grad_norm": 35.5,
78
+ "learning_rate": 0.00010829110362886751,
79
+ "loss": 0.4985,
80
  "step": 50
81
  },
82
  {
83
  "epoch": 0.3741496598639456,
84
+ "grad_norm": 2.328125,
85
+ "learning_rate": 0.00010807722553920126,
86
+ "loss": 0.382,
87
  "step": 55
88
  },
89
  {
90
  "epoch": 0.40816326530612246,
91
+ "grad_norm": 17.5,
92
+ "learning_rate": 0.00010779906459116616,
93
+ "loss": 0.6253,
94
  "step": 60
95
  },
96
  {
97
  "epoch": 0.4421768707482993,
98
+ "grad_norm": 16.375,
99
+ "learning_rate": 0.00010745703710705828,
100
+ "loss": 0.3236,
101
  "step": 65
102
  },
103
  {
104
  "epoch": 0.47619047619047616,
105
+ "grad_norm": 55.75,
106
+ "learning_rate": 0.00010705165499794393,
107
+ "loss": 0.3981,
108
  "step": 70
109
  },
110
  {
111
  "epoch": 0.5102040816326531,
112
+ "grad_norm": 25.625,
113
+ "learning_rate": 0.00010658352499748452,
114
+ "loss": 0.6889,
115
  "step": 75
116
  },
117
  {
118
  "epoch": 0.54421768707483,
119
+ "grad_norm": 235.0,
120
+ "learning_rate": 0.00010605334775384088,
121
+ "loss": 1.3471,
122
  "step": 80
123
  },
124
  {
125
  "epoch": 0.5782312925170068,
126
+ "grad_norm": 6.34375,
127
+ "learning_rate": 0.00010546191678101621,
128
+ "loss": 0.7067,
129
  "step": 85
130
  },
131
  {
132
  "epoch": 0.6122448979591837,
133
+ "grad_norm": 33.0,
134
+ "learning_rate": 0.00010481011727120708,
135
+ "loss": 0.2755,
136
  "step": 90
137
  },
138
  {
139
  "epoch": 0.6462585034013606,
140
+ "grad_norm": 29.875,
141
+ "learning_rate": 0.00010409892476994003,
142
+ "loss": 0.376,
143
  "step": 95
144
  },
145
  {
146
  "epoch": 0.6802721088435374,
147
+ "grad_norm": 20.5,
148
+ "learning_rate": 0.0001033294037159768,
149
+ "loss": 0.5199,
150
  "step": 100
151
  },
152
  {
153
  "epoch": 0.7142857142857143,
154
+ "grad_norm": 9.0625,
155
+ "learning_rate": 0.00010250270584817341,
156
+ "loss": 0.4744,
157
  "step": 105
158
  },
159
  {
160
  "epoch": 0.7482993197278912,
161
+ "grad_norm": 49.25,
162
+ "learning_rate": 0.0001016200684816775,
163
+ "loss": 0.3467,
164
  "step": 110
165
  },
166
  {
167
  "epoch": 0.782312925170068,
168
+ "grad_norm": 54.75,
169
+ "learning_rate": 0.00010068281265604425,
170
+ "loss": 0.4323,
171
  "step": 115
172
  },
173
  {
174
  "epoch": 0.8163265306122449,
175
+ "grad_norm": 25.75,
176
+ "learning_rate": 9.969234115804185e-05,
177
+ "loss": 0.5117,
178
  "step": 120
179
  },
180
  {
181
  "epoch": 0.8503401360544217,
182
+ "grad_norm": 27.375,
183
+ "learning_rate": 9.865013642210685e-05,
184
+ "loss": 0.3461,
185
  "step": 125
186
  },
187
  {
188
  "epoch": 0.8843537414965986,
189
+ "grad_norm": 27.0,
190
+ "learning_rate": 9.755775831159075e-05,
191
+ "loss": 0.3838,
192
  "step": 130
193
  },
194
  {
195
  "epoch": 0.9183673469387755,
196
+ "grad_norm": 22.0,
197
+ "learning_rate": 9.641684178411933e-05,
198
+ "loss": 0.3031,
199
  "step": 135
200
  },
201
  {
202
  "epoch": 0.9523809523809523,
203
+ "grad_norm": 15.25,
204
+ "learning_rate": 9.522909444455842e-05,
205
+ "loss": 0.4872,
206
  "step": 140
207
  },
208
  {
209
  "epoch": 0.9863945578231292,
210
+ "grad_norm": 8.375,
211
+ "learning_rate": 9.399629398924927e-05,
212
+ "loss": 0.2626,
213
  "step": 145
214
  },
215
  {
216
  "epoch": 0.9931972789115646,
217
+ "eval_loss": 0.21089103817939758,
218
+ "eval_runtime": 1.2585,
219
+ "eval_samples_per_second": 20.66,
220
+ "eval_steps_per_second": 20.66,
221
  "step": 146
222
  },
223
  {
224
  "epoch": 1.0204081632653061,
225
+ "grad_norm": 23.75,
226
+ "learning_rate": 9.272028554533782e-05,
227
+ "loss": 0.1617,
228
  "step": 150
229
  },
230
  {
231
  "epoch": 1.054421768707483,
232
+ "grad_norm": 3.421875,
233
+ "learning_rate": 9.140297890918105e-05,
234
+ "loss": 0.2647,
235
  "step": 155
236
  },
237
  {
238
  "epoch": 1.08843537414966,
239
+ "grad_norm": 7.5625,
240
+ "learning_rate": 9.004634568796285e-05,
241
+ "loss": 0.4115,
242
  "step": 160
243
  },
244
  {
245
  "epoch": 1.1224489795918366,
246
+ "grad_norm": 14.1875,
247
+ "learning_rate": 8.865241634879804e-05,
248
+ "loss": 0.2492,
249
  "step": 165
250
  },
251
  {
252
  "epoch": 1.1564625850340136,
253
+ "grad_norm": 9.1875,
254
+ "learning_rate": 8.722327717974095e-05,
255
+ "loss": 0.3327,
256
  "step": 170
257
  },
258
  {
259
  "epoch": 1.1904761904761905,
260
+ "grad_norm": 9.4375,
261
+ "learning_rate": 8.57610671672471e-05,
262
+ "loss": 0.4109,
263
  "step": 175
264
  },
265
  {
266
  "epoch": 1.2244897959183674,
267
+ "grad_norm": 20.25,
268
+ "learning_rate": 8.426797479476129e-05,
269
+ "loss": 0.4426,
270
  "step": 180
271
  },
272
  {
273
  "epoch": 1.2585034013605443,
274
+ "grad_norm": 66.0,
275
+ "learning_rate": 8.27462347672239e-05,
276
+ "loss": 0.3577,
277
  "step": 185
278
  },
279
  {
280
  "epoch": 1.2925170068027212,
281
+ "grad_norm": 4.75,
282
+ "learning_rate": 8.119812466639757e-05,
283
+ "loss": 0.201,
284
  "step": 190
285
  },
286
  {
287
  "epoch": 1.3265306122448979,
288
+ "grad_norm": 5.84375,
289
+ "learning_rate": 7.962596154202029e-05,
290
+ "loss": 0.1453,
291
  "step": 195
292
  },
293
  {
294
  "epoch": 1.3605442176870748,
295
+ "grad_norm": 8.4375,
296
+ "learning_rate": 7.80320984438872e-05,
297
+ "loss": 0.072,
298
  "step": 200
299
  },
300
  {
301
  "epoch": 1.3945578231292517,
302
+ "grad_norm": 10.75,
303
+ "learning_rate": 7.641892090005088e-05,
304
+ "loss": 0.0673,
305
  "step": 205
306
  },
307
  {
308
  "epoch": 1.4285714285714286,
309
+ "grad_norm": 20.5,
310
+ "learning_rate": 7.478884334641178e-05,
311
+ "loss": 0.2137,
312
  "step": 210
313
  },
314
  {
315
  "epoch": 1.4625850340136055,
316
+ "grad_norm": 13.875,
317
+ "learning_rate": 7.314430551304253e-05,
318
+ "loss": 0.1786,
319
  "step": 215
320
  },
321
  {
322
  "epoch": 1.4965986394557822,
323
+ "grad_norm": 7.65625,
324
+ "learning_rate": 7.148776877265426e-05,
325
+ "loss": 0.0821,
326
  "step": 220
327
  },
328
  {
329
  "epoch": 1.5306122448979593,
330
+ "grad_norm": 16.25,
331
+ "learning_rate": 6.982171245667071e-05,
332
+ "loss": 0.1904,
333
  "step": 225
334
  },
335
  {
336
  "epoch": 1.564625850340136,
337
+ "grad_norm": 4.46875,
338
+ "learning_rate": 6.81486301444235e-05,
339
+ "loss": 0.0449,
340
  "step": 230
341
  },
342
  {
343
  "epoch": 1.598639455782313,
344
+ "grad_norm": 14.5625,
345
+ "learning_rate": 6.64710259310227e-05,
346
+ "loss": 0.1493,
347
  "step": 235
348
  },
349
  {
350
  "epoch": 1.6326530612244898,
351
+ "grad_norm": 25.75,
352
+ "learning_rate": 6.479141067948843e-05,
353
+ "loss": 0.1877,
354
  "step": 240
355
  },
356
  {
357
  "epoch": 1.6666666666666665,
358
+ "grad_norm": 12.75,
359
+ "learning_rate": 6.311229826275292e-05,
360
+ "loss": 0.1844,
361
  "step": 245
362
  },
363
  {
364
  "epoch": 1.7006802721088436,
365
+ "grad_norm": 13.5,
366
+ "learning_rate": 6.143620180115768e-05,
367
+ "loss": 0.2923,
368
  "step": 250
369
  },
370
  {
371
  "epoch": 1.7346938775510203,
372
+ "grad_norm": 2.15625,
373
+ "learning_rate": 5.9765629901077215e-05,
374
+ "loss": 0.059,
375
  "step": 255
376
  },
377
  {
378
  "epoch": 1.7687074829931972,
379
+ "grad_norm": 0.26953125,
380
+ "learning_rate": 5.8103082900298425e-05,
381
+ "loss": 0.0561,
382
  "step": 260
383
  },
384
  {
385
  "epoch": 1.8027210884353742,
386
+ "grad_norm": 8.875,
387
+ "learning_rate": 5.645104912577601e-05,
388
+ "loss": 0.1103,
389
  "step": 265
390
  },
391
  {
392
  "epoch": 1.836734693877551,
393
+ "grad_norm": 8.4375,
394
+ "learning_rate": 5.481200116936402e-05,
395
+ "loss": 0.1003,
396
  "step": 270
397
  },
398
  {
399
  "epoch": 1.870748299319728,
400
+ "grad_norm": 6.0,
401
+ "learning_rate": 5.31883921870983e-05,
402
+ "loss": 0.0788,
403
  "step": 275
404
  },
405
  {
406
  "epoch": 1.9047619047619047,
407
+ "grad_norm": 1.6171875,
408
+ "learning_rate": 5.158265222756847e-05,
409
+ "loss": 0.0819,
410
  "step": 280
411
  },
412
  {
413
  "epoch": 1.9387755102040818,
414
+ "grad_norm": 14.1875,
415
+ "learning_rate": 4.999718459487458e-05,
416
+ "loss": 0.0352,
417
  "step": 285
418
  },
419
  {
420
  "epoch": 1.9727891156462585,
421
+ "grad_norm": 0.60546875,
422
+ "learning_rate": 4.843436225161211e-05,
423
+ "loss": 0.1354,
424
  "step": 290
425
  },
426
  {
427
  "epoch": 1.9863945578231292,
428
+ "eval_loss": 0.17950406670570374,
429
+ "eval_runtime": 1.156,
430
+ "eval_samples_per_second": 22.491,
431
+ "eval_steps_per_second": 22.491,
432
  "step": 292
433
+ },
434
+ {
435
+ "epoch": 2.006802721088435,
436
+ "grad_norm": 1.0703125,
437
+ "learning_rate": 4.689652426726917e-05,
438
+ "loss": 0.0795,
439
+ "step": 295
440
+ },
441
+ {
442
+ "epoch": 2.0408163265306123,
443
+ "grad_norm": 0.0264892578125,
444
+ "learning_rate": 4.5385972317351206e-05,
445
+ "loss": 0.1034,
446
+ "step": 300
447
+ },
448
+ {
449
+ "epoch": 2.074829931972789,
450
+ "grad_norm": 0.060791015625,
451
+ "learning_rate": 4.3904967238473124e-05,
452
+ "loss": 0.0467,
453
+ "step": 305
454
+ },
455
+ {
456
+ "epoch": 2.108843537414966,
457
+ "grad_norm": 4.5625,
458
+ "learning_rate": 4.2455725644574884e-05,
459
+ "loss": 0.0525,
460
+ "step": 310
461
+ },
462
+ {
463
+ "epoch": 2.142857142857143,
464
+ "grad_norm": 0.08984375,
465
+ "learning_rate": 4.1040416609324844e-05,
466
+ "loss": 0.0122,
467
+ "step": 315
468
+ },
469
+ {
470
+ "epoch": 2.17687074829932,
471
+ "grad_norm": 17.25,
472
+ "learning_rate": 3.966115841967671e-05,
473
+ "loss": 0.1311,
474
+ "step": 320
475
+ },
476
+ {
477
+ "epoch": 2.2108843537414966,
478
+ "grad_norm": 3.390625,
479
+ "learning_rate": 3.832001540543833e-05,
480
+ "loss": 0.1173,
481
+ "step": 325
482
+ },
483
+ {
484
+ "epoch": 2.2448979591836733,
485
+ "grad_norm": 4.59375,
486
+ "learning_rate": 3.701899484959829e-05,
487
+ "loss": 0.0816,
488
+ "step": 330
489
+ },
490
+ {
491
+ "epoch": 2.2789115646258504,
492
+ "grad_norm": 0.06591796875,
493
+ "learning_rate": 3.5760043984034015e-05,
494
+ "loss": 0.0235,
495
+ "step": 335
496
+ },
497
+ {
498
+ "epoch": 2.312925170068027,
499
+ "grad_norm": 12.375,
500
+ "learning_rate": 3.454504707509821e-05,
501
+ "loss": 0.0807,
502
+ "step": 340
503
+ },
504
+ {
505
+ "epoch": 2.3469387755102042,
506
+ "grad_norm": 0.016357421875,
507
+ "learning_rate": 3.337582260344549e-05,
508
+ "loss": 0.0603,
509
+ "step": 345
510
+ },
511
+ {
512
+ "epoch": 2.380952380952381,
513
+ "grad_norm": 0.06884765625,
514
+ "learning_rate": 3.225412054232022e-05,
515
+ "loss": 0.0107,
516
+ "step": 350
517
+ },
518
+ {
519
+ "epoch": 2.4149659863945576,
520
+ "grad_norm": 0.07080078125,
521
+ "learning_rate": 3.118161973837903e-05,
522
+ "loss": 0.003,
523
+ "step": 355
524
+ },
525
+ {
526
+ "epoch": 2.4489795918367347,
527
+ "grad_norm": 0.55078125,
528
+ "learning_rate": 3.0159925398968314e-05,
529
+ "loss": 0.0248,
530
+ "step": 360
531
+ },
532
+ {
533
+ "epoch": 2.4829931972789114,
534
+ "grad_norm": 1.015625,
535
+ "learning_rate": 2.9190566689617188e-05,
536
+ "loss": 0.0651,
537
+ "step": 365
538
+ },
539
+ {
540
+ "epoch": 2.5170068027210886,
541
+ "grad_norm": 0.0218505859375,
542
+ "learning_rate": 2.8274994445342093e-05,
543
+ "loss": 0.0021,
544
+ "step": 370
545
+ },
546
+ {
547
+ "epoch": 2.5510204081632653,
548
+ "grad_norm": 0.037353515625,
549
+ "learning_rate": 2.741457899918822e-05,
550
+ "loss": 0.0111,
551
+ "step": 375
552
+ },
553
+ {
554
+ "epoch": 2.5850340136054424,
555
+ "grad_norm": 54.75,
556
+ "learning_rate": 2.6610608131257937e-05,
557
+ "loss": 0.0358,
558
+ "step": 380
559
+ },
560
+ {
561
+ "epoch": 2.619047619047619,
562
+ "grad_norm": 0.078125,
563
+ "learning_rate": 2.5864285141295854e-05,
564
+ "loss": 0.0004,
565
+ "step": 385
566
+ },
567
+ {
568
+ "epoch": 2.6530612244897958,
569
+ "grad_norm": 0.005767822265625,
570
+ "learning_rate": 2.517672704771522e-05,
571
+ "loss": 0.0003,
572
+ "step": 390
573
+ },
574
+ {
575
+ "epoch": 2.687074829931973,
576
+ "grad_norm": 2.015625,
577
+ "learning_rate": 2.4548962915761334e-05,
578
+ "loss": 0.0006,
579
+ "step": 395
580
+ },
581
+ {
582
+ "epoch": 2.7210884353741496,
583
+ "grad_norm": 13.875,
584
+ "learning_rate": 2.3981932317313933e-05,
585
+ "loss": 0.0117,
586
+ "step": 400
587
+ },
588
+ {
589
+ "epoch": 2.7551020408163263,
590
+ "grad_norm": 0.2216796875,
591
+ "learning_rate": 2.347648392463406e-05,
592
+ "loss": 0.0074,
593
+ "step": 405
594
+ },
595
+ {
596
+ "epoch": 2.7891156462585034,
597
+ "grad_norm": 11.4375,
598
+ "learning_rate": 2.303337424015989e-05,
599
+ "loss": 0.0291,
600
+ "step": 410
601
+ },
602
+ {
603
+ "epoch": 2.8231292517006805,
604
+ "grad_norm": 18.0,
605
+ "learning_rate": 2.2653266464252818e-05,
606
+ "loss": 0.0245,
607
+ "step": 415
608
+ },
609
+ {
610
+ "epoch": 2.857142857142857,
611
+ "grad_norm": 7.25,
612
+ "learning_rate": 2.2336729502588305e-05,
613
+ "loss": 0.0082,
614
+ "step": 420
615
+ },
616
+ {
617
+ "epoch": 2.891156462585034,
618
+ "grad_norm": 0.09716796875,
619
+ "learning_rate": 2.2084237114677194e-05,
620
+ "loss": 0.0224,
621
+ "step": 425
622
+ },
623
+ {
624
+ "epoch": 2.925170068027211,
625
+ "grad_norm": 0.012451171875,
626
+ "learning_rate": 2.18961672047919e-05,
627
+ "loss": 0.0002,
628
+ "step": 430
629
+ },
630
+ {
631
+ "epoch": 2.9591836734693877,
632
+ "grad_norm": 0.0341796875,
633
+ "learning_rate": 2.1772801256358705e-05,
634
+ "loss": 0.0299,
635
+ "step": 435
636
+ },
637
+ {
638
+ "epoch": 2.979591836734694,
639
+ "eval_loss": 0.00994242262095213,
640
+ "eval_runtime": 1.1603,
641
+ "eval_samples_per_second": 22.408,
642
+ "eval_steps_per_second": 22.408,
643
+ "step": 438
644
  }
645
  ],
646
  "logging_steps": 5,
 
660
  "attributes": {}
661
  }
662
  },
663
+ "total_flos": 3.330567101743104e+16,
664
  "train_batch_size": 3,
665
  "trial_name": null,
666
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aede922fcb038c2b2f294f933050cf01eb46c9c0de6a69807b9087c912c03c36
3
  size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73e50ede3fd7e17a3aef1da9b06d58c3cc6693ace5908dc3a74142804df978c3
3
  size 5560