Danda245 commited on
Commit
ef6a502
·
verified ·
1 Parent(s): 391488e

Upload 14 files

Browse files
config.json CHANGED
@@ -30,7 +30,7 @@
30
  "summary_type": "cls_index",
31
  "summary_use_proj": true,
32
  "torch_dtype": "float32",
33
- "transformers_version": "4.46.2",
34
  "use_cache": true,
35
  "vocab_size": 50001
36
  }
 
30
  "summary_type": "cls_index",
31
  "summary_use_proj": true,
32
  "torch_dtype": "float32",
33
+ "transformers_version": "4.46.3",
34
  "use_cache": true,
35
  "vocab_size": 50001
36
  }
generation_config.json CHANGED
@@ -3,5 +3,5 @@
3
  "bos_token_id": 0,
4
  "eos_token_id": 2,
5
  "pad_token_id": 1,
6
- "transformers_version": "4.46.2"
7
  }
 
3
  "bos_token_id": 0,
4
  "eos_token_id": 2,
5
  "pad_token_id": 1,
6
+ "transformers_version": "4.46.3"
7
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4fbd0264a1186d4402829ff55ed166f8af3a1ed09115b76d65554ad76a681a27
3
  size 496987776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8360d44d5ce66341561f0d021dee5e6ec8b81a9d0ac51100ec01062377c05d0
3
  size 496987776
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a409b4c21409b3f5f3bea9fd21e0022032e9104ee78dc52fce32cd4bf8c57408
3
+ size 994069434
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f76bfe8f06baf38f8182595ee79f69c0e5bbee8d3bad925e5bf08c9df044d87
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9381fb001e35cd6e41636b5024ecdcb43506de9707280bc0ad45e4a9c2e3995a
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 1000,
6
+ "global_step": 31479,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.047650814828933574,
13
+ "grad_norm": 2.0100979804992676,
14
+ "learning_rate": 0.00019682327901140442,
15
+ "loss": 2.3566,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.09530162965786715,
20
+ "grad_norm": 1.877261996269226,
21
+ "learning_rate": 0.00019364655802280888,
22
+ "loss": 2.2178,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.09530162965786715,
27
+ "eval_loss": 2.3788223266601562,
28
+ "eval_runtime": 80.302,
29
+ "eval_samples_per_second": 173.271,
30
+ "eval_steps_per_second": 7.223,
31
+ "step": 1000
32
+ },
33
+ {
34
+ "epoch": 0.14295244448680072,
35
+ "grad_norm": 1.7388309240341187,
36
+ "learning_rate": 0.00019046983703421329,
37
+ "loss": 2.1744,
38
+ "step": 1500
39
+ },
40
+ {
41
+ "epoch": 0.1906032593157343,
42
+ "grad_norm": 1.8366143703460693,
43
+ "learning_rate": 0.00018729311604561772,
44
+ "loss": 2.163,
45
+ "step": 2000
46
+ },
47
+ {
48
+ "epoch": 0.1906032593157343,
49
+ "eval_loss": 2.3654611110687256,
50
+ "eval_runtime": 80.3504,
51
+ "eval_samples_per_second": 173.166,
52
+ "eval_steps_per_second": 7.218,
53
+ "step": 2000
54
+ },
55
+ {
56
+ "epoch": 0.23825407414466787,
57
+ "grad_norm": 1.6628751754760742,
58
+ "learning_rate": 0.00018411639505702213,
59
+ "loss": 2.1515,
60
+ "step": 2500
61
+ },
62
+ {
63
+ "epoch": 0.28590488897360145,
64
+ "grad_norm": 1.6291817426681519,
65
+ "learning_rate": 0.0001809396740684266,
66
+ "loss": 2.1196,
67
+ "step": 3000
68
+ },
69
+ {
70
+ "epoch": 0.28590488897360145,
71
+ "eval_loss": 2.3521649837493896,
72
+ "eval_runtime": 80.224,
73
+ "eval_samples_per_second": 173.439,
74
+ "eval_steps_per_second": 7.23,
75
+ "step": 3000
76
+ },
77
+ {
78
+ "epoch": 0.333555703802535,
79
+ "grad_norm": 1.7604336738586426,
80
+ "learning_rate": 0.000177762953079831,
81
+ "loss": 2.1074,
82
+ "step": 3500
83
+ },
84
+ {
85
+ "epoch": 0.3812065186314686,
86
+ "grad_norm": 1.34886634349823,
87
+ "learning_rate": 0.00017458623209123543,
88
+ "loss": 2.0926,
89
+ "step": 4000
90
+ },
91
+ {
92
+ "epoch": 0.3812065186314686,
93
+ "eval_loss": 2.3420486450195312,
94
+ "eval_runtime": 80.1942,
95
+ "eval_samples_per_second": 173.504,
96
+ "eval_steps_per_second": 7.232,
97
+ "step": 4000
98
+ },
99
+ {
100
+ "epoch": 0.42885733346040217,
101
+ "grad_norm": 1.3510360717773438,
102
+ "learning_rate": 0.00017140951110263986,
103
+ "loss": 2.074,
104
+ "step": 4500
105
+ },
106
+ {
107
+ "epoch": 0.47650814828933574,
108
+ "grad_norm": 1.272275447845459,
109
+ "learning_rate": 0.0001682327901140443,
110
+ "loss": 2.0752,
111
+ "step": 5000
112
+ },
113
+ {
114
+ "epoch": 0.47650814828933574,
115
+ "eval_loss": 2.3270885944366455,
116
+ "eval_runtime": 80.2619,
117
+ "eval_samples_per_second": 173.358,
118
+ "eval_steps_per_second": 7.226,
119
+ "step": 5000
120
+ },
121
+ {
122
+ "epoch": 0.5241589631182694,
123
+ "grad_norm": 1.289753794670105,
124
+ "learning_rate": 0.0001650560691254487,
125
+ "loss": 2.0487,
126
+ "step": 5500
127
+ },
128
+ {
129
+ "epoch": 0.5718097779472029,
130
+ "grad_norm": 1.1615971326828003,
131
+ "learning_rate": 0.00016187934813685314,
132
+ "loss": 2.0437,
133
+ "step": 6000
134
+ },
135
+ {
136
+ "epoch": 0.5718097779472029,
137
+ "eval_loss": 2.3274528980255127,
138
+ "eval_runtime": 80.2214,
139
+ "eval_samples_per_second": 173.445,
140
+ "eval_steps_per_second": 7.23,
141
+ "step": 6000
142
+ },
143
+ {
144
+ "epoch": 0.6194605927761365,
145
+ "grad_norm": 1.3484673500061035,
146
+ "learning_rate": 0.00015870262714825757,
147
+ "loss": 2.0134,
148
+ "step": 6500
149
+ },
150
+ {
151
+ "epoch": 0.66711140760507,
152
+ "grad_norm": 1.4737777709960938,
153
+ "learning_rate": 0.000155525906159662,
154
+ "loss": 2.0379,
155
+ "step": 7000
156
+ },
157
+ {
158
+ "epoch": 0.66711140760507,
159
+ "eval_loss": 2.3164169788360596,
160
+ "eval_runtime": 80.2177,
161
+ "eval_samples_per_second": 173.453,
162
+ "eval_steps_per_second": 7.23,
163
+ "step": 7000
164
+ },
165
+ {
166
+ "epoch": 0.7147622224340037,
167
+ "grad_norm": 1.1502068042755127,
168
+ "learning_rate": 0.00015234918517106642,
169
+ "loss": 1.9916,
170
+ "step": 7500
171
+ },
172
+ {
173
+ "epoch": 0.7624130372629372,
174
+ "grad_norm": 1.2299320697784424,
175
+ "learning_rate": 0.00014917246418247085,
176
+ "loss": 2.0068,
177
+ "step": 8000
178
+ },
179
+ {
180
+ "epoch": 0.7624130372629372,
181
+ "eval_loss": 2.311408042907715,
182
+ "eval_runtime": 80.2576,
183
+ "eval_samples_per_second": 173.367,
184
+ "eval_steps_per_second": 7.227,
185
+ "step": 8000
186
+ },
187
+ {
188
+ "epoch": 0.8100638520918708,
189
+ "grad_norm": 1.2537345886230469,
190
+ "learning_rate": 0.00014599574319387528,
191
+ "loss": 1.9886,
192
+ "step": 8500
193
+ },
194
+ {
195
+ "epoch": 0.8577146669208043,
196
+ "grad_norm": 1.0486429929733276,
197
+ "learning_rate": 0.00014281902220527972,
198
+ "loss": 1.9882,
199
+ "step": 9000
200
+ },
201
+ {
202
+ "epoch": 0.8577146669208043,
203
+ "eval_loss": 2.304290294647217,
204
+ "eval_runtime": 80.1372,
205
+ "eval_samples_per_second": 173.627,
206
+ "eval_steps_per_second": 7.238,
207
+ "step": 9000
208
+ },
209
+ {
210
+ "epoch": 0.905365481749738,
211
+ "grad_norm": 1.1815516948699951,
212
+ "learning_rate": 0.00013964230121668413,
213
+ "loss": 1.9732,
214
+ "step": 9500
215
+ },
216
+ {
217
+ "epoch": 0.9530162965786715,
218
+ "grad_norm": 1.2301689386367798,
219
+ "learning_rate": 0.0001364655802280886,
220
+ "loss": 1.9787,
221
+ "step": 10000
222
+ },
223
+ {
224
+ "epoch": 0.9530162965786715,
225
+ "eval_loss": 2.2939772605895996,
226
+ "eval_runtime": 80.1592,
227
+ "eval_samples_per_second": 173.579,
228
+ "eval_steps_per_second": 7.236,
229
+ "step": 10000
230
+ },
231
+ {
232
+ "epoch": 1.0006671114076051,
233
+ "grad_norm": 1.497831106185913,
234
+ "learning_rate": 0.000133288859239493,
235
+ "loss": 1.9557,
236
+ "step": 10500
237
+ },
238
+ {
239
+ "epoch": 1.0483179262365387,
240
+ "grad_norm": 1.3323341608047485,
241
+ "learning_rate": 0.00013011213825089743,
242
+ "loss": 1.7231,
243
+ "step": 11000
244
+ },
245
+ {
246
+ "epoch": 1.0483179262365387,
247
+ "eval_loss": 2.313231945037842,
248
+ "eval_runtime": 80.1199,
249
+ "eval_samples_per_second": 173.665,
250
+ "eval_steps_per_second": 7.239,
251
+ "step": 11000
252
+ },
253
+ {
254
+ "epoch": 1.0959687410654722,
255
+ "grad_norm": 1.8000659942626953,
256
+ "learning_rate": 0.00012693541726230184,
257
+ "loss": 1.714,
258
+ "step": 11500
259
+ },
260
+ {
261
+ "epoch": 1.1436195558944058,
262
+ "grad_norm": 1.2369180917739868,
263
+ "learning_rate": 0.0001237586962737063,
264
+ "loss": 1.7114,
265
+ "step": 12000
266
+ },
267
+ {
268
+ "epoch": 1.1436195558944058,
269
+ "eval_loss": 2.313917875289917,
270
+ "eval_runtime": 80.1492,
271
+ "eval_samples_per_second": 173.601,
272
+ "eval_steps_per_second": 7.237,
273
+ "step": 12000
274
+ },
275
+ {
276
+ "epoch": 1.1912703707233394,
277
+ "grad_norm": 1.431038498878479,
278
+ "learning_rate": 0.0001205819752851107,
279
+ "loss": 1.7283,
280
+ "step": 12500
281
+ },
282
+ {
283
+ "epoch": 1.238921185552273,
284
+ "grad_norm": 1.4570106267929077,
285
+ "learning_rate": 0.00011740525429651514,
286
+ "loss": 1.7033,
287
+ "step": 13000
288
+ },
289
+ {
290
+ "epoch": 1.238921185552273,
291
+ "eval_loss": 2.310853958129883,
292
+ "eval_runtime": 80.0945,
293
+ "eval_samples_per_second": 173.72,
294
+ "eval_steps_per_second": 7.241,
295
+ "step": 13000
296
+ },
297
+ {
298
+ "epoch": 1.2865720003812064,
299
+ "grad_norm": 1.557187795639038,
300
+ "learning_rate": 0.00011422853330791956,
301
+ "loss": 1.7289,
302
+ "step": 13500
303
+ },
304
+ {
305
+ "epoch": 1.33422281521014,
306
+ "grad_norm": 1.5775034427642822,
307
+ "learning_rate": 0.000111051812319324,
308
+ "loss": 1.7151,
309
+ "step": 14000
310
+ },
311
+ {
312
+ "epoch": 1.33422281521014,
313
+ "eval_loss": 2.300920009613037,
314
+ "eval_runtime": 80.1537,
315
+ "eval_samples_per_second": 173.591,
316
+ "eval_steps_per_second": 7.236,
317
+ "step": 14000
318
+ },
319
+ {
320
+ "epoch": 1.3818736300390737,
321
+ "grad_norm": 1.2451566457748413,
322
+ "learning_rate": 0.00010787509133072841,
323
+ "loss": 1.7218,
324
+ "step": 14500
325
+ },
326
+ {
327
+ "epoch": 1.4295244448680071,
328
+ "grad_norm": 1.650688886642456,
329
+ "learning_rate": 0.00010469837034213286,
330
+ "loss": 1.7202,
331
+ "step": 15000
332
+ },
333
+ {
334
+ "epoch": 1.4295244448680071,
335
+ "eval_loss": 2.290478467941284,
336
+ "eval_runtime": 80.1852,
337
+ "eval_samples_per_second": 173.523,
338
+ "eval_steps_per_second": 7.233,
339
+ "step": 15000
340
+ },
341
+ {
342
+ "epoch": 1.4771752596969407,
343
+ "grad_norm": 1.4705020189285278,
344
+ "learning_rate": 0.00010152164935353727,
345
+ "loss": 1.721,
346
+ "step": 15500
347
+ },
348
+ {
349
+ "epoch": 1.5248260745258744,
350
+ "grad_norm": 1.530394434928894,
351
+ "learning_rate": 9.834492836494172e-05,
352
+ "loss": 1.7261,
353
+ "step": 16000
354
+ },
355
+ {
356
+ "epoch": 1.5248260745258744,
357
+ "eval_loss": 2.2944624423980713,
358
+ "eval_runtime": 80.1122,
359
+ "eval_samples_per_second": 173.682,
360
+ "eval_steps_per_second": 7.24,
361
+ "step": 16000
362
+ },
363
+ {
364
+ "epoch": 1.572476889354808,
365
+ "grad_norm": 1.667024850845337,
366
+ "learning_rate": 9.516820737634614e-05,
367
+ "loss": 1.7072,
368
+ "step": 16500
369
+ },
370
+ {
371
+ "epoch": 1.6201277041837416,
372
+ "grad_norm": 1.4624521732330322,
373
+ "learning_rate": 9.199148638775057e-05,
374
+ "loss": 1.7091,
375
+ "step": 17000
376
+ },
377
+ {
378
+ "epoch": 1.6201277041837416,
379
+ "eval_loss": 2.2861549854278564,
380
+ "eval_runtime": 80.0947,
381
+ "eval_samples_per_second": 173.719,
382
+ "eval_steps_per_second": 7.241,
383
+ "step": 17000
384
+ },
385
+ {
386
+ "epoch": 1.6677785190126753,
387
+ "grad_norm": 1.7141919136047363,
388
+ "learning_rate": 8.881476539915499e-05,
389
+ "loss": 1.7281,
390
+ "step": 17500
391
+ },
392
+ {
393
+ "epoch": 1.7154293338416087,
394
+ "grad_norm": 1.367767333984375,
395
+ "learning_rate": 8.563804441055943e-05,
396
+ "loss": 1.7098,
397
+ "step": 18000
398
+ },
399
+ {
400
+ "epoch": 1.7154293338416087,
401
+ "eval_loss": 2.2811758518218994,
402
+ "eval_runtime": 80.1424,
403
+ "eval_samples_per_second": 173.616,
404
+ "eval_steps_per_second": 7.237,
405
+ "step": 18000
406
+ },
407
+ {
408
+ "epoch": 1.7630801486705423,
409
+ "grad_norm": 1.530991792678833,
410
+ "learning_rate": 8.246132342196385e-05,
411
+ "loss": 1.6994,
412
+ "step": 18500
413
+ },
414
+ {
415
+ "epoch": 1.8107309634994757,
416
+ "grad_norm": 1.4421322345733643,
417
+ "learning_rate": 7.928460243336828e-05,
418
+ "loss": 1.6943,
419
+ "step": 19000
420
+ },
421
+ {
422
+ "epoch": 1.8107309634994757,
423
+ "eval_loss": 2.273425579071045,
424
+ "eval_runtime": 80.1385,
425
+ "eval_samples_per_second": 173.624,
426
+ "eval_steps_per_second": 7.237,
427
+ "step": 19000
428
+ },
429
+ {
430
+ "epoch": 1.8583817783284093,
431
+ "grad_norm": 1.5695687532424927,
432
+ "learning_rate": 7.610788144477272e-05,
433
+ "loss": 1.7,
434
+ "step": 19500
435
+ },
436
+ {
437
+ "epoch": 1.906032593157343,
438
+ "grad_norm": 1.6507039070129395,
439
+ "learning_rate": 7.293116045617714e-05,
440
+ "loss": 1.7035,
441
+ "step": 20000
442
+ },
443
+ {
444
+ "epoch": 1.906032593157343,
445
+ "eval_loss": 2.266268730163574,
446
+ "eval_runtime": 80.1631,
447
+ "eval_samples_per_second": 173.571,
448
+ "eval_steps_per_second": 7.235,
449
+ "step": 20000
450
+ },
451
+ {
452
+ "epoch": 1.9536834079862766,
453
+ "grad_norm": 1.41545832157135,
454
+ "learning_rate": 6.975443946758157e-05,
455
+ "loss": 1.6948,
456
+ "step": 20500
457
+ },
458
+ {
459
+ "epoch": 2.0013342228152102,
460
+ "grad_norm": 1.3855451345443726,
461
+ "learning_rate": 6.657771847898599e-05,
462
+ "loss": 1.6776,
463
+ "step": 21000
464
+ },
465
+ {
466
+ "epoch": 2.0013342228152102,
467
+ "eval_loss": 2.302978515625,
468
+ "eval_runtime": 80.1675,
469
+ "eval_samples_per_second": 173.562,
470
+ "eval_steps_per_second": 7.235,
471
+ "step": 21000
472
+ },
473
+ {
474
+ "epoch": 2.048985037644144,
475
+ "grad_norm": 1.3997050523757935,
476
+ "learning_rate": 6.340099749039043e-05,
477
+ "loss": 1.438,
478
+ "step": 21500
479
+ },
480
+ {
481
+ "epoch": 2.0966358524730775,
482
+ "grad_norm": 1.4828859567642212,
483
+ "learning_rate": 6.0224276501794854e-05,
484
+ "loss": 1.4406,
485
+ "step": 22000
486
+ },
487
+ {
488
+ "epoch": 2.0966358524730775,
489
+ "eval_loss": 2.3172175884246826,
490
+ "eval_runtime": 80.1748,
491
+ "eval_samples_per_second": 173.546,
492
+ "eval_steps_per_second": 7.234,
493
+ "step": 22000
494
+ },
495
+ {
496
+ "epoch": 2.1442866673020107,
497
+ "grad_norm": 1.8176885843276978,
498
+ "learning_rate": 5.704755551319928e-05,
499
+ "loss": 1.4555,
500
+ "step": 22500
501
+ },
502
+ {
503
+ "epoch": 2.1919374821309443,
504
+ "grad_norm": 1.48106050491333,
505
+ "learning_rate": 5.387083452460371e-05,
506
+ "loss": 1.4659,
507
+ "step": 23000
508
+ },
509
+ {
510
+ "epoch": 2.1919374821309443,
511
+ "eval_loss": 2.3182783126831055,
512
+ "eval_runtime": 80.2101,
513
+ "eval_samples_per_second": 173.47,
514
+ "eval_steps_per_second": 7.231,
515
+ "step": 23000
516
+ },
517
+ {
518
+ "epoch": 2.239588296959878,
519
+ "grad_norm": 1.6957001686096191,
520
+ "learning_rate": 5.0694113536008136e-05,
521
+ "loss": 1.448,
522
+ "step": 23500
523
+ },
524
+ {
525
+ "epoch": 2.2872391117888116,
526
+ "grad_norm": 1.3845641613006592,
527
+ "learning_rate": 4.7517392547412564e-05,
528
+ "loss": 1.4608,
529
+ "step": 24000
530
+ },
531
+ {
532
+ "epoch": 2.2872391117888116,
533
+ "eval_loss": 2.318488836288452,
534
+ "eval_runtime": 80.1689,
535
+ "eval_samples_per_second": 173.559,
536
+ "eval_steps_per_second": 7.235,
537
+ "step": 24000
538
+ },
539
+ {
540
+ "epoch": 2.334889926617745,
541
+ "grad_norm": 1.9913188219070435,
542
+ "learning_rate": 4.434067155881699e-05,
543
+ "loss": 1.439,
544
+ "step": 24500
545
+ },
546
+ {
547
+ "epoch": 2.382540741446679,
548
+ "grad_norm": 1.8244202136993408,
549
+ "learning_rate": 4.116395057022142e-05,
550
+ "loss": 1.4423,
551
+ "step": 25000
552
+ },
553
+ {
554
+ "epoch": 2.382540741446679,
555
+ "eval_loss": 2.3121349811553955,
556
+ "eval_runtime": 80.1537,
557
+ "eval_samples_per_second": 173.591,
558
+ "eval_steps_per_second": 7.236,
559
+ "step": 25000
560
+ },
561
+ {
562
+ "epoch": 2.4301915562756125,
563
+ "grad_norm": 1.347023606300354,
564
+ "learning_rate": 3.7987229581625846e-05,
565
+ "loss": 1.4506,
566
+ "step": 25500
567
+ },
568
+ {
569
+ "epoch": 2.477842371104546,
570
+ "grad_norm": 1.49163019657135,
571
+ "learning_rate": 3.481050859303028e-05,
572
+ "loss": 1.4378,
573
+ "step": 26000
574
+ },
575
+ {
576
+ "epoch": 2.477842371104546,
577
+ "eval_loss": 2.3090391159057617,
578
+ "eval_runtime": 80.1708,
579
+ "eval_samples_per_second": 173.554,
580
+ "eval_steps_per_second": 7.235,
581
+ "step": 26000
582
+ },
583
+ {
584
+ "epoch": 2.5254931859334793,
585
+ "grad_norm": 1.7945301532745361,
586
+ "learning_rate": 3.163378760443471e-05,
587
+ "loss": 1.4436,
588
+ "step": 26500
589
+ },
590
+ {
591
+ "epoch": 2.573144000762413,
592
+ "grad_norm": 1.5082517862319946,
593
+ "learning_rate": 2.8457066615839136e-05,
594
+ "loss": 1.4277,
595
+ "step": 27000
596
+ },
597
+ {
598
+ "epoch": 2.573144000762413,
599
+ "eval_loss": 2.3082542419433594,
600
+ "eval_runtime": 80.1802,
601
+ "eval_samples_per_second": 173.534,
602
+ "eval_steps_per_second": 7.234,
603
+ "step": 27000
604
+ },
605
+ {
606
+ "epoch": 2.6207948155913465,
607
+ "grad_norm": 1.4329321384429932,
608
+ "learning_rate": 2.5280345627243563e-05,
609
+ "loss": 1.4301,
610
+ "step": 27500
611
+ },
612
+ {
613
+ "epoch": 2.66844563042028,
614
+ "grad_norm": 1.2606436014175415,
615
+ "learning_rate": 2.2103624638647987e-05,
616
+ "loss": 1.4251,
617
+ "step": 28000
618
+ },
619
+ {
620
+ "epoch": 2.66844563042028,
621
+ "eval_loss": 2.2960703372955322,
622
+ "eval_runtime": 80.1531,
623
+ "eval_samples_per_second": 173.593,
624
+ "eval_steps_per_second": 7.236,
625
+ "step": 28000
626
+ },
627
+ {
628
+ "epoch": 2.716096445249214,
629
+ "grad_norm": 1.4542068243026733,
630
+ "learning_rate": 1.8926903650052415e-05,
631
+ "loss": 1.4248,
632
+ "step": 28500
633
+ },
634
+ {
635
+ "epoch": 2.7637472600781474,
636
+ "grad_norm": 1.6642916202545166,
637
+ "learning_rate": 1.5750182661456846e-05,
638
+ "loss": 1.4219,
639
+ "step": 29000
640
+ },
641
+ {
642
+ "epoch": 2.7637472600781474,
643
+ "eval_loss": 2.296442985534668,
644
+ "eval_runtime": 80.1753,
645
+ "eval_samples_per_second": 173.545,
646
+ "eval_steps_per_second": 7.234,
647
+ "step": 29000
648
+ },
649
+ {
650
+ "epoch": 2.811398074907081,
651
+ "grad_norm": 2.0301756858825684,
652
+ "learning_rate": 1.2573461672861273e-05,
653
+ "loss": 1.4281,
654
+ "step": 29500
655
+ },
656
+ {
657
+ "epoch": 2.8590488897360142,
658
+ "grad_norm": 1.6031594276428223,
659
+ "learning_rate": 9.3967406842657e-06,
660
+ "loss": 1.434,
661
+ "step": 30000
662
+ },
663
+ {
664
+ "epoch": 2.8590488897360142,
665
+ "eval_loss": 2.2933690547943115,
666
+ "eval_runtime": 80.1482,
667
+ "eval_samples_per_second": 173.603,
668
+ "eval_steps_per_second": 7.237,
669
+ "step": 30000
670
+ },
671
+ {
672
+ "epoch": 2.9066997045649483,
673
+ "grad_norm": 1.6658378839492798,
674
+ "learning_rate": 6.22001969567013e-06,
675
+ "loss": 1.4291,
676
+ "step": 30500
677
+ },
678
+ {
679
+ "epoch": 2.9543505193938815,
680
+ "grad_norm": 1.589982032775879,
681
+ "learning_rate": 3.0432987070745578e-06,
682
+ "loss": 1.4279,
683
+ "step": 31000
684
+ },
685
+ {
686
+ "epoch": 2.9543505193938815,
687
+ "eval_loss": 2.2906086444854736,
688
+ "eval_runtime": 80.2746,
689
+ "eval_samples_per_second": 173.33,
690
+ "eval_steps_per_second": 7.225,
691
+ "step": 31000
692
+ }
693
+ ],
694
+ "logging_steps": 500,
695
+ "max_steps": 31479,
696
+ "num_input_tokens_seen": 0,
697
+ "num_train_epochs": 3,
698
+ "save_steps": 50000,
699
+ "stateful_callbacks": {
700
+ "TrainerControl": {
701
+ "args": {
702
+ "should_epoch_stop": false,
703
+ "should_evaluate": false,
704
+ "should_log": false,
705
+ "should_save": true,
706
+ "should_training_stop": true
707
+ },
708
+ "attributes": {}
709
+ }
710
+ },
711
+ "total_flos": 4.934891962368e+16,
712
+ "train_batch_size": 24,
713
+ "trial_name": null,
714
+ "trial_params": null
715
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8dde0e4fbbe3c1bc9dd39c9618c32d5353cf56390332c7dcdbf04ad84e6ffed
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1cb82ad67a09d3ad34c485ce75404c4ecac9c33795cc43afad0edd89cf6d615
3
  size 5176