AnnantJain commited on
Commit
4caf23d
1 Parent(s): 7546b10

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. config.json +29 -0
  2. model.safetensors +3 -0
  3. optimizer.pt +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +869 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "xlm-roberta-large",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "xlm-roberta",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "output_past": true,
21
+ "pad_token_id": 1,
22
+ "position_embedding_type": "absolute",
23
+ "problem_type": "single_label_classification",
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.41.2",
26
+ "type_vocab_size": 1,
27
+ "use_cache": true,
28
+ "vocab_size": 250002
29
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daf60a4998679ac70d0ba2add1221877663daf5b7f67b9a5c703a76b4db14fdd
3
+ size 2239618672
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9014a78f9e7e5e45d19fa14e483cff568b36f60cb370dd7762d04e03cb73ca37
3
+ size 4479472721
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ca0a19ad295042159f99032b71124ba99611957add49d5a3cdcfdba689685b8
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b86637478ff97dd29675329d90ed9365d661f9d796017ebe4586552e821444e0
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,869 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8376479645847368,
3
+ "best_model_checkpoint": "./XLMR-large2-multi-109k-multi-outputs/checkpoint-40000",
4
+ "epoch": 7.893792608539648,
5
+ "eval_steps": 1000,
6
+ "global_step": 44000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.17940437746681018,
13
+ "grad_norm": 6.077027320861816,
14
+ "learning_rate": 2.242152466367713e-07,
15
+ "loss": 0.7043,
16
+ "step": 1000
17
+ },
18
+ {
19
+ "epoch": 0.17940437746681018,
20
+ "eval_accuracy": 0.5152891310929458,
21
+ "eval_f1": 0.4964881014781424,
22
+ "eval_loss": 0.6920226216316223,
23
+ "eval_precision": 0.522852726871274,
24
+ "eval_recall": 0.4726546906187625,
25
+ "eval_runtime": 103.5147,
26
+ "eval_samples_per_second": 95.726,
27
+ "eval_steps_per_second": 5.989,
28
+ "step": 1000
29
+ },
30
+ {
31
+ "epoch": 0.35880875493362036,
32
+ "grad_norm": 7.982357501983643,
33
+ "learning_rate": 4.484304932735426e-07,
34
+ "loss": 0.6972,
35
+ "step": 2000
36
+ },
37
+ {
38
+ "epoch": 0.35880875493362036,
39
+ "eval_accuracy": 0.5537390251286709,
40
+ "eval_f1": 0.514065934065934,
41
+ "eval_loss": 0.6867982745170593,
42
+ "eval_precision": 0.571882640586797,
43
+ "eval_recall": 0.4668662674650699,
44
+ "eval_runtime": 103.803,
45
+ "eval_samples_per_second": 95.46,
46
+ "eval_steps_per_second": 5.973,
47
+ "step": 2000
48
+ },
49
+ {
50
+ "epoch": 0.5382131324004306,
51
+ "grad_norm": 10.05902099609375,
52
+ "learning_rate": 6.72645739910314e-07,
53
+ "loss": 0.6892,
54
+ "step": 3000
55
+ },
56
+ {
57
+ "epoch": 0.5382131324004306,
58
+ "eval_accuracy": 0.5728125946109597,
59
+ "eval_f1": 0.44280637093589575,
60
+ "eval_loss": 0.6743206977844238,
61
+ "eval_precision": 0.6501739466563587,
62
+ "eval_recall": 0.33572854291417165,
63
+ "eval_runtime": 103.5733,
64
+ "eval_samples_per_second": 95.671,
65
+ "eval_steps_per_second": 5.986,
66
+ "step": 3000
67
+ },
68
+ {
69
+ "epoch": 0.7176175098672407,
70
+ "grad_norm": 17.559974670410156,
71
+ "learning_rate": 8.968609865470852e-07,
72
+ "loss": 0.6686,
73
+ "step": 4000
74
+ },
75
+ {
76
+ "epoch": 0.7176175098672407,
77
+ "eval_accuracy": 0.620345140781108,
78
+ "eval_f1": 0.6229705351773902,
79
+ "eval_loss": 0.6282544136047363,
80
+ "eval_precision": 0.6256038647342995,
81
+ "eval_recall": 0.6203592814371257,
82
+ "eval_runtime": 103.1262,
83
+ "eval_samples_per_second": 96.086,
84
+ "eval_steps_per_second": 6.012,
85
+ "step": 4000
86
+ },
87
+ {
88
+ "epoch": 0.897021887334051,
89
+ "grad_norm": 17.958587646484375,
90
+ "learning_rate": 9.865444034685537e-07,
91
+ "loss": 0.6251,
92
+ "step": 5000
93
+ },
94
+ {
95
+ "epoch": 0.897021887334051,
96
+ "eval_accuracy": 0.6844282975073166,
97
+ "eval_f1": 0.671568112593215,
98
+ "eval_loss": 0.5713071227073669,
99
+ "eval_precision": 0.7087120372422966,
100
+ "eval_recall": 0.63812375249501,
101
+ "eval_runtime": 103.2452,
102
+ "eval_samples_per_second": 95.975,
103
+ "eval_steps_per_second": 6.005,
104
+ "step": 5000
105
+ },
106
+ {
107
+ "epoch": 1.0764262648008611,
108
+ "grad_norm": 21.860984802246094,
109
+ "learning_rate": 9.616266321140236e-07,
110
+ "loss": 0.559,
111
+ "step": 6000
112
+ },
113
+ {
114
+ "epoch": 1.0764262648008611,
115
+ "eval_accuracy": 0.7250983953981229,
116
+ "eval_f1": 0.6946188340807175,
117
+ "eval_loss": 0.48871228098869324,
118
+ "eval_precision": 0.7923273657289003,
119
+ "eval_recall": 0.6183632734530938,
120
+ "eval_runtime": 102.8682,
121
+ "eval_samples_per_second": 96.327,
122
+ "eval_steps_per_second": 6.027,
123
+ "step": 6000
124
+ },
125
+ {
126
+ "epoch": 1.2558306422676713,
127
+ "grad_norm": 144.90872192382812,
128
+ "learning_rate": 9.367088607594936e-07,
129
+ "loss": 0.4901,
130
+ "step": 7000
131
+ },
132
+ {
133
+ "epoch": 1.2558306422676713,
134
+ "eval_accuracy": 0.7360984963164799,
135
+ "eval_f1": 0.7147998691242229,
136
+ "eval_loss": 0.4424116909503937,
137
+ "eval_precision": 0.7879297908150997,
138
+ "eval_recall": 0.6540918163672654,
139
+ "eval_runtime": 103.0989,
140
+ "eval_samples_per_second": 96.112,
141
+ "eval_steps_per_second": 6.014,
142
+ "step": 7000
143
+ },
144
+ {
145
+ "epoch": 1.4352350197344816,
146
+ "grad_norm": 23.610597610473633,
147
+ "learning_rate": 9.117910894049637e-07,
148
+ "loss": 0.4595,
149
+ "step": 8000
150
+ },
151
+ {
152
+ "epoch": 1.4352350197344816,
153
+ "eval_accuracy": 0.7460894136643456,
154
+ "eval_f1": 0.7595106098260371,
155
+ "eval_loss": 0.4278419017791748,
156
+ "eval_precision": 0.7287234042553191,
157
+ "eval_recall": 0.7930139720558882,
158
+ "eval_runtime": 103.2272,
159
+ "eval_samples_per_second": 95.992,
160
+ "eval_steps_per_second": 6.006,
161
+ "step": 8000
162
+ },
163
+ {
164
+ "epoch": 1.6146393972012918,
165
+ "grad_norm": 13.438475608825684,
166
+ "learning_rate": 8.868733180504335e-07,
167
+ "loss": 0.4431,
168
+ "step": 9000
169
+ },
170
+ {
171
+ "epoch": 1.6146393972012918,
172
+ "eval_accuracy": 0.7517408416590978,
173
+ "eval_f1": 0.7542457542457542,
174
+ "eval_loss": 0.4201831519603729,
175
+ "eval_precision": 0.755,
176
+ "eval_recall": 0.7534930139720559,
177
+ "eval_runtime": 103.1397,
178
+ "eval_samples_per_second": 96.074,
179
+ "eval_steps_per_second": 6.011,
180
+ "step": 9000
181
+ },
182
+ {
183
+ "epoch": 1.794043774668102,
184
+ "grad_norm": 59.678855895996094,
185
+ "learning_rate": 8.619555466959035e-07,
186
+ "loss": 0.4246,
187
+ "step": 10000
188
+ },
189
+ {
190
+ "epoch": 1.794043774668102,
191
+ "eval_accuracy": 0.7612271672217177,
192
+ "eval_f1": 0.795611610228058,
193
+ "eval_loss": 0.4052415192127228,
194
+ "eval_precision": 0.7013402375875724,
195
+ "eval_recall": 0.9191616766467066,
196
+ "eval_runtime": 103.3719,
197
+ "eval_samples_per_second": 95.858,
198
+ "eval_steps_per_second": 5.998,
199
+ "step": 10000
200
+ },
201
+ {
202
+ "epoch": 1.973448152134912,
203
+ "grad_norm": 56.009273529052734,
204
+ "learning_rate": 8.370377753413735e-07,
205
+ "loss": 0.4168,
206
+ "step": 11000
207
+ },
208
+ {
209
+ "epoch": 1.973448152134912,
210
+ "eval_accuracy": 0.7611262488646685,
211
+ "eval_f1": 0.7569565663825855,
212
+ "eval_loss": 0.39738962054252625,
213
+ "eval_precision": 0.7794459716641996,
214
+ "eval_recall": 0.7357285429141717,
215
+ "eval_runtime": 103.343,
216
+ "eval_samples_per_second": 95.885,
217
+ "eval_steps_per_second": 5.999,
218
+ "step": 11000
219
+ },
220
+ {
221
+ "epoch": 2.1528525296017222,
222
+ "grad_norm": 48.44904708862305,
223
+ "learning_rate": 8.121200039868433e-07,
224
+ "loss": 0.4074,
225
+ "step": 12000
226
+ },
227
+ {
228
+ "epoch": 2.1528525296017222,
229
+ "eval_accuracy": 0.7603189020082753,
230
+ "eval_f1": 0.7932445373030382,
231
+ "eval_loss": 0.42876219749450684,
232
+ "eval_precision": 0.7034120734908137,
233
+ "eval_recall": 0.9093812375249501,
234
+ "eval_runtime": 103.2666,
235
+ "eval_samples_per_second": 95.955,
236
+ "eval_steps_per_second": 6.004,
237
+ "step": 12000
238
+ },
239
+ {
240
+ "epoch": 2.3322569070685324,
241
+ "grad_norm": 7.191207408905029,
242
+ "learning_rate": 7.872022326323134e-07,
243
+ "loss": 0.398,
244
+ "step": 13000
245
+ },
246
+ {
247
+ "epoch": 2.3322569070685324,
248
+ "eval_accuracy": 0.7668785952164698,
249
+ "eval_f1": 0.7844747154319835,
250
+ "eval_loss": 0.39464080333709717,
251
+ "eval_precision": 0.7365101611772951,
252
+ "eval_recall": 0.8391217564870259,
253
+ "eval_runtime": 103.6045,
254
+ "eval_samples_per_second": 95.643,
255
+ "eval_steps_per_second": 5.984,
256
+ "step": 13000
257
+ },
258
+ {
259
+ "epoch": 2.5116612845353425,
260
+ "grad_norm": 8.779580116271973,
261
+ "learning_rate": 7.622844612777832e-07,
262
+ "loss": 0.4009,
263
+ "step": 14000
264
+ },
265
+ {
266
+ "epoch": 2.5116612845353425,
267
+ "eval_accuracy": 0.7699061459279443,
268
+ "eval_f1": 0.7972251867662753,
269
+ "eval_loss": 0.38235536217689514,
270
+ "eval_precision": 0.7189605389797883,
271
+ "eval_recall": 0.8946107784431138,
272
+ "eval_runtime": 103.5288,
273
+ "eval_samples_per_second": 95.713,
274
+ "eval_steps_per_second": 5.989,
275
+ "step": 14000
276
+ },
277
+ {
278
+ "epoch": 2.6910656620021527,
279
+ "grad_norm": 10.75382137298584,
280
+ "learning_rate": 7.373666899232532e-07,
281
+ "loss": 0.383,
282
+ "step": 15000
283
+ },
284
+ {
285
+ "epoch": 2.6910656620021527,
286
+ "eval_accuracy": 0.7800988999899081,
287
+ "eval_f1": 0.7934401365058299,
288
+ "eval_loss": 0.4023512005805969,
289
+ "eval_precision": 0.7555515435999278,
290
+ "eval_recall": 0.8353293413173652,
291
+ "eval_runtime": 103.2649,
292
+ "eval_samples_per_second": 95.957,
293
+ "eval_steps_per_second": 6.004,
294
+ "step": 15000
295
+ },
296
+ {
297
+ "epoch": 2.8704700394689633,
298
+ "grad_norm": 8.595725059509277,
299
+ "learning_rate": 7.124489185687232e-07,
300
+ "loss": 0.3869,
301
+ "step": 16000
302
+ },
303
+ {
304
+ "epoch": 2.8704700394689633,
305
+ "eval_accuracy": 0.7843374709859724,
306
+ "eval_f1": 0.7970753014908366,
307
+ "eval_loss": 0.3746848404407501,
308
+ "eval_precision": 0.7601883716717985,
309
+ "eval_recall": 0.8377245508982036,
310
+ "eval_runtime": 103.4123,
311
+ "eval_samples_per_second": 95.82,
312
+ "eval_steps_per_second": 5.995,
313
+ "step": 16000
314
+ },
315
+ {
316
+ "epoch": 3.0498744169357734,
317
+ "grad_norm": 30.062721252441406,
318
+ "learning_rate": 6.875311472141931e-07,
319
+ "loss": 0.3761,
320
+ "step": 17000
321
+ },
322
+ {
323
+ "epoch": 3.0498744169357734,
324
+ "eval_accuracy": 0.7884751236249874,
325
+ "eval_f1": 0.7946708463949843,
326
+ "eval_loss": 0.39211228489875793,
327
+ "eval_precision": 0.7803001154290111,
328
+ "eval_recall": 0.8095808383233533,
329
+ "eval_runtime": 103.6712,
330
+ "eval_samples_per_second": 95.581,
331
+ "eval_steps_per_second": 5.98,
332
+ "step": 17000
333
+ },
334
+ {
335
+ "epoch": 3.2292787944025836,
336
+ "grad_norm": 56.15926742553711,
337
+ "learning_rate": 6.62613375859663e-07,
338
+ "loss": 0.3609,
339
+ "step": 18000
340
+ },
341
+ {
342
+ "epoch": 3.2292787944025836,
343
+ "eval_accuracy": 0.784741144414169,
344
+ "eval_f1": 0.7985645481159694,
345
+ "eval_loss": 0.39061158895492554,
346
+ "eval_precision": 0.7578419071518193,
347
+ "eval_recall": 0.8439121756487026,
348
+ "eval_runtime": 103.0596,
349
+ "eval_samples_per_second": 96.148,
350
+ "eval_steps_per_second": 6.016,
351
+ "step": 18000
352
+ },
353
+ {
354
+ "epoch": 3.4086831718693937,
355
+ "grad_norm": 7.428126811981201,
356
+ "learning_rate": 6.376956045051331e-07,
357
+ "loss": 0.3535,
358
+ "step": 19000
359
+ },
360
+ {
361
+ "epoch": 3.4086831718693937,
362
+ "eval_accuracy": 0.7953375719043294,
363
+ "eval_f1": 0.8157368707977467,
364
+ "eval_loss": 0.3811704218387604,
365
+ "eval_precision": 0.7486657771847899,
366
+ "eval_recall": 0.8960079840319362,
367
+ "eval_runtime": 103.3599,
368
+ "eval_samples_per_second": 95.869,
369
+ "eval_steps_per_second": 5.998,
370
+ "step": 19000
371
+ },
372
+ {
373
+ "epoch": 3.588087549336204,
374
+ "grad_norm": 248.54281616210938,
375
+ "learning_rate": 6.127778331506029e-07,
376
+ "loss": 0.3497,
377
+ "step": 20000
378
+ },
379
+ {
380
+ "epoch": 3.588087549336204,
381
+ "eval_accuracy": 0.7977596124735089,
382
+ "eval_f1": 0.8121836925960637,
383
+ "eval_loss": 0.37064263224601746,
384
+ "eval_precision": 0.765547703180212,
385
+ "eval_recall": 0.8648702594810379,
386
+ "eval_runtime": 103.2293,
387
+ "eval_samples_per_second": 95.99,
388
+ "eval_steps_per_second": 6.006,
389
+ "step": 20000
390
+ },
391
+ {
392
+ "epoch": 3.767491926803014,
393
+ "grad_norm": 44.91804504394531,
394
+ "learning_rate": 5.87860061796073e-07,
395
+ "loss": 0.3543,
396
+ "step": 21000
397
+ },
398
+ {
399
+ "epoch": 3.767491926803014,
400
+ "eval_accuracy": 0.8025027752548188,
401
+ "eval_f1": 0.8141676953755579,
402
+ "eval_loss": 0.3442750871181488,
403
+ "eval_precision": 0.7764897663466763,
404
+ "eval_recall": 0.855688622754491,
405
+ "eval_runtime": 103.1871,
406
+ "eval_samples_per_second": 96.029,
407
+ "eval_steps_per_second": 6.009,
408
+ "step": 21000
409
+ },
410
+ {
411
+ "epoch": 3.946896304269824,
412
+ "grad_norm": 84.26334381103516,
413
+ "learning_rate": 5.629422904415428e-07,
414
+ "loss": 0.3425,
415
+ "step": 22000
416
+ },
417
+ {
418
+ "epoch": 3.946896304269824,
419
+ "eval_accuracy": 0.8035119588253103,
420
+ "eval_f1": 0.8079692277344905,
421
+ "eval_loss": 0.3556448519229889,
422
+ "eval_precision": 0.7985962175862741,
423
+ "eval_recall": 0.817564870259481,
424
+ "eval_runtime": 102.9714,
425
+ "eval_samples_per_second": 96.231,
426
+ "eval_steps_per_second": 6.021,
427
+ "step": 22000
428
+ },
429
+ {
430
+ "epoch": 4.126300681736635,
431
+ "grad_norm": 23.619245529174805,
432
+ "learning_rate": 5.380245190870128e-07,
433
+ "loss": 0.335,
434
+ "step": 23000
435
+ },
436
+ {
437
+ "epoch": 4.126300681736635,
438
+ "eval_accuracy": 0.8024018568977697,
439
+ "eval_f1": 0.8052903739061257,
440
+ "eval_loss": 0.3544567823410034,
441
+ "eval_precision": 0.8024177566389219,
442
+ "eval_recall": 0.808183632734531,
443
+ "eval_runtime": 102.8733,
444
+ "eval_samples_per_second": 96.322,
445
+ "eval_steps_per_second": 6.027,
446
+ "step": 23000
447
+ },
448
+ {
449
+ "epoch": 4.3057050592034445,
450
+ "grad_norm": 16.840389251708984,
451
+ "learning_rate": 5.131067477324828e-07,
452
+ "loss": 0.3222,
453
+ "step": 24000
454
+ },
455
+ {
456
+ "epoch": 4.3057050592034445,
457
+ "eval_accuracy": 0.8070441013220305,
458
+ "eval_f1": 0.8103550882761357,
459
+ "eval_loss": 0.349142849445343,
460
+ "eval_precision": 0.8054022082018928,
461
+ "eval_recall": 0.8153692614770459,
462
+ "eval_runtime": 103.0742,
463
+ "eval_samples_per_second": 96.135,
464
+ "eval_steps_per_second": 6.015,
465
+ "step": 24000
466
+ },
467
+ {
468
+ "epoch": 4.485109436670255,
469
+ "grad_norm": 49.83803939819336,
470
+ "learning_rate": 4.881889763779527e-07,
471
+ "loss": 0.3157,
472
+ "step": 25000
473
+ },
474
+ {
475
+ "epoch": 4.485109436670255,
476
+ "eval_accuracy": 0.8095670602482592,
477
+ "eval_f1": 0.8254232583957813,
478
+ "eval_loss": 0.357431560754776,
479
+ "eval_precision": 0.7692705638903259,
480
+ "eval_recall": 0.8904191616766467,
481
+ "eval_runtime": 103.3608,
482
+ "eval_samples_per_second": 95.868,
483
+ "eval_steps_per_second": 5.998,
484
+ "step": 25000
485
+ },
486
+ {
487
+ "epoch": 4.664513814137065,
488
+ "grad_norm": 134.8468475341797,
489
+ "learning_rate": 4.632712050234227e-07,
490
+ "loss": 0.3207,
491
+ "step": 26000
492
+ },
493
+ {
494
+ "epoch": 4.664513814137065,
495
+ "eval_accuracy": 0.8153194066000605,
496
+ "eval_f1": 0.8328156404165905,
497
+ "eval_loss": 0.34428831934928894,
498
+ "eval_precision": 0.7678571428571429,
499
+ "eval_recall": 0.9097804391217564,
500
+ "eval_runtime": 103.0601,
501
+ "eval_samples_per_second": 96.148,
502
+ "eval_steps_per_second": 6.016,
503
+ "step": 26000
504
+ },
505
+ {
506
+ "epoch": 4.843918191603875,
507
+ "grad_norm": 12.487037658691406,
508
+ "learning_rate": 4.3835343366889267e-07,
509
+ "loss": 0.3217,
510
+ "step": 27000
511
+ },
512
+ {
513
+ "epoch": 4.843918191603875,
514
+ "eval_accuracy": 0.8124936926026844,
515
+ "eval_f1": 0.8151611619578193,
516
+ "eval_loss": 0.3367626368999481,
517
+ "eval_precision": 0.8125743752479175,
518
+ "eval_recall": 0.8177644710578842,
519
+ "eval_runtime": 103.075,
520
+ "eval_samples_per_second": 96.134,
521
+ "eval_steps_per_second": 6.015,
522
+ "step": 27000
523
+ },
524
+ {
525
+ "epoch": 5.023322569070685,
526
+ "grad_norm": 10.074256896972656,
527
+ "learning_rate": 4.1343566231436264e-07,
528
+ "loss": 0.3184,
529
+ "step": 28000
530
+ },
531
+ {
532
+ "epoch": 5.023322569070685,
533
+ "eval_accuracy": 0.8171359370269452,
534
+ "eval_f1": 0.8204518430439952,
535
+ "eval_loss": 0.3432736396789551,
536
+ "eval_precision": 0.8146399055489965,
537
+ "eval_recall": 0.8263473053892215,
538
+ "eval_runtime": 103.2718,
539
+ "eval_samples_per_second": 95.951,
540
+ "eval_steps_per_second": 6.004,
541
+ "step": 28000
542
+ },
543
+ {
544
+ "epoch": 5.202726946537496,
545
+ "grad_norm": 30.08102035522461,
546
+ "learning_rate": 3.8851789095983255e-07,
547
+ "loss": 0.2981,
548
+ "step": 29000
549
+ },
550
+ {
551
+ "epoch": 5.202726946537496,
552
+ "eval_accuracy": 0.8162276718135029,
553
+ "eval_f1": 0.8155202107182656,
554
+ "eval_loss": 0.34637027978897095,
555
+ "eval_precision": 0.8280189261468833,
556
+ "eval_recall": 0.8033932135728543,
557
+ "eval_runtime": 102.9045,
558
+ "eval_samples_per_second": 96.293,
559
+ "eval_steps_per_second": 6.025,
560
+ "step": 29000
561
+ },
562
+ {
563
+ "epoch": 5.382131324004305,
564
+ "grad_norm": 12.194862365722656,
565
+ "learning_rate": 3.6360011960530246e-07,
566
+ "loss": 0.301,
567
+ "step": 30000
568
+ },
569
+ {
570
+ "epoch": 5.382131324004305,
571
+ "eval_accuracy": 0.817640528812191,
572
+ "eval_f1": 0.8211776348342404,
573
+ "eval_loss": 0.342290461063385,
574
+ "eval_precision": 0.8143277723258097,
575
+ "eval_recall": 0.8281437125748503,
576
+ "eval_runtime": 103.5562,
577
+ "eval_samples_per_second": 95.687,
578
+ "eval_steps_per_second": 5.987,
579
+ "step": 30000
580
+ },
581
+ {
582
+ "epoch": 5.561535701471116,
583
+ "grad_norm": 154.16159057617188,
584
+ "learning_rate": 3.386823482507724e-07,
585
+ "loss": 0.2979,
586
+ "step": 31000
587
+ },
588
+ {
589
+ "epoch": 5.561535701471116,
590
+ "eval_accuracy": 0.8201634877384196,
591
+ "eval_f1": 0.8214070956103428,
592
+ "eval_loss": 0.32883062958717346,
593
+ "eval_precision": 0.8248792270531401,
594
+ "eval_recall": 0.8179640718562874,
595
+ "eval_runtime": 107.2514,
596
+ "eval_samples_per_second": 92.39,
597
+ "eval_steps_per_second": 5.781,
598
+ "step": 31000
599
+ },
600
+ {
601
+ "epoch": 5.740940078937927,
602
+ "grad_norm": 20.60382080078125,
603
+ "learning_rate": 3.137645768962424e-07,
604
+ "loss": 0.2941,
605
+ "step": 32000
606
+ },
607
+ {
608
+ "epoch": 5.740940078937927,
609
+ "eval_accuracy": 0.8215763447371077,
610
+ "eval_f1": 0.8254689042448173,
611
+ "eval_loss": 0.341677188873291,
612
+ "eval_precision": 0.8166015625,
613
+ "eval_recall": 0.8345309381237525,
614
+ "eval_runtime": 103.1122,
615
+ "eval_samples_per_second": 96.099,
616
+ "eval_steps_per_second": 6.013,
617
+ "step": 32000
618
+ },
619
+ {
620
+ "epoch": 5.920344456404736,
621
+ "grad_norm": 27.749670028686523,
622
+ "learning_rate": 2.888468055417123e-07,
623
+ "loss": 0.3015,
624
+ "step": 33000
625
+ },
626
+ {
627
+ "epoch": 5.920344456404736,
628
+ "eval_accuracy": 0.8243011403774346,
629
+ "eval_f1": 0.8335404914427765,
630
+ "eval_loss": 0.33678942918777466,
631
+ "eval_precision": 0.799963296017618,
632
+ "eval_recall": 0.870059880239521,
633
+ "eval_runtime": 103.0115,
634
+ "eval_samples_per_second": 96.193,
635
+ "eval_steps_per_second": 6.019,
636
+ "step": 33000
637
+ },
638
+ {
639
+ "epoch": 6.099748833871547,
640
+ "grad_norm": 63.67295455932617,
641
+ "learning_rate": 2.6392903418718226e-07,
642
+ "loss": 0.2953,
643
+ "step": 34000
644
+ },
645
+ {
646
+ "epoch": 6.099748833871547,
647
+ "eval_accuracy": 0.8256130790190735,
648
+ "eval_f1": 0.8240684178375076,
649
+ "eval_loss": 0.33581623435020447,
650
+ "eval_precision": 0.8410224438902744,
651
+ "eval_recall": 0.8077844311377246,
652
+ "eval_runtime": 103.1426,
653
+ "eval_samples_per_second": 96.071,
654
+ "eval_steps_per_second": 6.011,
655
+ "step": 34000
656
+ },
657
+ {
658
+ "epoch": 6.279153211338357,
659
+ "grad_norm": 26.843647003173828,
660
+ "learning_rate": 2.390112628326522e-07,
661
+ "loss": 0.2852,
662
+ "step": 35000
663
+ },
664
+ {
665
+ "epoch": 6.279153211338357,
666
+ "eval_accuracy": 0.8249066505197296,
667
+ "eval_f1": 0.8327065856715842,
668
+ "eval_loss": 0.34431934356689453,
669
+ "eval_precision": 0.8054467450102593,
670
+ "eval_recall": 0.86187624750499,
671
+ "eval_runtime": 103.3497,
672
+ "eval_samples_per_second": 95.878,
673
+ "eval_steps_per_second": 5.999,
674
+ "step": 35000
675
+ },
676
+ {
677
+ "epoch": 6.458557588805167,
678
+ "grad_norm": 183.19422912597656,
679
+ "learning_rate": 2.140934914781222e-07,
680
+ "loss": 0.2917,
681
+ "step": 36000
682
+ },
683
+ {
684
+ "epoch": 6.458557588805167,
685
+ "eval_accuracy": 0.824502977091533,
686
+ "eval_f1": 0.8318669631634922,
687
+ "eval_loss": 0.34868115186691284,
688
+ "eval_precision": 0.8066754172135758,
689
+ "eval_recall": 0.858682634730539,
690
+ "eval_runtime": 103.3302,
691
+ "eval_samples_per_second": 95.897,
692
+ "eval_steps_per_second": 6.0,
693
+ "step": 36000
694
+ },
695
+ {
696
+ "epoch": 6.637961966271977,
697
+ "grad_norm": 10.319212913513184,
698
+ "learning_rate": 1.8917572012359216e-07,
699
+ "loss": 0.2844,
700
+ "step": 37000
701
+ },
702
+ {
703
+ "epoch": 6.637961966271977,
704
+ "eval_accuracy": 0.8261176708043193,
705
+ "eval_f1": 0.8294565970503811,
706
+ "eval_loss": 0.32437703013420105,
707
+ "eval_precision": 0.8226978205379933,
708
+ "eval_recall": 0.8363273453093812,
709
+ "eval_runtime": 104.1332,
710
+ "eval_samples_per_second": 95.157,
711
+ "eval_steps_per_second": 5.954,
712
+ "step": 37000
713
+ },
714
+ {
715
+ "epoch": 6.8173663437387875,
716
+ "grad_norm": 58.22975540161133,
717
+ "learning_rate": 1.642579487690621e-07,
718
+ "loss": 0.2837,
719
+ "step": 38000
720
+ },
721
+ {
722
+ "epoch": 6.8173663437387875,
723
+ "eval_accuracy": 0.8285397113734988,
724
+ "eval_f1": 0.8333823673629499,
725
+ "eval_loss": 0.3295113742351532,
726
+ "eval_precision": 0.8191632928475033,
727
+ "eval_recall": 0.8481037924151696,
728
+ "eval_runtime": 103.1218,
729
+ "eval_samples_per_second": 96.09,
730
+ "eval_steps_per_second": 6.012,
731
+ "step": 38000
732
+ },
733
+ {
734
+ "epoch": 6.996770721205597,
735
+ "grad_norm": 8.232932090759277,
736
+ "learning_rate": 1.3934017741453206e-07,
737
+ "loss": 0.283,
738
+ "step": 39000
739
+ },
740
+ {
741
+ "epoch": 6.996770721205597,
742
+ "eval_accuracy": 0.8263195075184177,
743
+ "eval_f1": 0.8296882731321128,
744
+ "eval_loss": 0.3371128439903259,
745
+ "eval_precision": 0.8227674190382728,
746
+ "eval_recall": 0.8367265469061876,
747
+ "eval_runtime": 103.2754,
748
+ "eval_samples_per_second": 95.947,
749
+ "eval_steps_per_second": 6.003,
750
+ "step": 39000
751
+ },
752
+ {
753
+ "epoch": 7.176175098672408,
754
+ "grad_norm": 18.62181282043457,
755
+ "learning_rate": 1.14422406060002e-07,
756
+ "loss": 0.2711,
757
+ "step": 40000
758
+ },
759
+ {
760
+ "epoch": 7.176175098672408,
761
+ "eval_accuracy": 0.8297507316580887,
762
+ "eval_f1": 0.8376479645847368,
763
+ "eval_loss": 0.32895320653915405,
764
+ "eval_precision": 0.8087716037911169,
765
+ "eval_recall": 0.8686626746506986,
766
+ "eval_runtime": 103.3849,
767
+ "eval_samples_per_second": 95.846,
768
+ "eval_steps_per_second": 5.997,
769
+ "step": 40000
770
+ },
771
+ {
772
+ "epoch": 7.3555794761392175,
773
+ "grad_norm": 15.900300025939941,
774
+ "learning_rate": 8.950463470547195e-08,
775
+ "loss": 0.273,
776
+ "step": 41000
777
+ },
778
+ {
779
+ "epoch": 7.3555794761392175,
780
+ "eval_accuracy": 0.8292461398728429,
781
+ "eval_f1": 0.8348946135831382,
782
+ "eval_loss": 0.34222128987312317,
783
+ "eval_precision": 0.8167239404352806,
784
+ "eval_recall": 0.8538922155688623,
785
+ "eval_runtime": 103.3124,
786
+ "eval_samples_per_second": 95.913,
787
+ "eval_steps_per_second": 6.001,
788
+ "step": 41000
789
+ },
790
+ {
791
+ "epoch": 7.534983853606028,
792
+ "grad_norm": 54.62172317504883,
793
+ "learning_rate": 6.45868633509419e-08,
794
+ "loss": 0.2795,
795
+ "step": 42000
796
+ },
797
+ {
798
+ "epoch": 7.534983853606028,
799
+ "eval_accuracy": 0.8275305278030074,
800
+ "eval_f1": 0.8291512546236129,
801
+ "eval_loss": 0.33169299364089966,
802
+ "eval_precision": 0.8305627879030643,
803
+ "eval_recall": 0.8277445109780439,
804
+ "eval_runtime": 103.4355,
805
+ "eval_samples_per_second": 95.799,
806
+ "eval_steps_per_second": 5.994,
807
+ "step": 42000
808
+ },
809
+ {
810
+ "epoch": 7.714388231072839,
811
+ "grad_norm": 47.589847564697266,
812
+ "learning_rate": 3.9669091996411835e-08,
813
+ "loss": 0.2739,
814
+ "step": 43000
815
+ },
816
+ {
817
+ "epoch": 7.714388231072839,
818
+ "eval_accuracy": 0.8305580785144818,
819
+ "eval_f1": 0.8372904351196822,
820
+ "eval_loss": 0.336332768201828,
821
+ "eval_precision": 0.8137125635712941,
822
+ "eval_recall": 0.8622754491017964,
823
+ "eval_runtime": 103.0445,
824
+ "eval_samples_per_second": 96.162,
825
+ "eval_steps_per_second": 6.017,
826
+ "step": 43000
827
+ },
828
+ {
829
+ "epoch": 7.893792608539648,
830
+ "grad_norm": 32.284854888916016,
831
+ "learning_rate": 1.475132064188179e-08,
832
+ "loss": 0.2771,
833
+ "step": 44000
834
+ },
835
+ {
836
+ "epoch": 7.893792608539648,
837
+ "eval_accuracy": 0.8294479765869411,
838
+ "eval_f1": 0.8322747121873759,
839
+ "eval_loss": 0.33281558752059937,
840
+ "eval_precision": 0.8276746940386893,
841
+ "eval_recall": 0.8369261477045908,
842
+ "eval_runtime": 103.2292,
843
+ "eval_samples_per_second": 95.99,
844
+ "eval_steps_per_second": 6.006,
845
+ "step": 44000
846
+ }
847
+ ],
848
+ "logging_steps": 1000,
849
+ "max_steps": 44592,
850
+ "num_input_tokens_seen": 0,
851
+ "num_train_epochs": 8,
852
+ "save_steps": 1000,
853
+ "stateful_callbacks": {
854
+ "TrainerControl": {
855
+ "args": {
856
+ "should_epoch_stop": false,
857
+ "should_evaluate": false,
858
+ "should_log": false,
859
+ "should_save": true,
860
+ "should_training_stop": false
861
+ },
862
+ "attributes": {}
863
+ }
864
+ },
865
+ "total_flos": 4.949009928618441e+17,
866
+ "train_batch_size": 16,
867
+ "trial_name": null,
868
+ "trial_params": null
869
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74bc508abee099291ad8894c55837db6477c06aa2700860858f3c6e09ed6af95
3
+ size 5112