certainstar committed on
Commit
40d9b50
1 Parent(s): 0718a9e

Upload 10 files

Browse files
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../bert/bert-base-multilingual-cased",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "pooler_fc_size": 768,
21
+ "pooler_num_attention_heads": 12,
22
+ "pooler_num_fc_layers": 3,
23
+ "pooler_size_per_head": 128,
24
+ "pooler_type": "first_token_transform",
25
+ "position_embedding_type": "absolute",
26
+ "problem_type": "single_label_classification",
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.40.1",
29
+ "type_vocab_size": 2,
30
+ "use_cache": true,
31
+ "vocab_size": 119547
32
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa26f51f5c8da79bac14946824e177b478f8c0efd679a7004cdc4b7ec10ad85b
3
+ size 711443456
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afd8766b32c28117ab1e9cb56fff58054d24ba23e768a4a1124fe9cf54ee1ed6
3
+ size 1423007994
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:200bb63df685a51d719a8fee2c6f9b9ba699d7e44eb7fd47832b81313cd2f68f
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76320ce4ec97c5c9aba00c5f9ac079ba0900242917453092d9390b52938f00f6
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "strip_accents": null,
54
+ "tokenize_chinese_chars": true,
55
+ "tokenizer_class": "BertTokenizer",
56
+ "unk_token": "[UNK]"
57
+ }
trainer_state.json ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 11154,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.13448090371167296,
13
+ "grad_norm": 4.317680835723877,
14
+ "learning_rate": 1.9103460641922183e-05,
15
+ "loss": 0.2456,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.13448090371167296,
20
+ "eval_accuracy": 0.9731038192576654,
21
+ "eval_loss": 0.10059116035699844,
22
+ "eval_runtime": 79.8153,
23
+ "eval_samples_per_second": 93.165,
24
+ "eval_steps_per_second": 11.652,
25
+ "step": 500
26
+ },
27
+ {
28
+ "epoch": 0.2689618074233459,
29
+ "grad_norm": 15.100606918334961,
30
+ "learning_rate": 1.8206921283844363e-05,
31
+ "loss": 0.1162,
32
+ "step": 1000
33
+ },
34
+ {
35
+ "epoch": 0.2689618074233459,
36
+ "eval_accuracy": 0.9665142549757935,
37
+ "eval_loss": 0.17202958464622498,
38
+ "eval_runtime": 77.2682,
39
+ "eval_samples_per_second": 96.236,
40
+ "eval_steps_per_second": 12.036,
41
+ "step": 1000
42
+ },
43
+ {
44
+ "epoch": 0.40344271113501884,
45
+ "grad_norm": 0.02275875024497509,
46
+ "learning_rate": 1.731038192576654e-05,
47
+ "loss": 0.0779,
48
+ "step": 1500
49
+ },
50
+ {
51
+ "epoch": 0.40344271113501884,
52
+ "eval_accuracy": 0.957235072619688,
53
+ "eval_loss": 0.22287946939468384,
54
+ "eval_runtime": 86.088,
55
+ "eval_samples_per_second": 86.377,
56
+ "eval_steps_per_second": 10.803,
57
+ "step": 1500
58
+ },
59
+ {
60
+ "epoch": 0.5379236148466918,
61
+ "grad_norm": 0.004053408745676279,
62
+ "learning_rate": 1.6413842567688722e-05,
63
+ "loss": 0.082,
64
+ "step": 2000
65
+ },
66
+ {
67
+ "epoch": 0.5379236148466918,
68
+ "eval_accuracy": 0.956831629908553,
69
+ "eval_loss": 0.2240680307149887,
70
+ "eval_runtime": 76.9592,
71
+ "eval_samples_per_second": 96.623,
72
+ "eval_steps_per_second": 12.084,
73
+ "step": 2000
74
+ },
75
+ {
76
+ "epoch": 0.6724045185583647,
77
+ "grad_norm": 0.0030935597606003284,
78
+ "learning_rate": 1.5517303209610903e-05,
79
+ "loss": 0.0645,
80
+ "step": 2500
81
+ },
82
+ {
83
+ "epoch": 0.6724045185583647,
84
+ "eval_accuracy": 0.9519903173749328,
85
+ "eval_loss": 0.30673694610595703,
86
+ "eval_runtime": 72.7269,
87
+ "eval_samples_per_second": 102.246,
88
+ "eval_steps_per_second": 12.788,
89
+ "step": 2500
90
+ },
91
+ {
92
+ "epoch": 0.8068854222700377,
93
+ "grad_norm": 0.019974075257778168,
94
+ "learning_rate": 1.4620763851533084e-05,
95
+ "loss": 0.059,
96
+ "step": 3000
97
+ },
98
+ {
99
+ "epoch": 0.8068854222700377,
100
+ "eval_accuracy": 0.9909897794513179,
101
+ "eval_loss": 0.050069019198417664,
102
+ "eval_runtime": 79.0838,
103
+ "eval_samples_per_second": 94.027,
104
+ "eval_steps_per_second": 11.76,
105
+ "step": 3000
106
+ },
107
+ {
108
+ "epoch": 0.9413663259817105,
109
+ "grad_norm": 0.008741290308535099,
110
+ "learning_rate": 1.3724224493455265e-05,
111
+ "loss": 0.0582,
112
+ "step": 3500
113
+ },
114
+ {
115
+ "epoch": 0.9413663259817105,
116
+ "eval_accuracy": 0.9911242603550295,
117
+ "eval_loss": 0.03790881112217903,
118
+ "eval_runtime": 77.0838,
119
+ "eval_samples_per_second": 96.466,
120
+ "eval_steps_per_second": 12.065,
121
+ "step": 3500
122
+ },
123
+ {
124
+ "epoch": 1.0758472296933834,
125
+ "grad_norm": 38.54170608520508,
126
+ "learning_rate": 1.2827685135377444e-05,
127
+ "loss": 0.0338,
128
+ "step": 4000
129
+ },
130
+ {
131
+ "epoch": 1.0758472296933834,
132
+ "eval_accuracy": 0.9751210328133405,
133
+ "eval_loss": 0.15974809229373932,
134
+ "eval_runtime": 80.1945,
135
+ "eval_samples_per_second": 92.725,
136
+ "eval_steps_per_second": 11.597,
137
+ "step": 4000
138
+ },
139
+ {
140
+ "epoch": 1.2103281334050564,
141
+ "grad_norm": 29.925457000732422,
142
+ "learning_rate": 1.1931145777299625e-05,
143
+ "loss": 0.03,
144
+ "step": 4500
145
+ },
146
+ {
147
+ "epoch": 1.2103281334050564,
148
+ "eval_accuracy": 0.9916621839698763,
149
+ "eval_loss": 0.046014368534088135,
150
+ "eval_runtime": 57.8477,
151
+ "eval_samples_per_second": 128.544,
152
+ "eval_steps_per_second": 16.077,
153
+ "step": 4500
154
+ },
155
+ {
156
+ "epoch": 1.3448090371167294,
157
+ "grad_norm": 0.007805847562849522,
158
+ "learning_rate": 1.1034606419221806e-05,
159
+ "loss": 0.0215,
160
+ "step": 5000
161
+ },
162
+ {
163
+ "epoch": 1.3448090371167294,
164
+ "eval_accuracy": 0.9772727272727273,
165
+ "eval_loss": 0.15217718482017517,
166
+ "eval_runtime": 74.0806,
167
+ "eval_samples_per_second": 100.377,
168
+ "eval_steps_per_second": 12.554,
169
+ "step": 5000
170
+ },
171
+ {
172
+ "epoch": 1.4792899408284024,
173
+ "grad_norm": 0.0005576165858656168,
174
+ "learning_rate": 1.0138067061143987e-05,
175
+ "loss": 0.0144,
176
+ "step": 5500
177
+ },
178
+ {
179
+ "epoch": 1.4792899408284024,
180
+ "eval_accuracy": 0.9413663259817105,
181
+ "eval_loss": 0.4672979414463043,
182
+ "eval_runtime": 74.077,
183
+ "eval_samples_per_second": 100.382,
184
+ "eval_steps_per_second": 12.555,
185
+ "step": 5500
186
+ },
187
+ {
188
+ "epoch": 1.6137708445400754,
189
+ "grad_norm": 0.5596923232078552,
190
+ "learning_rate": 9.241527703066166e-06,
191
+ "loss": 0.0357,
192
+ "step": 6000
193
+ },
194
+ {
195
+ "epoch": 1.6137708445400754,
196
+ "eval_accuracy": 0.991393222162453,
197
+ "eval_loss": 0.05371003597974777,
198
+ "eval_runtime": 91.3598,
199
+ "eval_samples_per_second": 81.392,
200
+ "eval_steps_per_second": 10.18,
201
+ "step": 6000
202
+ },
203
+ {
204
+ "epoch": 1.7482517482517483,
205
+ "grad_norm": 0.0050299325957894325,
206
+ "learning_rate": 8.344988344988347e-06,
207
+ "loss": 0.0282,
208
+ "step": 6500
209
+ },
210
+ {
211
+ "epoch": 1.7482517482517483,
212
+ "eval_accuracy": 0.9795589026358257,
213
+ "eval_loss": 0.10718287527561188,
214
+ "eval_runtime": 63.8039,
215
+ "eval_samples_per_second": 116.545,
216
+ "eval_steps_per_second": 14.576,
217
+ "step": 6500
218
+ },
219
+ {
220
+ "epoch": 1.8827326519634213,
221
+ "grad_norm": 0.1358013153076172,
222
+ "learning_rate": 7.448448986910526e-06,
223
+ "loss": 0.0271,
224
+ "step": 7000
225
+ },
226
+ {
227
+ "epoch": 1.8827326519634213,
228
+ "eval_accuracy": 0.9616729424421732,
229
+ "eval_loss": 0.21952089667320251,
230
+ "eval_runtime": 62.6082,
231
+ "eval_samples_per_second": 118.77,
232
+ "eval_steps_per_second": 14.854,
233
+ "step": 7000
234
+ },
235
+ {
236
+ "epoch": 2.0172135556750943,
237
+ "grad_norm": 0.0005359902861528099,
238
+ "learning_rate": 6.551909628832707e-06,
239
+ "loss": 0.0151,
240
+ "step": 7500
241
+ },
242
+ {
243
+ "epoch": 2.0172135556750943,
244
+ "eval_accuracy": 0.972027972027972,
245
+ "eval_loss": 0.18190746009349823,
246
+ "eval_runtime": 75.7313,
247
+ "eval_samples_per_second": 98.189,
248
+ "eval_steps_per_second": 12.28,
249
+ "step": 7500
250
+ },
251
+ {
252
+ "epoch": 2.151694459386767,
253
+ "grad_norm": 0.0002543228620197624,
254
+ "learning_rate": 5.655370270754886e-06,
255
+ "loss": 0.0124,
256
+ "step": 8000
257
+ },
258
+ {
259
+ "epoch": 2.151694459386767,
260
+ "eval_accuracy": 0.9796933835395374,
261
+ "eval_loss": 0.12187998741865158,
262
+ "eval_runtime": 73.9844,
263
+ "eval_samples_per_second": 100.508,
264
+ "eval_steps_per_second": 12.57,
265
+ "step": 8000
266
+ },
267
+ {
268
+ "epoch": 2.28617536309844,
269
+ "grad_norm": 0.017442911863327026,
270
+ "learning_rate": 4.758830912677067e-06,
271
+ "loss": 0.0084,
272
+ "step": 8500
273
+ },
274
+ {
275
+ "epoch": 2.28617536309844,
276
+ "eval_accuracy": 0.9803657880580957,
277
+ "eval_loss": 0.133524090051651,
278
+ "eval_runtime": 72.5895,
279
+ "eval_samples_per_second": 102.439,
280
+ "eval_steps_per_second": 12.812,
281
+ "step": 8500
282
+ },
283
+ {
284
+ "epoch": 2.420656266810113,
285
+ "grad_norm": 0.00029117995291016996,
286
+ "learning_rate": 3.862291554599247e-06,
287
+ "loss": 0.0045,
288
+ "step": 9000
289
+ },
290
+ {
291
+ "epoch": 2.420656266810113,
292
+ "eval_accuracy": 0.9868208714362561,
293
+ "eval_loss": 0.084413081407547,
294
+ "eval_runtime": 71.5341,
295
+ "eval_samples_per_second": 103.95,
296
+ "eval_steps_per_second": 13.001,
297
+ "step": 9000
298
+ },
299
+ {
300
+ "epoch": 2.555137170521786,
301
+ "grad_norm": 0.0001818160671973601,
302
+ "learning_rate": 2.9657521965214274e-06,
303
+ "loss": 0.0027,
304
+ "step": 9500
305
+ },
306
+ {
307
+ "epoch": 2.555137170521786,
308
+ "eval_accuracy": 0.9838622915545993,
309
+ "eval_loss": 0.1180611401796341,
310
+ "eval_runtime": 72.5206,
311
+ "eval_samples_per_second": 102.536,
312
+ "eval_steps_per_second": 12.824,
313
+ "step": 9500
314
+ },
315
+ {
316
+ "epoch": 2.6896180742334588,
317
+ "grad_norm": 0.00021849782206118107,
318
+ "learning_rate": 2.069212838443608e-06,
319
+ "loss": 0.0061,
320
+ "step": 10000
321
+ },
322
+ {
323
+ "epoch": 2.6896180742334588,
324
+ "eval_accuracy": 0.9842657342657343,
325
+ "eval_loss": 0.11533553153276443,
326
+ "eval_runtime": 72.894,
327
+ "eval_samples_per_second": 102.011,
328
+ "eval_steps_per_second": 12.758,
329
+ "step": 10000
330
+ },
331
+ {
332
+ "epoch": 2.8240989779451318,
333
+ "grad_norm": 0.0002802180533763021,
334
+ "learning_rate": 1.1726734803657882e-06,
335
+ "loss": 0.0026,
336
+ "step": 10500
337
+ },
338
+ {
339
+ "epoch": 2.8240989779451318,
340
+ "eval_accuracy": 0.9905863367401829,
341
+ "eval_loss": 0.06259341537952423,
342
+ "eval_runtime": 72.005,
343
+ "eval_samples_per_second": 103.271,
344
+ "eval_steps_per_second": 12.916,
345
+ "step": 10500
346
+ },
347
+ {
348
+ "epoch": 2.9585798816568047,
349
+ "grad_norm": 0.00011676761641865596,
350
+ "learning_rate": 2.7613412228796843e-07,
351
+ "loss": 0.0008,
352
+ "step": 11000
353
+ },
354
+ {
355
+ "epoch": 2.9585798816568047,
356
+ "eval_accuracy": 0.983593329747176,
357
+ "eval_loss": 0.13157618045806885,
358
+ "eval_runtime": 95.8085,
359
+ "eval_samples_per_second": 77.613,
360
+ "eval_steps_per_second": 9.707,
361
+ "step": 11000
362
+ }
363
+ ],
364
+ "logging_steps": 500,
365
+ "max_steps": 11154,
366
+ "num_input_tokens_seen": 0,
367
+ "num_train_epochs": 3,
368
+ "save_steps": 500,
369
+ "total_flos": 1.8301403760969e+16,
370
+ "train_batch_size": 8,
371
+ "trial_name": null,
372
+ "trial_params": null
373
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfc7e249f4a7941cc06989f6ec1445b35f11da2753e6b814d6ce30cd3a2190a7
3
+ size 5048
vocab.txt ADDED
The diff for this file is too large to render. See raw diff