tarikdincer committed on
Commit
8186ce3
1 Parent(s): 38b33b4

model files are added

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-base",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 2,
27
+ "forced_bos_token_id": 0,
28
+ "forced_eos_token_id": 2,
29
+ "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
+ "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
+ "max_position_embeddings": 1024,
43
+ "model_type": "bart",
44
+ "no_repeat_ngram_size": 3,
45
+ "normalize_before": false,
46
+ "normalize_embedding": true,
47
+ "num_beams": 4,
48
+ "num_hidden_layers": 6,
49
+ "pad_token_id": 1,
50
+ "scale_embedding": false,
51
+ "task_specific_params": {
52
+ "summarization": {
53
+ "length_penalty": 1.0,
54
+ "max_length": 128,
55
+ "min_length": 12,
56
+ "num_beams": 4
57
+ },
58
+ "summarization_cnn": {
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "num_beams": 4
63
+ },
64
+ "summarization_xsum": {
65
+ "length_penalty": 1.0,
66
+ "max_length": 62,
67
+ "min_length": 11,
68
+ "num_beams": 6
69
+ }
70
+ },
71
+ "torch_dtype": "float32",
72
+ "transformers_version": "4.38.2",
73
+ "use_cache": true,
74
+ "vocab_size": 50265
75
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 0,
3
+ "decoder_start_token_id": 2,
4
+ "early_stopping": true,
5
+ "eos_token_id": 2,
6
+ "forced_bos_token_id": 0,
7
+ "forced_eos_token_id": 2,
8
+ "no_repeat_ngram_size": 3,
9
+ "num_beams": 4,
10
+ "pad_token_id": 1,
11
+ "transformers_version": "4.38.2"
12
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc96341a118cd5e143be21b2facdd4ab838539a7bba52af1bcbfed486690c664
3
+ size 557912620
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24165cc88b9488949bb4e552f6b04151e6dd6cdde1fab6dd427e8e5a5d80caab
3
+ size 1115579898
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d97f8b6459373c118b1617234bf9c04bd797322db8c55570f4ebf6dbe844bbb
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac36bc24c0dcde8780c73dbb5e6ec02976f9779e9bcd724560176dc11c0c4e85
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<s>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 1024,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "BartTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
57
+ }
trainer_state.json ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.56,
5
+ "eval_steps": 500,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03,
13
+ "grad_norm": 8.571518898010254,
14
+ "learning_rate": 0.0003965811965811966,
15
+ "loss": 6.0892,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.05,
20
+ "grad_norm": 2.477086067199707,
21
+ "learning_rate": 0.00039316239316239317,
22
+ "loss": 1.4485,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.08,
27
+ "grad_norm": 0.7137540578842163,
28
+ "learning_rate": 0.00038974358974358975,
29
+ "loss": 0.9732,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.1,
34
+ "grad_norm": 0.5362057685852051,
35
+ "learning_rate": 0.0003863247863247863,
36
+ "loss": 0.7804,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.13,
41
+ "grad_norm": 0.6810179948806763,
42
+ "learning_rate": 0.00038290598290598296,
43
+ "loss": 0.665,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.15,
48
+ "grad_norm": 0.4986821115016937,
49
+ "learning_rate": 0.0003794871794871795,
50
+ "loss": 0.6091,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.18,
55
+ "grad_norm": 0.4309682250022888,
56
+ "learning_rate": 0.00037606837606837606,
57
+ "loss": 0.5502,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.2,
62
+ "grad_norm": 0.3824257552623749,
63
+ "learning_rate": 0.0003726495726495727,
64
+ "loss": 0.5164,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.23,
69
+ "grad_norm": 0.3188970386981964,
70
+ "learning_rate": 0.00036923076923076927,
71
+ "loss": 0.4883,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.26,
76
+ "grad_norm": 0.3359103202819824,
77
+ "learning_rate": 0.00036581196581196584,
78
+ "loss": 0.4612,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.28,
83
+ "grad_norm": 0.4327464699745178,
84
+ "learning_rate": 0.0003623931623931624,
85
+ "loss": 0.4351,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.31,
90
+ "grad_norm": 0.4424777030944824,
91
+ "learning_rate": 0.000358974358974359,
92
+ "loss": 0.4217,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.33,
97
+ "grad_norm": 0.520322322845459,
98
+ "learning_rate": 0.00035555555555555557,
99
+ "loss": 0.4076,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.36,
104
+ "grad_norm": 0.48572778701782227,
105
+ "learning_rate": 0.00035213675213675215,
106
+ "loss": 0.3948,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.38,
111
+ "grad_norm": 0.2985605001449585,
112
+ "learning_rate": 0.0003487179487179487,
113
+ "loss": 0.3823,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.41,
118
+ "grad_norm": 0.28738752007484436,
119
+ "learning_rate": 0.00034529914529914536,
120
+ "loss": 0.375,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.44,
125
+ "grad_norm": 0.29423144459724426,
126
+ "learning_rate": 0.0003418803418803419,
127
+ "loss": 0.3591,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.46,
132
+ "grad_norm": 0.26430046558380127,
133
+ "learning_rate": 0.00033846153846153846,
134
+ "loss": 0.3494,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.49,
139
+ "grad_norm": 0.2734215259552002,
140
+ "learning_rate": 0.0003350427350427351,
141
+ "loss": 0.3396,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.51,
146
+ "grad_norm": 0.3005197048187256,
147
+ "learning_rate": 0.00033162393162393166,
148
+ "loss": 0.3352,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.54,
153
+ "grad_norm": 0.2822723686695099,
154
+ "learning_rate": 0.0003282051282051282,
155
+ "loss": 0.3241,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.56,
160
+ "grad_norm": 0.2792316973209381,
161
+ "learning_rate": 0.0003247863247863248,
162
+ "loss": 0.3208,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.59,
167
+ "grad_norm": 0.2761669158935547,
168
+ "learning_rate": 0.0003213675213675214,
169
+ "loss": 0.3148,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.61,
174
+ "grad_norm": 0.2733113467693329,
175
+ "learning_rate": 0.0003179487179487179,
176
+ "loss": 0.311,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.64,
181
+ "grad_norm": 0.3393694758415222,
182
+ "learning_rate": 0.00031452991452991455,
183
+ "loss": 0.3056,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.67,
188
+ "grad_norm": 0.29316985607147217,
189
+ "learning_rate": 0.0003111111111111111,
190
+ "loss": 0.2975,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.69,
195
+ "grad_norm": 0.29134783148765564,
196
+ "learning_rate": 0.0003076923076923077,
197
+ "loss": 0.2898,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.72,
202
+ "grad_norm": 0.41234660148620605,
203
+ "learning_rate": 0.0003042735042735043,
204
+ "loss": 0.2895,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.74,
209
+ "grad_norm": 0.26693716645240784,
210
+ "learning_rate": 0.00030085470085470086,
211
+ "loss": 0.2835,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.77,
216
+ "grad_norm": 0.2862294614315033,
217
+ "learning_rate": 0.00029743589743589743,
218
+ "loss": 0.2747,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.79,
223
+ "grad_norm": 0.2596249282360077,
224
+ "learning_rate": 0.00029401709401709406,
225
+ "loss": 0.2752,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.82,
230
+ "grad_norm": 0.2555866241455078,
231
+ "learning_rate": 0.0002905982905982906,
232
+ "loss": 0.2662,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.84,
237
+ "grad_norm": 0.3845195472240448,
238
+ "learning_rate": 0.0002871794871794872,
239
+ "loss": 0.2625,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.87,
244
+ "grad_norm": 0.23550209403038025,
245
+ "learning_rate": 0.0002837606837606838,
246
+ "loss": 0.256,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.9,
251
+ "grad_norm": 0.2434936910867691,
252
+ "learning_rate": 0.0002803418803418803,
253
+ "loss": 0.2545,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.92,
258
+ "grad_norm": 0.23562268912792206,
259
+ "learning_rate": 0.00027692307692307695,
260
+ "loss": 0.2536,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.95,
265
+ "grad_norm": 0.3110085427761078,
266
+ "learning_rate": 0.0002735042735042735,
267
+ "loss": 0.2497,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.97,
272
+ "grad_norm": 0.2646142244338989,
273
+ "learning_rate": 0.0002700854700854701,
274
+ "loss": 0.2448,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 1.0,
279
+ "grad_norm": 0.22812116146087646,
280
+ "learning_rate": 0.0002666666666666667,
281
+ "loss": 0.2409,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 1.02,
286
+ "grad_norm": 0.21481893956661224,
287
+ "learning_rate": 0.00026324786324786326,
288
+ "loss": 0.225,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 1.05,
293
+ "grad_norm": 0.2561526298522949,
294
+ "learning_rate": 0.00025982905982905983,
295
+ "loss": 0.2194,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 1.08,
300
+ "grad_norm": 0.2297515720129013,
301
+ "learning_rate": 0.00025641025641025646,
302
+ "loss": 0.216,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 1.1,
307
+ "grad_norm": 0.25526463985443115,
308
+ "learning_rate": 0.000252991452991453,
309
+ "loss": 0.2171,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 1.13,
314
+ "grad_norm": 0.24202637374401093,
315
+ "learning_rate": 0.00024957264957264956,
316
+ "loss": 0.2149,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 1.15,
321
+ "grad_norm": 0.20644807815551758,
322
+ "learning_rate": 0.0002461538461538462,
323
+ "loss": 0.209,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 1.18,
328
+ "grad_norm": 0.2795998454093933,
329
+ "learning_rate": 0.00024273504273504272,
330
+ "loss": 0.2071,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 1.2,
335
+ "grad_norm": 0.306149959564209,
336
+ "learning_rate": 0.00023931623931623932,
337
+ "loss": 0.208,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 1.23,
342
+ "grad_norm": 0.2355523407459259,
343
+ "learning_rate": 0.00023589743589743593,
344
+ "loss": 0.2051,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 1.25,
349
+ "grad_norm": 0.2909263074398041,
350
+ "learning_rate": 0.0002324786324786325,
351
+ "loss": 0.2023,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 1.28,
356
+ "grad_norm": 0.5298261642456055,
357
+ "learning_rate": 0.00022905982905982905,
358
+ "loss": 0.2018,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 1.28,
363
+ "eval_cer": 0.9319083335386478,
364
+ "eval_loss": 0.17038685083389282,
365
+ "eval_runtime": 138.0008,
366
+ "eval_samples_per_second": 14.493,
367
+ "eval_steps_per_second": 0.457,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 1.31,
372
+ "grad_norm": 0.23548871278762817,
373
+ "learning_rate": 0.00022564102564102566,
374
+ "loss": 0.2008,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 1.33,
379
+ "grad_norm": 0.20162977278232574,
380
+ "learning_rate": 0.00022222222222222223,
381
+ "loss": 0.1975,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 1.36,
386
+ "grad_norm": 0.2593408524990082,
387
+ "learning_rate": 0.00021880341880341884,
388
+ "loss": 0.1959,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 1.38,
393
+ "grad_norm": 0.21452312171459198,
394
+ "learning_rate": 0.0002153846153846154,
395
+ "loss": 0.194,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 1.41,
400
+ "grad_norm": 0.2637544274330139,
401
+ "learning_rate": 0.000211965811965812,
402
+ "loss": 0.1909,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 1.43,
407
+ "grad_norm": 0.24357128143310547,
408
+ "learning_rate": 0.00020854700854700857,
409
+ "loss": 0.19,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 1.46,
414
+ "grad_norm": 0.2084117978811264,
415
+ "learning_rate": 0.00020512820512820512,
416
+ "loss": 0.1857,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 1.48,
421
+ "grad_norm": 0.23245294392108917,
422
+ "learning_rate": 0.00020170940170940172,
423
+ "loss": 0.1858,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 1.51,
428
+ "grad_norm": 0.23836293816566467,
429
+ "learning_rate": 0.0001982905982905983,
430
+ "loss": 0.183,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 1.54,
435
+ "grad_norm": 0.19184565544128418,
436
+ "learning_rate": 0.00019487179487179487,
437
+ "loss": 0.183,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 1.56,
442
+ "grad_norm": 0.20401564240455627,
443
+ "learning_rate": 0.00019145299145299148,
444
+ "loss": 0.1829,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 1.59,
449
+ "grad_norm": 0.21579188108444214,
450
+ "learning_rate": 0.00018803418803418803,
451
+ "loss": 0.1812,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 1.61,
456
+ "grad_norm": 0.23108145594596863,
457
+ "learning_rate": 0.00018461538461538463,
458
+ "loss": 0.1781,
459
+ "step": 630
460
+ },
461
+ {
462
+ "epoch": 1.64,
463
+ "grad_norm": 0.2311713844537735,
464
+ "learning_rate": 0.0001811965811965812,
465
+ "loss": 0.1755,
466
+ "step": 640
467
+ },
468
+ {
469
+ "epoch": 1.66,
470
+ "grad_norm": 0.19794794917106628,
471
+ "learning_rate": 0.00017777777777777779,
472
+ "loss": 0.1768,
473
+ "step": 650
474
+ },
475
+ {
476
+ "epoch": 1.69,
477
+ "grad_norm": 0.2516119182109833,
478
+ "learning_rate": 0.00017435897435897436,
479
+ "loss": 0.1737,
480
+ "step": 660
481
+ },
482
+ {
483
+ "epoch": 1.72,
484
+ "grad_norm": 0.20975567400455475,
485
+ "learning_rate": 0.00017094017094017094,
486
+ "loss": 0.1712,
487
+ "step": 670
488
+ },
489
+ {
490
+ "epoch": 1.74,
491
+ "grad_norm": 0.22168505191802979,
492
+ "learning_rate": 0.00016752136752136754,
493
+ "loss": 0.1693,
494
+ "step": 680
495
+ },
496
+ {
497
+ "epoch": 1.77,
498
+ "grad_norm": 0.22844062745571136,
499
+ "learning_rate": 0.0001641025641025641,
500
+ "loss": 0.168,
501
+ "step": 690
502
+ },
503
+ {
504
+ "epoch": 1.79,
505
+ "grad_norm": 0.22804197669029236,
506
+ "learning_rate": 0.0001606837606837607,
507
+ "loss": 0.1721,
508
+ "step": 700
509
+ },
510
+ {
511
+ "epoch": 1.82,
512
+ "grad_norm": 0.22620578110218048,
513
+ "learning_rate": 0.00015726495726495727,
514
+ "loss": 0.1703,
515
+ "step": 710
516
+ },
517
+ {
518
+ "epoch": 1.84,
519
+ "grad_norm": 0.21445313096046448,
520
+ "learning_rate": 0.00015384615384615385,
521
+ "loss": 0.1673,
522
+ "step": 720
523
+ },
524
+ {
525
+ "epoch": 1.87,
526
+ "grad_norm": 0.207479327917099,
527
+ "learning_rate": 0.00015042735042735043,
528
+ "loss": 0.1648,
529
+ "step": 730
530
+ },
531
+ {
532
+ "epoch": 1.89,
533
+ "grad_norm": 0.22134087979793549,
534
+ "learning_rate": 0.00014700854700854703,
535
+ "loss": 0.1629,
536
+ "step": 740
537
+ },
538
+ {
539
+ "epoch": 1.92,
540
+ "grad_norm": 0.20121484994888306,
541
+ "learning_rate": 0.0001435897435897436,
542
+ "loss": 0.1638,
543
+ "step": 750
544
+ },
545
+ {
546
+ "epoch": 1.95,
547
+ "grad_norm": 0.2002618908882141,
548
+ "learning_rate": 0.00014017094017094016,
549
+ "loss": 0.1621,
550
+ "step": 760
551
+ },
552
+ {
553
+ "epoch": 1.97,
554
+ "grad_norm": 0.19750453531742096,
555
+ "learning_rate": 0.00013675213675213676,
556
+ "loss": 0.1667,
557
+ "step": 770
558
+ },
559
+ {
560
+ "epoch": 2.0,
561
+ "grad_norm": 0.22286508977413177,
562
+ "learning_rate": 0.00013333333333333334,
563
+ "loss": 0.1642,
564
+ "step": 780
565
+ },
566
+ {
567
+ "epoch": 2.02,
568
+ "grad_norm": 0.21668635308742523,
569
+ "learning_rate": 0.00012991452991452992,
570
+ "loss": 0.1482,
571
+ "step": 790
572
+ },
573
+ {
574
+ "epoch": 2.05,
575
+ "grad_norm": 0.233961820602417,
576
+ "learning_rate": 0.0001264957264957265,
577
+ "loss": 0.1453,
578
+ "step": 800
579
+ },
580
+ {
581
+ "epoch": 2.07,
582
+ "grad_norm": 0.1865084022283554,
583
+ "learning_rate": 0.0001230769230769231,
584
+ "loss": 0.1455,
585
+ "step": 810
586
+ },
587
+ {
588
+ "epoch": 2.1,
589
+ "grad_norm": 0.1853141337633133,
590
+ "learning_rate": 0.00011965811965811966,
591
+ "loss": 0.1442,
592
+ "step": 820
593
+ },
594
+ {
595
+ "epoch": 2.12,
596
+ "grad_norm": 0.17371739447116852,
597
+ "learning_rate": 0.00011623931623931625,
598
+ "loss": 0.1382,
599
+ "step": 830
600
+ },
601
+ {
602
+ "epoch": 2.15,
603
+ "grad_norm": 0.19631154835224152,
604
+ "learning_rate": 0.00011282051282051283,
605
+ "loss": 0.1415,
606
+ "step": 840
607
+ },
608
+ {
609
+ "epoch": 2.18,
610
+ "grad_norm": 0.194850891828537,
611
+ "learning_rate": 0.00010940170940170942,
612
+ "loss": 0.141,
613
+ "step": 850
614
+ },
615
+ {
616
+ "epoch": 2.2,
617
+ "grad_norm": 0.18121449649333954,
618
+ "learning_rate": 0.000105982905982906,
619
+ "loss": 0.1388,
620
+ "step": 860
621
+ },
622
+ {
623
+ "epoch": 2.23,
624
+ "grad_norm": 0.2176773101091385,
625
+ "learning_rate": 0.00010256410256410256,
626
+ "loss": 0.1399,
627
+ "step": 870
628
+ },
629
+ {
630
+ "epoch": 2.25,
631
+ "grad_norm": 0.19013133645057678,
632
+ "learning_rate": 9.914529914529915e-05,
633
+ "loss": 0.137,
634
+ "step": 880
635
+ },
636
+ {
637
+ "epoch": 2.28,
638
+ "grad_norm": 0.22148679196834564,
639
+ "learning_rate": 9.572649572649574e-05,
640
+ "loss": 0.139,
641
+ "step": 890
642
+ },
643
+ {
644
+ "epoch": 2.3,
645
+ "grad_norm": 0.20861493051052094,
646
+ "learning_rate": 9.230769230769232e-05,
647
+ "loss": 0.139,
648
+ "step": 900
649
+ },
650
+ {
651
+ "epoch": 2.33,
652
+ "grad_norm": 0.17541790008544922,
653
+ "learning_rate": 8.888888888888889e-05,
654
+ "loss": 0.1362,
655
+ "step": 910
656
+ },
657
+ {
658
+ "epoch": 2.36,
659
+ "grad_norm": 0.1971459984779358,
660
+ "learning_rate": 8.547008547008547e-05,
661
+ "loss": 0.1346,
662
+ "step": 920
663
+ },
664
+ {
665
+ "epoch": 2.38,
666
+ "grad_norm": 0.20883004367351532,
667
+ "learning_rate": 8.205128205128205e-05,
668
+ "loss": 0.1351,
669
+ "step": 930
670
+ },
671
+ {
672
+ "epoch": 2.41,
673
+ "grad_norm": 0.18058577179908752,
674
+ "learning_rate": 7.863247863247864e-05,
675
+ "loss": 0.1363,
676
+ "step": 940
677
+ },
678
+ {
679
+ "epoch": 2.43,
680
+ "grad_norm": 0.19193512201309204,
681
+ "learning_rate": 7.521367521367521e-05,
682
+ "loss": 0.1359,
683
+ "step": 950
684
+ },
685
+ {
686
+ "epoch": 2.46,
687
+ "grad_norm": 0.17777132987976074,
688
+ "learning_rate": 7.17948717948718e-05,
689
+ "loss": 0.1363,
690
+ "step": 960
691
+ },
692
+ {
693
+ "epoch": 2.48,
694
+ "grad_norm": 0.1730206310749054,
695
+ "learning_rate": 6.837606837606838e-05,
696
+ "loss": 0.1339,
697
+ "step": 970
698
+ },
699
+ {
700
+ "epoch": 2.51,
701
+ "grad_norm": 0.172698512673378,
702
+ "learning_rate": 6.495726495726496e-05,
703
+ "loss": 0.1317,
704
+ "step": 980
705
+ },
706
+ {
707
+ "epoch": 2.53,
708
+ "grad_norm": 0.1746242642402649,
709
+ "learning_rate": 6.153846153846155e-05,
710
+ "loss": 0.132,
711
+ "step": 990
712
+ },
713
+ {
714
+ "epoch": 2.56,
715
+ "grad_norm": 0.1631608009338379,
716
+ "learning_rate": 5.8119658119658126e-05,
717
+ "loss": 0.1292,
718
+ "step": 1000
719
+ },
720
+ {
721
+ "epoch": 2.56,
722
+ "eval_cer": 0.9320702386692806,
723
+ "eval_loss": 0.11450555920600891,
724
+ "eval_runtime": 136.638,
725
+ "eval_samples_per_second": 14.637,
726
+ "eval_steps_per_second": 0.461,
727
+ "step": 1000
728
+ }
729
+ ],
730
+ "logging_steps": 10,
731
+ "max_steps": 1170,
732
+ "num_input_tokens_seen": 0,
733
+ "num_train_epochs": 3,
734
+ "save_steps": 500,
735
+ "total_flos": 6.737241075941376e+16,
736
+ "train_batch_size": 8,
737
+ "trial_name": null,
738
+ "trial_params": null
739
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c287b31111cf124ac69c0e9b53bdaf288743eceab9254160487882b78a64250
3
+ size 5048
vocab.json ADDED
The diff for this file is too large to render. See raw diff