pawlowskipawel commited on
Commit
379dc1c
1 Parent(s): 8dbf3d5

Upload 13 files

Browse files
config.json ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": "5c48f939de25655eeca55d31e7893ada48d300d9",
3
+ "_name_or_path": "microsoft/trocr-base-stage1",
4
+ "architectures": [
5
+ "VisionEncoderDecoderModel"
6
+ ],
7
+ "decoder": {
8
+ "_name_or_path": "",
9
+ "activation_dropout": 0.0,
10
+ "activation_function": "relu",
11
+ "add_cross_attention": true,
12
+ "architectures": null,
13
+ "attention_dropout": 0.0,
14
+ "bad_words_ids": null,
15
+ "begin_suppress_tokens": null,
16
+ "bos_token_id": 0,
17
+ "chunk_size_feed_forward": 0,
18
+ "classifier_dropout": 0.0,
19
+ "cross_attention_hidden_size": 768,
20
+ "d_model": 1024,
21
+ "decoder_attention_heads": 16,
22
+ "decoder_ffn_dim": 4096,
23
+ "decoder_layerdrop": 0.0,
24
+ "decoder_layers": 12,
25
+ "decoder_start_token_id": 2,
26
+ "diversity_penalty": 0.0,
27
+ "do_sample": false,
28
+ "dropout": 0.1,
29
+ "early_stopping": false,
30
+ "encoder_no_repeat_ngram_size": 0,
31
+ "eos_token_id": 2,
32
+ "exponential_decay_length_penalty": null,
33
+ "finetuning_task": null,
34
+ "forced_bos_token_id": null,
35
+ "forced_eos_token_id": null,
36
+ "id2label": {
37
+ "0": "LABEL_0",
38
+ "1": "LABEL_1"
39
+ },
40
+ "init_std": 0.02,
41
+ "is_decoder": true,
42
+ "is_encoder_decoder": false,
43
+ "label2id": {
44
+ "LABEL_0": 0,
45
+ "LABEL_1": 1
46
+ },
47
+ "layernorm_embedding": false,
48
+ "length_penalty": 1.0,
49
+ "max_length": 20,
50
+ "max_position_embeddings": 1024,
51
+ "min_length": 0,
52
+ "model_type": "trocr",
53
+ "no_repeat_ngram_size": 0,
54
+ "num_beam_groups": 1,
55
+ "num_beams": 1,
56
+ "num_return_sequences": 1,
57
+ "output_attentions": false,
58
+ "output_hidden_states": false,
59
+ "output_scores": false,
60
+ "pad_token_id": 1,
61
+ "prefix": null,
62
+ "problem_type": null,
63
+ "pruned_heads": {},
64
+ "remove_invalid_values": false,
65
+ "repetition_penalty": 1.0,
66
+ "return_dict": true,
67
+ "return_dict_in_generate": false,
68
+ "scale_embedding": true,
69
+ "sep_token_id": null,
70
+ "suppress_tokens": null,
71
+ "task_specific_params": null,
72
+ "temperature": 1.0,
73
+ "tf_legacy_loss": false,
74
+ "tie_encoder_decoder": false,
75
+ "tie_word_embeddings": false,
76
+ "tokenizer_class": null,
77
+ "top_k": 50,
78
+ "top_p": 1.0,
79
+ "torch_dtype": null,
80
+ "torchscript": false,
81
+ "transformers_version": "4.24.0",
82
+ "typical_p": 1.0,
83
+ "use_bfloat16": false,
84
+ "use_cache": false,
85
+ "use_learned_position_embeddings": false,
86
+ "vocab_size": 50265
87
+ },
88
+ "decoder_start_token_id": 0,
89
+ "early_stopping": true,
90
+ "encoder": {
91
+ "_name_or_path": "",
92
+ "add_cross_attention": false,
93
+ "architectures": null,
94
+ "attention_probs_dropout_prob": 0.0,
95
+ "bad_words_ids": null,
96
+ "begin_suppress_tokens": null,
97
+ "bos_token_id": null,
98
+ "chunk_size_feed_forward": 0,
99
+ "cross_attention_hidden_size": null,
100
+ "decoder_start_token_id": null,
101
+ "diversity_penalty": 0.0,
102
+ "do_sample": false,
103
+ "early_stopping": false,
104
+ "encoder_no_repeat_ngram_size": 0,
105
+ "encoder_stride": 16,
106
+ "eos_token_id": null,
107
+ "exponential_decay_length_penalty": null,
108
+ "finetuning_task": null,
109
+ "forced_bos_token_id": null,
110
+ "forced_eos_token_id": null,
111
+ "hidden_act": "gelu",
112
+ "hidden_dropout_prob": 0.0,
113
+ "hidden_size": 768,
114
+ "id2label": {
115
+ "0": "LABEL_0",
116
+ "1": "LABEL_1"
117
+ },
118
+ "image_size": 384,
119
+ "initializer_range": 0.02,
120
+ "intermediate_size": 3072,
121
+ "is_decoder": false,
122
+ "is_encoder_decoder": false,
123
+ "label2id": {
124
+ "LABEL_0": 0,
125
+ "LABEL_1": 1
126
+ },
127
+ "layer_norm_eps": 1e-12,
128
+ "length_penalty": 1.0,
129
+ "max_length": 20,
130
+ "min_length": 0,
131
+ "model_type": "vit",
132
+ "no_repeat_ngram_size": 0,
133
+ "num_attention_heads": 12,
134
+ "num_beam_groups": 1,
135
+ "num_beams": 1,
136
+ "num_channels": 3,
137
+ "num_hidden_layers": 12,
138
+ "num_return_sequences": 1,
139
+ "output_attentions": false,
140
+ "output_hidden_states": false,
141
+ "output_scores": false,
142
+ "pad_token_id": null,
143
+ "patch_size": 16,
144
+ "prefix": null,
145
+ "problem_type": null,
146
+ "pruned_heads": {},
147
+ "qkv_bias": false,
148
+ "remove_invalid_values": false,
149
+ "repetition_penalty": 1.0,
150
+ "return_dict": true,
151
+ "return_dict_in_generate": false,
152
+ "sep_token_id": null,
153
+ "suppress_tokens": null,
154
+ "task_specific_params": null,
155
+ "temperature": 1.0,
156
+ "tf_legacy_loss": false,
157
+ "tie_encoder_decoder": false,
158
+ "tie_word_embeddings": true,
159
+ "tokenizer_class": null,
160
+ "top_k": 50,
161
+ "top_p": 1.0,
162
+ "torch_dtype": null,
163
+ "torchscript": false,
164
+ "transformers_version": "4.24.0",
165
+ "typical_p": 1.0,
166
+ "use_bfloat16": false
167
+ },
168
+ "eos_token_id": 2,
169
+ "is_encoder_decoder": true,
170
+ "length_penalty": 2.0,
171
+ "max_length": 36,
172
+ "model_type": "vision-encoder-decoder",
173
+ "no_repeat_ngram_size": 3,
174
+ "num_beams": 4,
175
+ "pad_token_id": 1,
176
+ "tie_word_embeddings": false,
177
+ "torch_dtype": "float32",
178
+ "transformers_version": null,
179
+ "vocab_size": 50265
180
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4c97f69ee41290bcc71e70f72fda1d95cdedccc1bf46b05a540444718ff9355
3
+ size 3074473541
preprocessor_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_resize": true,
4
+ "feature_extractor_type": "ViTFeatureExtractor",
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_std": [
11
+ 0.5,
12
+ 0.5,
13
+ 0.5
14
+ ],
15
+ "processor_class": "TrOCRProcessor",
16
+ "resample": 2,
17
+ "size": 384
18
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48c30f4b4660ec1b6bdbdb2cf35e362d3102e597cc9993f2b16fc991a2a5ffff
3
+ size 1539625673
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a4aee5adbfab97a9adfd8580e4c853534a1a67e63c8d9454230544a7727c74e
3
+ size 14575
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed373d8b695820c18a06c1f883fea8fc9d8e11cafc54bdfc114c75360d01e57d
3
+ size 627
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "cls_token": {
12
+ "__type": "AddedToken",
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eos_token": {
20
+ "__type": "AddedToken",
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "errors": "replace",
28
+ "mask_token": {
29
+ "__type": "AddedToken",
30
+ "content": "<mask>",
31
+ "lstrip": true,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false
35
+ },
36
+ "model_max_length": 512,
37
+ "name_or_path": "microsoft/trocr-base-stage1",
38
+ "pad_token": {
39
+ "__type": "AddedToken",
40
+ "content": "<pad>",
41
+ "lstrip": false,
42
+ "normalized": true,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "processor_class": "TrOCRProcessor",
47
+ "sep_token": {
48
+ "__type": "AddedToken",
49
+ "content": "</s>",
50
+ "lstrip": false,
51
+ "normalized": true,
52
+ "rstrip": false,
53
+ "single_word": false
54
+ },
55
+ "special_tokens_map_file": null,
56
+ "tokenizer_class": "RobertaTokenizer",
57
+ "trim_offsets": true,
58
+ "unk_token": {
59
+ "__type": "AddedToken",
60
+ "content": "<unk>",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ }
66
+ }
trainer_state.json ADDED
@@ -0,0 +1,724 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.004195804195804196,
3
+ "best_model_checkpoint": "custom_model_stage1_base_concat_synth/checkpoint-900",
4
+ "epoch": 2.9605263157894735,
5
+ "global_step": 900,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 4.9945175438596495e-05,
13
+ "loss": 9.5118,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.03,
18
+ "learning_rate": 4.9451754385964915e-05,
19
+ "loss": 3.2727,
20
+ "step": 10
21
+ },
22
+ {
23
+ "epoch": 0.07,
24
+ "learning_rate": 4.890350877192983e-05,
25
+ "loss": 2.6133,
26
+ "step": 20
27
+ },
28
+ {
29
+ "epoch": 0.1,
30
+ "learning_rate": 4.8355263157894734e-05,
31
+ "loss": 3.1323,
32
+ "step": 30
33
+ },
34
+ {
35
+ "epoch": 0.13,
36
+ "learning_rate": 4.780701754385965e-05,
37
+ "loss": 2.6306,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.16,
42
+ "learning_rate": 4.7258771929824566e-05,
43
+ "loss": 2.9819,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.16,
48
+ "eval_cer": 0.3053536021150033,
49
+ "eval_loss": 1.6067169904708862,
50
+ "eval_runtime": 22.608,
51
+ "eval_samples_per_second": 6.281,
52
+ "eval_steps_per_second": 0.796,
53
+ "step": 50
54
+ },
55
+ {
56
+ "epoch": 0.2,
57
+ "learning_rate": 4.671052631578948e-05,
58
+ "loss": 2.3285,
59
+ "step": 60
60
+ },
61
+ {
62
+ "epoch": 0.23,
63
+ "learning_rate": 4.616228070175439e-05,
64
+ "loss": 1.9334,
65
+ "step": 70
66
+ },
67
+ {
68
+ "epoch": 0.26,
69
+ "learning_rate": 4.56140350877193e-05,
70
+ "loss": 1.9042,
71
+ "step": 80
72
+ },
73
+ {
74
+ "epoch": 0.3,
75
+ "learning_rate": 4.506578947368421e-05,
76
+ "loss": 2.0802,
77
+ "step": 90
78
+ },
79
+ {
80
+ "epoch": 0.33,
81
+ "learning_rate": 4.451754385964912e-05,
82
+ "loss": 2.144,
83
+ "step": 100
84
+ },
85
+ {
86
+ "epoch": 0.33,
87
+ "eval_cer": 0.08821502412129566,
88
+ "eval_loss": 0.9317633509635925,
89
+ "eval_runtime": 27.231,
90
+ "eval_samples_per_second": 5.215,
91
+ "eval_steps_per_second": 0.661,
92
+ "step": 100
93
+ },
94
+ {
95
+ "epoch": 0.36,
96
+ "learning_rate": 4.3969298245614036e-05,
97
+ "loss": 1.7753,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 0.39,
102
+ "learning_rate": 4.342105263157895e-05,
103
+ "loss": 1.9885,
104
+ "step": 120
105
+ },
106
+ {
107
+ "epoch": 0.43,
108
+ "learning_rate": 4.287280701754386e-05,
109
+ "loss": 1.8163,
110
+ "step": 130
111
+ },
112
+ {
113
+ "epoch": 0.46,
114
+ "learning_rate": 4.2324561403508774e-05,
115
+ "loss": 1.8354,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 0.49,
120
+ "learning_rate": 4.177631578947369e-05,
121
+ "loss": 2.1775,
122
+ "step": 150
123
+ },
124
+ {
125
+ "epoch": 0.49,
126
+ "eval_cer": 0.05823293172690763,
127
+ "eval_loss": 0.8989996314048767,
128
+ "eval_runtime": 24.7772,
129
+ "eval_samples_per_second": 5.731,
130
+ "eval_steps_per_second": 0.726,
131
+ "step": 150
132
+ },
133
+ {
134
+ "epoch": 0.53,
135
+ "learning_rate": 4.12280701754386e-05,
136
+ "loss": 2.0764,
137
+ "step": 160
138
+ },
139
+ {
140
+ "epoch": 0.56,
141
+ "learning_rate": 4.067982456140351e-05,
142
+ "loss": 2.1435,
143
+ "step": 170
144
+ },
145
+ {
146
+ "epoch": 0.59,
147
+ "learning_rate": 4.0131578947368425e-05,
148
+ "loss": 1.9279,
149
+ "step": 180
150
+ },
151
+ {
152
+ "epoch": 0.62,
153
+ "learning_rate": 3.958333333333333e-05,
154
+ "loss": 1.6143,
155
+ "step": 190
156
+ },
157
+ {
158
+ "epoch": 0.66,
159
+ "learning_rate": 3.9035087719298244e-05,
160
+ "loss": 1.6676,
161
+ "step": 200
162
+ },
163
+ {
164
+ "epoch": 0.66,
165
+ "eval_cer": 0.05952380952380952,
166
+ "eval_loss": 0.6604741215705872,
167
+ "eval_runtime": 25.9554,
168
+ "eval_samples_per_second": 5.471,
169
+ "eval_steps_per_second": 0.693,
170
+ "step": 200
171
+ },
172
+ {
173
+ "epoch": 0.69,
174
+ "learning_rate": 3.848684210526316e-05,
175
+ "loss": 1.5091,
176
+ "step": 210
177
+ },
178
+ {
179
+ "epoch": 0.72,
180
+ "learning_rate": 3.7938596491228076e-05,
181
+ "loss": 1.5469,
182
+ "step": 220
183
+ },
184
+ {
185
+ "epoch": 0.76,
186
+ "learning_rate": 3.739035087719299e-05,
187
+ "loss": 1.7522,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 0.79,
192
+ "learning_rate": 3.6842105263157895e-05,
193
+ "loss": 1.6574,
194
+ "step": 240
195
+ },
196
+ {
197
+ "epoch": 0.82,
198
+ "learning_rate": 3.629385964912281e-05,
199
+ "loss": 1.2664,
200
+ "step": 250
201
+ },
202
+ {
203
+ "epoch": 0.82,
204
+ "eval_cer": 0.019313304721030045,
205
+ "eval_loss": 0.6368018388748169,
206
+ "eval_runtime": 26.7506,
207
+ "eval_samples_per_second": 5.308,
208
+ "eval_steps_per_second": 0.673,
209
+ "step": 250
210
+ },
211
+ {
212
+ "epoch": 0.86,
213
+ "learning_rate": 3.574561403508772e-05,
214
+ "loss": 1.481,
215
+ "step": 260
216
+ },
217
+ {
218
+ "epoch": 0.89,
219
+ "learning_rate": 3.519736842105263e-05,
220
+ "loss": 1.404,
221
+ "step": 270
222
+ },
223
+ {
224
+ "epoch": 0.92,
225
+ "learning_rate": 3.4649122807017546e-05,
226
+ "loss": 1.2388,
227
+ "step": 280
228
+ },
229
+ {
230
+ "epoch": 0.95,
231
+ "learning_rate": 3.410087719298246e-05,
232
+ "loss": 1.6538,
233
+ "step": 290
234
+ },
235
+ {
236
+ "epoch": 0.99,
237
+ "learning_rate": 3.355263157894737e-05,
238
+ "loss": 1.2714,
239
+ "step": 300
240
+ },
241
+ {
242
+ "epoch": 0.99,
243
+ "eval_cer": 0.04748982360922659,
244
+ "eval_loss": 0.6383101344108582,
245
+ "eval_runtime": 27.9443,
246
+ "eval_samples_per_second": 5.082,
247
+ "eval_steps_per_second": 0.644,
248
+ "step": 300
249
+ },
250
+ {
251
+ "epoch": 1.02,
252
+ "learning_rate": 3.3004385964912284e-05,
253
+ "loss": 1.2309,
254
+ "step": 310
255
+ },
256
+ {
257
+ "epoch": 1.05,
258
+ "learning_rate": 3.24561403508772e-05,
259
+ "loss": 1.2347,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 1.09,
264
+ "learning_rate": 3.190789473684211e-05,
265
+ "loss": 1.1184,
266
+ "step": 330
267
+ },
268
+ {
269
+ "epoch": 1.12,
270
+ "learning_rate": 3.1359649122807015e-05,
271
+ "loss": 1.2385,
272
+ "step": 340
273
+ },
274
+ {
275
+ "epoch": 1.15,
276
+ "learning_rate": 3.081140350877193e-05,
277
+ "loss": 0.9966,
278
+ "step": 350
279
+ },
280
+ {
281
+ "epoch": 1.15,
282
+ "eval_cer": 0.025748086290883786,
283
+ "eval_loss": 0.5416922569274902,
284
+ "eval_runtime": 27.2235,
285
+ "eval_samples_per_second": 5.216,
286
+ "eval_steps_per_second": 0.661,
287
+ "step": 350
288
+ },
289
+ {
290
+ "epoch": 1.18,
291
+ "learning_rate": 3.0263157894736844e-05,
292
+ "loss": 0.9694,
293
+ "step": 360
294
+ },
295
+ {
296
+ "epoch": 1.22,
297
+ "learning_rate": 2.9714912280701757e-05,
298
+ "loss": 1.0465,
299
+ "step": 370
300
+ },
301
+ {
302
+ "epoch": 1.25,
303
+ "learning_rate": 2.916666666666667e-05,
304
+ "loss": 0.9811,
305
+ "step": 380
306
+ },
307
+ {
308
+ "epoch": 1.28,
309
+ "learning_rate": 2.861842105263158e-05,
310
+ "loss": 1.0744,
311
+ "step": 390
312
+ },
313
+ {
314
+ "epoch": 1.32,
315
+ "learning_rate": 2.8070175438596492e-05,
316
+ "loss": 1.0068,
317
+ "step": 400
318
+ },
319
+ {
320
+ "epoch": 1.32,
321
+ "eval_cer": 0.02813127930341594,
322
+ "eval_loss": 0.4663618803024292,
323
+ "eval_runtime": 28.8768,
324
+ "eval_samples_per_second": 4.917,
325
+ "eval_steps_per_second": 0.623,
326
+ "step": 400
327
+ },
328
+ {
329
+ "epoch": 1.35,
330
+ "learning_rate": 2.7521929824561405e-05,
331
+ "loss": 1.1744,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 1.38,
336
+ "learning_rate": 2.6973684210526317e-05,
337
+ "loss": 1.0358,
338
+ "step": 420
339
+ },
340
+ {
341
+ "epoch": 1.41,
342
+ "learning_rate": 2.642543859649123e-05,
343
+ "loss": 0.8434,
344
+ "step": 430
345
+ },
346
+ {
347
+ "epoch": 1.45,
348
+ "learning_rate": 2.5877192982456143e-05,
349
+ "loss": 0.901,
350
+ "step": 440
351
+ },
352
+ {
353
+ "epoch": 1.48,
354
+ "learning_rate": 2.5328947368421052e-05,
355
+ "loss": 1.1405,
356
+ "step": 450
357
+ },
358
+ {
359
+ "epoch": 1.48,
360
+ "eval_cer": 0.020134228187919462,
361
+ "eval_loss": 0.5748289823532104,
362
+ "eval_runtime": 28.573,
363
+ "eval_samples_per_second": 4.97,
364
+ "eval_steps_per_second": 0.63,
365
+ "step": 450
366
+ },
367
+ {
368
+ "epoch": 1.51,
369
+ "learning_rate": 2.4780701754385965e-05,
370
+ "loss": 0.9414,
371
+ "step": 460
372
+ },
373
+ {
374
+ "epoch": 1.55,
375
+ "learning_rate": 2.4232456140350878e-05,
376
+ "loss": 1.1748,
377
+ "step": 470
378
+ },
379
+ {
380
+ "epoch": 1.58,
381
+ "learning_rate": 2.368421052631579e-05,
382
+ "loss": 1.0134,
383
+ "step": 480
384
+ },
385
+ {
386
+ "epoch": 1.61,
387
+ "learning_rate": 2.3135964912280703e-05,
388
+ "loss": 1.0102,
389
+ "step": 490
390
+ },
391
+ {
392
+ "epoch": 1.64,
393
+ "learning_rate": 2.2587719298245616e-05,
394
+ "loss": 0.9627,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 1.64,
399
+ "eval_cer": 0.02203856749311295,
400
+ "eval_loss": 0.4435074031352997,
401
+ "eval_runtime": 27.4558,
402
+ "eval_samples_per_second": 5.172,
403
+ "eval_steps_per_second": 0.656,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 1.68,
408
+ "learning_rate": 2.2039473684210525e-05,
409
+ "loss": 1.0858,
410
+ "step": 510
411
+ },
412
+ {
413
+ "epoch": 1.71,
414
+ "learning_rate": 2.149122807017544e-05,
415
+ "loss": 0.8016,
416
+ "step": 520
417
+ },
418
+ {
419
+ "epoch": 1.74,
420
+ "learning_rate": 2.0942982456140354e-05,
421
+ "loss": 0.813,
422
+ "step": 530
423
+ },
424
+ {
425
+ "epoch": 1.78,
426
+ "learning_rate": 2.0394736842105264e-05,
427
+ "loss": 0.792,
428
+ "step": 540
429
+ },
430
+ {
431
+ "epoch": 1.81,
432
+ "learning_rate": 1.9846491228070176e-05,
433
+ "loss": 0.8551,
434
+ "step": 550
435
+ },
436
+ {
437
+ "epoch": 1.81,
438
+ "eval_cer": 0.0291583830351226,
439
+ "eval_loss": 0.5917633771896362,
440
+ "eval_runtime": 30.3337,
441
+ "eval_samples_per_second": 4.681,
442
+ "eval_steps_per_second": 0.593,
443
+ "step": 550
444
+ },
445
+ {
446
+ "epoch": 1.84,
447
+ "learning_rate": 1.929824561403509e-05,
448
+ "loss": 0.8459,
449
+ "step": 560
450
+ },
451
+ {
452
+ "epoch": 1.88,
453
+ "learning_rate": 1.8750000000000002e-05,
454
+ "loss": 0.8129,
455
+ "step": 570
456
+ },
457
+ {
458
+ "epoch": 1.91,
459
+ "learning_rate": 1.8201754385964914e-05,
460
+ "loss": 0.8238,
461
+ "step": 580
462
+ },
463
+ {
464
+ "epoch": 1.94,
465
+ "learning_rate": 1.7653508771929824e-05,
466
+ "loss": 0.9552,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 1.97,
471
+ "learning_rate": 1.7105263157894737e-05,
472
+ "loss": 0.6585,
473
+ "step": 600
474
+ },
475
+ {
476
+ "epoch": 1.97,
477
+ "eval_cer": 0.01694915254237288,
478
+ "eval_loss": 0.4142173230648041,
479
+ "eval_runtime": 28.0204,
480
+ "eval_samples_per_second": 5.068,
481
+ "eval_steps_per_second": 0.642,
482
+ "step": 600
483
+ },
484
+ {
485
+ "epoch": 2.01,
486
+ "learning_rate": 1.655701754385965e-05,
487
+ "loss": 0.6229,
488
+ "step": 610
489
+ },
490
+ {
491
+ "epoch": 2.04,
492
+ "learning_rate": 1.6008771929824562e-05,
493
+ "loss": 0.8082,
494
+ "step": 620
495
+ },
496
+ {
497
+ "epoch": 2.07,
498
+ "learning_rate": 1.5460526315789475e-05,
499
+ "loss": 0.751,
500
+ "step": 630
501
+ },
502
+ {
503
+ "epoch": 2.11,
504
+ "learning_rate": 1.4912280701754386e-05,
505
+ "loss": 0.6503,
506
+ "step": 640
507
+ },
508
+ {
509
+ "epoch": 2.14,
510
+ "learning_rate": 1.4364035087719299e-05,
511
+ "loss": 0.6518,
512
+ "step": 650
513
+ },
514
+ {
515
+ "epoch": 2.14,
516
+ "eval_cer": 0.008419689119170985,
517
+ "eval_loss": 0.3558412492275238,
518
+ "eval_runtime": 28.5056,
519
+ "eval_samples_per_second": 4.981,
520
+ "eval_steps_per_second": 0.631,
521
+ "step": 650
522
+ },
523
+ {
524
+ "epoch": 2.17,
525
+ "learning_rate": 1.3815789473684213e-05,
526
+ "loss": 0.6165,
527
+ "step": 660
528
+ },
529
+ {
530
+ "epoch": 2.2,
531
+ "learning_rate": 1.3267543859649122e-05,
532
+ "loss": 0.7267,
533
+ "step": 670
534
+ },
535
+ {
536
+ "epoch": 2.24,
537
+ "learning_rate": 1.2719298245614037e-05,
538
+ "loss": 0.8219,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 2.27,
543
+ "learning_rate": 1.2171052631578948e-05,
544
+ "loss": 0.7055,
545
+ "step": 690
546
+ },
547
+ {
548
+ "epoch": 2.3,
549
+ "learning_rate": 1.162280701754386e-05,
550
+ "loss": 0.5255,
551
+ "step": 700
552
+ },
553
+ {
554
+ "epoch": 2.3,
555
+ "eval_cer": 0.0047879616963064295,
556
+ "eval_loss": 0.3521033823490143,
557
+ "eval_runtime": 27.0564,
558
+ "eval_samples_per_second": 5.248,
559
+ "eval_steps_per_second": 0.665,
560
+ "step": 700
561
+ },
562
+ {
563
+ "epoch": 2.34,
564
+ "learning_rate": 1.1074561403508772e-05,
565
+ "loss": 0.7562,
566
+ "step": 710
567
+ },
568
+ {
569
+ "epoch": 2.37,
570
+ "learning_rate": 1.0526315789473684e-05,
571
+ "loss": 0.6348,
572
+ "step": 720
573
+ },
574
+ {
575
+ "epoch": 2.4,
576
+ "learning_rate": 9.978070175438597e-06,
577
+ "loss": 0.7211,
578
+ "step": 730
579
+ },
580
+ {
581
+ "epoch": 2.43,
582
+ "learning_rate": 9.42982456140351e-06,
583
+ "loss": 0.488,
584
+ "step": 740
585
+ },
586
+ {
587
+ "epoch": 2.47,
588
+ "learning_rate": 8.881578947368421e-06,
589
+ "loss": 0.469,
590
+ "step": 750
591
+ },
592
+ {
593
+ "epoch": 2.47,
594
+ "eval_cer": 0.004864489228630994,
595
+ "eval_loss": 0.32997873425483704,
596
+ "eval_runtime": 26.4415,
597
+ "eval_samples_per_second": 5.37,
598
+ "eval_steps_per_second": 0.681,
599
+ "step": 750
600
+ },
601
+ {
602
+ "epoch": 2.5,
603
+ "learning_rate": 8.333333333333334e-06,
604
+ "loss": 0.4705,
605
+ "step": 760
606
+ },
607
+ {
608
+ "epoch": 2.53,
609
+ "learning_rate": 7.785087719298246e-06,
610
+ "loss": 0.5315,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 2.57,
615
+ "learning_rate": 7.236842105263158e-06,
616
+ "loss": 0.4967,
617
+ "step": 780
618
+ },
619
+ {
620
+ "epoch": 2.6,
621
+ "learning_rate": 6.68859649122807e-06,
622
+ "loss": 0.6951,
623
+ "step": 790
624
+ },
625
+ {
626
+ "epoch": 2.63,
627
+ "learning_rate": 6.140350877192982e-06,
628
+ "loss": 0.434,
629
+ "step": 800
630
+ },
631
+ {
632
+ "epoch": 2.63,
633
+ "eval_cer": 0.005936675461741424,
634
+ "eval_loss": 0.3225802481174469,
635
+ "eval_runtime": 27.8587,
636
+ "eval_samples_per_second": 5.097,
637
+ "eval_steps_per_second": 0.646,
638
+ "step": 800
639
+ },
640
+ {
641
+ "epoch": 2.66,
642
+ "learning_rate": 5.592105263157895e-06,
643
+ "loss": 0.4898,
644
+ "step": 810
645
+ },
646
+ {
647
+ "epoch": 2.7,
648
+ "learning_rate": 5.043859649122807e-06,
649
+ "loss": 0.8127,
650
+ "step": 820
651
+ },
652
+ {
653
+ "epoch": 2.73,
654
+ "learning_rate": 4.4956140350877196e-06,
655
+ "loss": 0.5916,
656
+ "step": 830
657
+ },
658
+ {
659
+ "epoch": 2.76,
660
+ "learning_rate": 3.9473684210526315e-06,
661
+ "loss": 0.3977,
662
+ "step": 840
663
+ },
664
+ {
665
+ "epoch": 2.8,
666
+ "learning_rate": 3.399122807017544e-06,
667
+ "loss": 0.5842,
668
+ "step": 850
669
+ },
670
+ {
671
+ "epoch": 2.8,
672
+ "eval_cer": 0.006997900629811057,
673
+ "eval_loss": 0.3334667980670929,
674
+ "eval_runtime": 26.2691,
675
+ "eval_samples_per_second": 5.406,
676
+ "eval_steps_per_second": 0.685,
677
+ "step": 850
678
+ },
679
+ {
680
+ "epoch": 2.83,
681
+ "learning_rate": 2.850877192982456e-06,
682
+ "loss": 0.4293,
683
+ "step": 860
684
+ },
685
+ {
686
+ "epoch": 2.86,
687
+ "learning_rate": 2.3026315789473684e-06,
688
+ "loss": 0.4953,
689
+ "step": 870
690
+ },
691
+ {
692
+ "epoch": 2.89,
693
+ "learning_rate": 1.7543859649122807e-06,
694
+ "loss": 0.6281,
695
+ "step": 880
696
+ },
697
+ {
698
+ "epoch": 2.93,
699
+ "learning_rate": 1.206140350877193e-06,
700
+ "loss": 0.5186,
701
+ "step": 890
702
+ },
703
+ {
704
+ "epoch": 2.96,
705
+ "learning_rate": 6.578947368421053e-07,
706
+ "loss": 0.5946,
707
+ "step": 900
708
+ },
709
+ {
710
+ "epoch": 2.96,
711
+ "eval_cer": 0.004195804195804196,
712
+ "eval_loss": 0.3096241056919098,
713
+ "eval_runtime": 27.0145,
714
+ "eval_samples_per_second": 5.256,
715
+ "eval_steps_per_second": 0.666,
716
+ "step": 900
717
+ }
718
+ ],
719
+ "max_steps": 912,
720
+ "num_train_epochs": 3,
721
+ "total_flos": 6.369477453856899e+18,
722
+ "trial_name": null,
723
+ "trial_params": null
724
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44034dde030d525d7efd97f406c7a9cee13eb34247467fc93ccac0a3f2acca6f
3
+ size 3515
vocab.json ADDED
The diff for this file is too large to render. See raw diff