davelotito commited on
Commit
437550f
1 Parent(s): d2c1c90

Training in progress, epoch 1

Browse files
config.json ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "naver-clova-ix/donut-base",
3
+ "architectures": [
4
+ "VisionEncoderDecoderModel"
5
+ ],
6
+ "decoder": {
7
+ "_name_or_path": "",
8
+ "activation_dropout": 0.0,
9
+ "activation_function": "gelu",
10
+ "add_cross_attention": true,
11
+ "add_final_layer_norm": true,
12
+ "architectures": null,
13
+ "attention_dropout": 0.0,
14
+ "bad_words_ids": null,
15
+ "begin_suppress_tokens": null,
16
+ "bos_token_id": 0,
17
+ "chunk_size_feed_forward": 0,
18
+ "classifier_dropout": 0.0,
19
+ "cross_attention_hidden_size": null,
20
+ "d_model": 1024,
21
+ "decoder_attention_heads": 16,
22
+ "decoder_ffn_dim": 4096,
23
+ "decoder_layerdrop": 0.0,
24
+ "decoder_layers": 4,
25
+ "decoder_start_token_id": null,
26
+ "diversity_penalty": 0.0,
27
+ "do_sample": false,
28
+ "dropout": 0.1,
29
+ "early_stopping": false,
30
+ "encoder_attention_heads": 16,
31
+ "encoder_ffn_dim": 4096,
32
+ "encoder_layerdrop": 0.0,
33
+ "encoder_layers": 12,
34
+ "encoder_no_repeat_ngram_size": 0,
35
+ "eos_token_id": 2,
36
+ "exponential_decay_length_penalty": null,
37
+ "finetuning_task": null,
38
+ "forced_bos_token_id": null,
39
+ "forced_eos_token_id": 2,
40
+ "id2label": {
41
+ "0": "LABEL_0",
42
+ "1": "LABEL_1"
43
+ },
44
+ "init_std": 0.02,
45
+ "is_decoder": true,
46
+ "is_encoder_decoder": false,
47
+ "label2id": {
48
+ "LABEL_0": 0,
49
+ "LABEL_1": 1
50
+ },
51
+ "length_penalty": 1.0,
52
+ "max_length": 512,
53
+ "max_position_embeddings": 1536,
54
+ "min_length": 0,
55
+ "model_type": "mbart",
56
+ "no_repeat_ngram_size": 0,
57
+ "num_beam_groups": 1,
58
+ "num_beams": 1,
59
+ "num_hidden_layers": 12,
60
+ "num_return_sequences": 1,
61
+ "output_attentions": false,
62
+ "output_hidden_states": false,
63
+ "output_scores": false,
64
+ "pad_token_id": 1,
65
+ "prefix": null,
66
+ "problem_type": null,
67
+ "pruned_heads": {},
68
+ "remove_invalid_values": false,
69
+ "repetition_penalty": 1.0,
70
+ "return_dict": true,
71
+ "return_dict_in_generate": false,
72
+ "scale_embedding": true,
73
+ "sep_token_id": null,
74
+ "suppress_tokens": null,
75
+ "task_specific_params": null,
76
+ "temperature": 1.0,
77
+ "tf_legacy_loss": false,
78
+ "tie_encoder_decoder": false,
79
+ "tie_word_embeddings": true,
80
+ "tokenizer_class": null,
81
+ "top_k": 50,
82
+ "top_p": 1.0,
83
+ "torch_dtype": null,
84
+ "torchscript": false,
85
+ "typical_p": 1.0,
86
+ "use_bfloat16": false,
87
+ "use_cache": true,
88
+ "vocab_size": 57533
89
+ },
90
+ "decoder_start_token_id": 0,
91
+ "encoder": {
92
+ "_name_or_path": "",
93
+ "add_cross_attention": false,
94
+ "architectures": null,
95
+ "attention_probs_dropout_prob": 0.0,
96
+ "bad_words_ids": null,
97
+ "begin_suppress_tokens": null,
98
+ "bos_token_id": null,
99
+ "chunk_size_feed_forward": 0,
100
+ "cross_attention_hidden_size": null,
101
+ "decoder_start_token_id": null,
102
+ "depths": [
103
+ 2,
104
+ 2,
105
+ 14,
106
+ 2
107
+ ],
108
+ "diversity_penalty": 0.0,
109
+ "do_sample": false,
110
+ "drop_path_rate": 0.1,
111
+ "early_stopping": false,
112
+ "embed_dim": 128,
113
+ "encoder_no_repeat_ngram_size": 0,
114
+ "eos_token_id": null,
115
+ "exponential_decay_length_penalty": null,
116
+ "finetuning_task": null,
117
+ "forced_bos_token_id": null,
118
+ "forced_eos_token_id": null,
119
+ "hidden_act": "gelu",
120
+ "hidden_dropout_prob": 0.0,
121
+ "hidden_size": 1024,
122
+ "id2label": {
123
+ "0": "LABEL_0",
124
+ "1": "LABEL_1"
125
+ },
126
+ "image_size": [
127
+ 960,
128
+ 720
129
+ ],
130
+ "initializer_range": 0.02,
131
+ "is_decoder": false,
132
+ "is_encoder_decoder": false,
133
+ "label2id": {
134
+ "LABEL_0": 0,
135
+ "LABEL_1": 1
136
+ },
137
+ "layer_norm_eps": 1e-05,
138
+ "length_penalty": 1.0,
139
+ "max_length": 20,
140
+ "min_length": 0,
141
+ "mlp_ratio": 4.0,
142
+ "model_type": "donut-swin",
143
+ "no_repeat_ngram_size": 0,
144
+ "num_beam_groups": 1,
145
+ "num_beams": 1,
146
+ "num_channels": 3,
147
+ "num_heads": [
148
+ 4,
149
+ 8,
150
+ 16,
151
+ 32
152
+ ],
153
+ "num_layers": 4,
154
+ "num_return_sequences": 1,
155
+ "output_attentions": false,
156
+ "output_hidden_states": false,
157
+ "output_scores": false,
158
+ "pad_token_id": null,
159
+ "patch_size": 4,
160
+ "path_norm": true,
161
+ "prefix": null,
162
+ "problem_type": null,
163
+ "pruned_heads": {},
164
+ "qkv_bias": true,
165
+ "remove_invalid_values": false,
166
+ "repetition_penalty": 1.0,
167
+ "return_dict": true,
168
+ "return_dict_in_generate": false,
169
+ "sep_token_id": null,
170
+ "suppress_tokens": null,
171
+ "task_specific_params": null,
172
+ "temperature": 1.0,
173
+ "tf_legacy_loss": false,
174
+ "tie_encoder_decoder": false,
175
+ "tie_word_embeddings": true,
176
+ "tokenizer_class": null,
177
+ "top_k": 50,
178
+ "top_p": 1.0,
179
+ "torch_dtype": null,
180
+ "torchscript": false,
181
+ "typical_p": 1.0,
182
+ "use_absolute_embeddings": false,
183
+ "use_bfloat16": false,
184
+ "window_size": 10
185
+ },
186
+ "is_encoder_decoder": true,
187
+ "model_type": "vision-encoder-decoder",
188
+ "pad_token_id": 1,
189
+ "tie_word_embeddings": false,
190
+ "torch_dtype": "float32",
191
+ "transformers_version": "4.40.0"
192
+ }
hyperparameters.yaml ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __cached__setup_devices: !!python/object/apply:torch.device
2
+ - cuda
3
+ - 0
4
+ _n_gpu: 1
5
+ accelerator_config: !!python/object:transformers.trainer_pt_utils.AcceleratorConfig
6
+ dispatch_batches: null
7
+ even_batches: true
8
+ gradient_accumulation_kwargs: null
9
+ split_batches: false
10
+ use_seedable_sampler: true
11
+ adafactor: false
12
+ adam_beta1: 0.9
13
+ adam_beta2: 0.999
14
+ adam_epsilon: 1.0e-08
15
+ auto_find_batch_size: false
16
+ bf16: false
17
+ bf16_full_eval: false
18
+ data_seed: null
19
+ dataloader_drop_last: false
20
+ dataloader_num_workers: 0
21
+ dataloader_persistent_workers: false
22
+ dataloader_pin_memory: true
23
+ dataloader_prefetch_factor: null
24
+ ddp_backend: null
25
+ ddp_broadcast_buffers: null
26
+ ddp_bucket_cap_mb: null
27
+ ddp_find_unused_parameters: null
28
+ ddp_timeout: 1800
29
+ debug: []
30
+ deepspeed: null
31
+ deepspeed_plugin: null
32
+ disable_tqdm: false
33
+ dispatch_batches: null
34
+ distributed_state: !!python/object:accelerate.state.PartialState
35
+ _cpu: false
36
+ backend: null
37
+ debug: false
38
+ device: !!python/object/apply:torch.device
39
+ - cuda
40
+ distributed_type: !!python/object/apply:accelerate.utils.dataclasses.DistributedType
41
+ - 'NO'
42
+ fork_launched: false
43
+ local_process_index: 0
44
+ num_processes: 1
45
+ process_index: 0
46
+ do_eval: true
47
+ do_predict: false
48
+ do_train: false
49
+ eval_accumulation_steps: null
50
+ eval_delay: 0
51
+ eval_do_concat_batches: true
52
+ eval_steps: null
53
+ evaluation_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
54
+ - epoch
55
+ fp16: true
56
+ fp16_backend: auto
57
+ fp16_full_eval: false
58
+ fp16_opt_level: O1
59
+ fsdp: []
60
+ fsdp_config:
61
+ min_num_params: 0
62
+ xla: false
63
+ xla_fsdp_grad_ckpt: false
64
+ xla_fsdp_v2: false
65
+ fsdp_min_num_params: 0
66
+ fsdp_transformer_layer_cls_to_wrap: null
67
+ full_determinism: false
68
+ generation_config: null
69
+ generation_max_length: null
70
+ generation_num_beams: null
71
+ gradient_accumulation_steps: 2
72
+ gradient_checkpointing: false
73
+ gradient_checkpointing_kwargs: null
74
+ greater_is_better: false
75
+ group_by_length: false
76
+ half_precision_backend: auto
77
+ hub_always_push: false
78
+ hub_model_id: donut_experiment_bayesian_trial_7
79
+ hub_private_repo: false
80
+ hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
81
+ - every_save
82
+ hub_token: null
83
+ ignore_data_skip: false
84
+ include_inputs_for_metrics: false
85
+ include_num_input_tokens_seen: false
86
+ include_tokens_per_second: false
87
+ jit_mode_eval: false
88
+ label_names: null
89
+ label_smoothing_factor: 0.0
90
+ learning_rate: 3.540464175534869e-05
91
+ length_column_name: length
92
+ load_best_model_at_end: true
93
+ local_rank: 0
94
+ log_level: passive
95
+ log_level_replica: warning
96
+ log_on_each_node: true
97
+ logging_dir: model_runs/donut_experiment_bayesian_trial_7/runs/Jun04_16-51-55_ip-172-16-170-70.ec2.internal
98
+ logging_first_step: false
99
+ logging_nan_inf_filter: true
100
+ logging_steps: 100
101
+ logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
102
+ - steps
103
+ lr_scheduler_kwargs: {}
104
+ lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
105
+ - linear
106
+ max_grad_norm: 1.0
107
+ max_steps: -1
108
+ metric_for_best_model: loss
109
+ mp_parameters: ''
110
+ neftune_noise_alpha: null
111
+ no_cuda: false
112
+ num_train_epochs: 5
113
+ optim: !!python/object/apply:transformers.training_args.OptimizerNames
114
+ - adamw_torch
115
+ optim_args: null
116
+ optim_target_modules: null
117
+ output_dir: model_runs/donut_experiment_bayesian_trial_7
118
+ overwrite_output_dir: false
119
+ past_index: -1
120
+ per_device_eval_batch_size: 1
121
+ per_device_train_batch_size: 1
122
+ per_gpu_eval_batch_size: null
123
+ per_gpu_train_batch_size: null
124
+ predict_with_generate: true
125
+ prediction_loss_only: false
126
+ push_to_hub: true
127
+ push_to_hub_model_id: null
128
+ push_to_hub_organization: null
129
+ push_to_hub_token: null
130
+ ray_scope: last
131
+ remove_unused_columns: true
132
+ report_to:
133
+ - tensorboard
134
+ resume_from_checkpoint: null
135
+ run_name: model_runs/donut_experiment_bayesian_trial_7
136
+ save_on_each_node: false
137
+ save_only_model: false
138
+ save_safetensors: true
139
+ save_steps: 500
140
+ save_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
141
+ - epoch
142
+ save_total_limit: 2
143
+ seed: 42
144
+ skip_memory_metrics: true
145
+ sortish_sampler: false
146
+ split_batches: null
147
+ tf32: null
148
+ torch_compile: false
149
+ torch_compile_backend: null
150
+ torch_compile_mode: null
151
+ torchdynamo: null
152
+ tpu_metrics_debug: false
153
+ tpu_num_cores: null
154
+ use_cpu: false
155
+ use_ipex: false
156
+ use_legacy_prediction_loop: false
157
+ use_mps_device: false
158
+ warmup_ratio: 0.0
159
+ warmup_steps: 0
160
+ weight_decay: 0.0017732557924795098
hyperparameters_tuned.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"learning_rate": 3.540464175534869e-05, "weight_decay": 0.0017732557924795098, "num_train_epochs": 5}
metrics.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"eval_loss": 0.4203544557094574, "eval_bleu": 0.05796868826865473, "eval_precisions": [0.7710084033613446, 0.6778042959427207, 0.6132596685082873, 0.5639344262295082], "eval_brevity_penalty": 0.08890667390552527, "eval_length_ratio": 0.29238329238329236, "eval_translation_length": 476, "eval_reference_length": 1628, "eval_cer": 0.7617240589841865, "eval_wer": 0.8431430288885247, "eval_runtime": 196.2339, "eval_samples_per_second": 0.29, "eval_steps_per_second": 0.29, "epoch": 1.0}
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1699dc9d5b240cf638a41697226d2c666cd037ac90f968057c5b6a098e3dbf00
3
+ size 809103512
runs/Jun04_16-51-55_ip-172-16-170-70.ec2.internal/events.out.tfevents.1717519915.ip-172-16-170-70.ec2.internal.9169.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c5121077f6ded5f6399060bfad7a475fde7be9617a846030f9a4fdebbfe0378
3
+ size 10298
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18e5ad7f76456969566f5517a4c3494dafa29a416761310c906ca07290a62257
3
+ size 5240