LeoCordoba committed on
Commit
b52cb14
1 Parent(s): 28c77a6

commit files to HF hub

Browse files
Files changed (7) hide show
  1. README.md +103 -0
  2. config.json +160 -0
  3. eval_data.json +1 -0
  4. pytorch_model.bin +3 -0
  5. test_data.json +1 -0
  6. trainer_state.json +215 -0
  7. training_args.bin +3 -0
README.md ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ language: es
4
+ tags:
5
+ - summarization
6
+ - spanish
7
+ - encoder-decoder
8
+ - beto
9
+ license: apache-2.0
10
+ datasets:
11
+ - mlsum
12
+ model-index:
13
+ - name: beto2beto-mlsum
14
+ results:
15
+ - task:
16
+ name: Abstractive Text Summarization
17
+ type: abstractive-text-summarization
18
+ dataset:
19
+ name: "MLSUM: MultiLingual SUMmarization dataset (Spanish)"
20
+ type: mlsum
21
+ metrics:
22
+ - name: Validation ROUGE-1
23
+ type: rouge-1
24
+ value: 26.1256
25
+ - name: Validation ROUGE-2
26
+ type: rouge-2
27
+ value: 9.2552
28
+ - name: Validation ROUGE-L
29
+ type: rouge-l
30
+ value: 21.4899
31
+ - name: Validation ROUGE-Lsum
32
+ type: rouge-lsum
33
+ value: 21.8194
34
+ - name: Test ROUGE-1
35
+ type: rouge-1
36
+ value: 25.8639
37
+ - name: Test ROUGE-2
38
+ type: rouge-2
39
+ value: 8.911
40
+ - name: Test ROUGE-L
41
+ type: rouge-l
42
+ value: 21.2426
43
+ - name: Test ROUGE-Lsum
44
+ type: rouge-lsum
45
+ value: 21.5859
46
+ widget:
47
+ - text: |
48
+ La chocotorta, el tradicional y práctico antojo dulce de los argentinos, fue elegida como el mejor postre del mundo por críticos de restaurants internacionales, a casi 40 años de su creación. El ránking Taste Atlas ubicó primero en su lista al postre insignia local de galletitas, queso crema y dulce de leche, por delante del helado de pistacho italiano y la tarta alemana de manzana. “Este postre argentino sin hornear fue influenciado por la cocina italiana y se inspiró en el famoso tiramisú italiano. Está elaborado con tres ingredientes básicos argentinos: galletas de chocolate, dulce de leche y queso crema”, explica la página web que exhorta a los turistas de todo el mundo a que prueben la chocotorta. En la votación, superó también a los waffles belgas y el zserbó húngaro. A nivel local le sigue el alfajor, con 4,2 puntos contra los 4,7 de la torta. En el texto que acompaña al listón dorado de “postre número uno", los expertos enseñan además cómo se hacen las chocotortas, paso por paso. “Las galletas se ablandan en leche y se cubren con una combinación de queso crema y dulce de leche. Las formas de la chocotorta pueden variar, mientras que las galletas se pueden remojar con leche con chocolate, café o incluso licor de café”, detallan. Por último, adjudican su creación a una “campaña de márketing” diseñada para promover las galletitas icónicas que le dan su nombre. La chocotorta, infaltable en los cumpleaños argentinos, fue creada en 1982 por una creativa de las agencias más importantes del país, Marité Mabragaña.
49
+
50
+ ---
51
+ ## beto2beto-mlsum
52
+ This model was trained using Amazon SageMaker and the new Hugging Face Deep Learning container.
53
+ For more information, see:
54
+ - [🤗 Transformers Documentation: Amazon SageMaker](https://huggingface.co/transformers/sagemaker.html)
55
+ - [Example Notebooks](https://github.com/huggingface/notebooks/tree/master/sagemaker)
56
+ - [Amazon SageMaker documentation for Hugging Face](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html)
57
+ - [Python SDK SageMaker documentation for Hugging Face](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html)
58
+ - [Deep Learning Container](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers)
59
+ ## Hyperparameters
60
+ {
61
+ "dataset_config": "es",
62
+ "dataset_name": "mlsum",
63
+ "do_eval": true,
64
+ "do_predict": true,
65
+ "do_train": true,
66
+ "fp16": true,
67
+ "max_target_length": 64,
68
+ "num_train_epochs": 10,
69
+ "per_device_eval_batch_size": 4,
70
+ "per_device_train_batch_size": 4,
71
+ "predict_with_generate": true,
72
+ "sagemaker_container_log_level": 20,
73
+ "sagemaker_program": "run_summarization.py",
74
+ "seed": 7,
75
+ "summary_column": "summary",
76
+ "text_column": "text"
77
+ }
78
+ ## Usage
79
+ ## Results
80
+ | key | value |
81
+ | --- | ----- |
82
+ | eval_loss | 2.5021677017211914 |
83
+ | eval_rouge1 | 26.1256 |
84
+ | eval_rouge2 | 9.2552 |
85
+ | eval_rougeL | 21.4899 |
86
+ | eval_rougeLsum | 21.8194 |
87
+ | eval_gen_len | 19.2394 |
88
+ | eval_runtime | 1276.236 |
89
+ | eval_samples_per_second | 8.116 |
90
+ | epoch | 10.0 |
91
+ | eval_mem_cpu_alloc_delta | 270336 |
92
+ | eval_mem_gpu_alloc_delta | 0 |
93
+ | eval_mem_cpu_peaked_delta | 4534272 |
94
+ | eval_mem_gpu_peaked_delta | 1424628736 |
95
+ | eval_loss | 2.57672381401062 |
96
+ | eval_rouge1 | 25.8639 |
97
+ | eval_rouge2 | 8.911 |
98
+ | eval_rougeL | 21.2426 |
99
+ | eval_rougeLsum | 21.5859 |
100
+ | eval_gen_len | 19.2463 |
101
+ | eval_runtime | 1713.2216 |
102
+ | eval_samples_per_second | 8.125 |
103
+ | epoch | 10.0 |
config.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "LeoCordoba/int-model",
3
+ "architectures": [
4
+ "EncoderDecoderModel"
5
+ ],
6
+ "decoder": {
7
+ "_name_or_path": "dccuchile/bert-base-spanish-wwm-uncased",
8
+ "add_cross_attention": true,
9
+ "architectures": [
10
+ "BertForMaskedLM"
11
+ ],
12
+ "attention_probs_dropout_prob": 0.1,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": null,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "gradient_checkpointing": false,
26
+ "hidden_act": "gelu",
27
+ "hidden_dropout_prob": 0.1,
28
+ "hidden_size": 768,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 3072,
35
+ "is_decoder": true,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "layer_norm_eps": 1e-12,
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 512,
45
+ "min_length": 0,
46
+ "model_type": "bert",
47
+ "no_repeat_ngram_size": 0,
48
+ "num_attention_heads": 12,
49
+ "num_beam_groups": 1,
50
+ "num_beams": 1,
51
+ "num_hidden_layers": 12,
52
+ "num_return_sequences": 1,
53
+ "output_attentions": false,
54
+ "output_hidden_states": false,
55
+ "output_past": true,
56
+ "output_scores": false,
57
+ "pad_token_id": 1,
58
+ "position_embedding_type": "absolute",
59
+ "prefix": null,
60
+ "pruned_heads": {},
61
+ "remove_invalid_values": false,
62
+ "repetition_penalty": 1.0,
63
+ "return_dict": true,
64
+ "return_dict_in_generate": false,
65
+ "sep_token_id": null,
66
+ "task_specific_params": null,
67
+ "temperature": 1.0,
68
+ "tie_encoder_decoder": false,
69
+ "tie_word_embeddings": true,
70
+ "tokenizer_class": null,
71
+ "top_k": 50,
72
+ "top_p": 1.0,
73
+ "torchscript": false,
74
+ "transformers_version": "4.5.1",
75
+ "type_vocab_size": 2,
76
+ "use_bfloat16": false,
77
+ "use_cache": true,
78
+ "vocab_size": 31002
79
+ },
80
+ "decoder_start_token_id": 4,
81
+ "encoder": {
82
+ "_name_or_path": "dccuchile/bert-base-spanish-wwm-uncased",
83
+ "add_cross_attention": false,
84
+ "architectures": [
85
+ "BertForMaskedLM"
86
+ ],
87
+ "attention_probs_dropout_prob": 0.1,
88
+ "bad_words_ids": null,
89
+ "bos_token_id": null,
90
+ "chunk_size_feed_forward": 0,
91
+ "decoder_start_token_id": null,
92
+ "diversity_penalty": 0.0,
93
+ "do_sample": false,
94
+ "early_stopping": false,
95
+ "encoder_no_repeat_ngram_size": 0,
96
+ "eos_token_id": null,
97
+ "finetuning_task": null,
98
+ "forced_bos_token_id": null,
99
+ "forced_eos_token_id": null,
100
+ "gradient_checkpointing": false,
101
+ "hidden_act": "gelu",
102
+ "hidden_dropout_prob": 0.1,
103
+ "hidden_size": 768,
104
+ "id2label": {
105
+ "0": "LABEL_0",
106
+ "1": "LABEL_1"
107
+ },
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 3072,
110
+ "is_decoder": false,
111
+ "is_encoder_decoder": false,
112
+ "label2id": {
113
+ "LABEL_0": 0,
114
+ "LABEL_1": 1
115
+ },
116
+ "layer_norm_eps": 1e-12,
117
+ "length_penalty": 1.0,
118
+ "max_length": 20,
119
+ "max_position_embeddings": 512,
120
+ "min_length": 0,
121
+ "model_type": "bert",
122
+ "no_repeat_ngram_size": 0,
123
+ "num_attention_heads": 12,
124
+ "num_beam_groups": 1,
125
+ "num_beams": 1,
126
+ "num_hidden_layers": 12,
127
+ "num_return_sequences": 1,
128
+ "output_attentions": false,
129
+ "output_hidden_states": false,
130
+ "output_past": true,
131
+ "output_scores": false,
132
+ "pad_token_id": 1,
133
+ "position_embedding_type": "absolute",
134
+ "prefix": null,
135
+ "pruned_heads": {},
136
+ "remove_invalid_values": false,
137
+ "repetition_penalty": 1.0,
138
+ "return_dict": true,
139
+ "return_dict_in_generate": false,
140
+ "sep_token_id": null,
141
+ "task_specific_params": null,
142
+ "temperature": 1.0,
143
+ "tie_encoder_decoder": false,
144
+ "tie_word_embeddings": true,
145
+ "tokenizer_class": null,
146
+ "top_k": 50,
147
+ "top_p": 1.0,
148
+ "torchscript": false,
149
+ "transformers_version": "4.5.1",
150
+ "type_vocab_size": 2,
151
+ "use_bfloat16": false,
152
+ "use_cache": true,
153
+ "vocab_size": 31002
154
+ },
155
+ "eos_token_id": 5,
156
+ "is_encoder_decoder": true,
157
+ "model_type": "encoder-decoder",
158
+ "pad_token_id": 1,
159
+ "vocab_size": 31002
160
+ }
eval_data.json ADDED
@@ -0,0 +1 @@
 
1
+ {"eval_loss": 2.5021677017211914, "eval_rouge1": 26.1256, "eval_rouge2": 9.2552, "eval_rougeL": 21.4899, "eval_rougeLsum": 21.8194, "eval_gen_len": 19.2394, "eval_runtime": 1276.236, "eval_samples_per_second": 8.116, "epoch": 10.0, "eval_mem_cpu_alloc_delta": 270336, "eval_mem_gpu_alloc_delta": 0, "eval_mem_cpu_peaked_delta": 4534272, "eval_mem_gpu_peaked_delta": 1424628736}
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d66d07aae8617248a17d8a931d272c159dd42362d7f5cbd7909bada68ecad87c
3
+ size 992553531
test_data.json ADDED
@@ -0,0 +1 @@
 
1
+ {"eval_loss": 2.57672381401062, "eval_rouge1": 25.8639, "eval_rouge2": 8.911, "eval_rougeL": 21.2426, "eval_rougeLsum": 21.5859, "eval_gen_len": 19.2463, "eval_runtime": 1713.2216, "eval_samples_per_second": 8.125, "epoch": 10.0}
trainer_state.json ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 6.999519461797213,
5
+ "global_step": 7280,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.48,
12
+ "learning_rate": 4.656593406593407e-05,
13
+ "loss": 1.5943,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.96,
18
+ "learning_rate": 4.3131868131868134e-05,
19
+ "loss": 1.6732,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 1.0,
24
+ "eval_gen_len": 19.2321,
25
+ "eval_loss": 2.4691147804260254,
26
+ "eval_rouge1": 25.8418,
27
+ "eval_rouge2": 8.7011,
28
+ "eval_rougeL": 21.1569,
29
+ "eval_rougeLsum": 21.4706,
30
+ "eval_runtime": 1188.0405,
31
+ "eval_samples_per_second": 8.719,
32
+ "step": 1040
33
+ },
34
+ {
35
+ "epoch": 1.44,
36
+ "learning_rate": 3.96978021978022e-05,
37
+ "loss": 1.9331,
38
+ "step": 1500
39
+ },
40
+ {
41
+ "epoch": 1.92,
42
+ "learning_rate": 3.6263736263736266e-05,
43
+ "loss": 1.9543,
44
+ "step": 2000
45
+ },
46
+ {
47
+ "epoch": 2.0,
48
+ "eval_gen_len": 19.2853,
49
+ "eval_loss": 2.4091384410858154,
50
+ "eval_rouge1": 26.1697,
51
+ "eval_rouge2": 9.222,
52
+ "eval_rougeL": 21.5166,
53
+ "eval_rougeLsum": 21.8758,
54
+ "eval_runtime": 1168.6884,
55
+ "eval_samples_per_second": 8.863,
56
+ "step": 2080
57
+ },
58
+ {
59
+ "epoch": 2.4,
60
+ "learning_rate": 3.282967032967033e-05,
61
+ "loss": 1.8371,
62
+ "step": 2500
63
+ },
64
+ {
65
+ "epoch": 2.88,
66
+ "learning_rate": 2.9395604395604398e-05,
67
+ "loss": 1.8282,
68
+ "step": 3000
69
+ },
70
+ {
71
+ "epoch": 3.0,
72
+ "eval_gen_len": 19.2576,
73
+ "eval_loss": 2.42261004447937,
74
+ "eval_rouge1": 26.1165,
75
+ "eval_rouge2": 9.2438,
76
+ "eval_rougeL": 21.5311,
77
+ "eval_rougeLsum": 21.8577,
78
+ "eval_runtime": 1236.3561,
79
+ "eval_samples_per_second": 8.378,
80
+ "step": 3120
81
+ },
82
+ {
83
+ "epoch": 3.37,
84
+ "learning_rate": 2.5961538461538464e-05,
85
+ "loss": 1.7419,
86
+ "step": 3500
87
+ },
88
+ {
89
+ "epoch": 3.85,
90
+ "learning_rate": 2.252747252747253e-05,
91
+ "loss": 1.7287,
92
+ "step": 4000
93
+ },
94
+ {
95
+ "epoch": 4.0,
96
+ "eval_gen_len": 19.2905,
97
+ "eval_loss": 2.4440300464630127,
98
+ "eval_rouge1": 26.2529,
99
+ "eval_rouge2": 9.3031,
100
+ "eval_rougeL": 21.5955,
101
+ "eval_rougeLsum": 21.9465,
102
+ "eval_runtime": 1217.0172,
103
+ "eval_samples_per_second": 8.511,
104
+ "step": 4160
105
+ },
106
+ {
107
+ "epoch": 4.33,
108
+ "learning_rate": 1.9093406593406592e-05,
109
+ "loss": 1.6642,
110
+ "step": 4500
111
+ },
112
+ {
113
+ "epoch": 4.81,
114
+ "learning_rate": 1.565934065934066e-05,
115
+ "loss": 1.6481,
116
+ "step": 5000
117
+ },
118
+ {
119
+ "epoch": 5.0,
120
+ "eval_gen_len": 19.1729,
121
+ "eval_loss": 2.471247673034668,
122
+ "eval_rouge1": 26.1789,
123
+ "eval_rouge2": 9.2847,
124
+ "eval_rougeL": 21.5357,
125
+ "eval_rougeLsum": 21.8938,
126
+ "eval_runtime": 1247.3952,
127
+ "eval_samples_per_second": 8.304,
128
+ "step": 5200
129
+ },
130
+ {
131
+ "epoch": 5.29,
132
+ "learning_rate": 1.2225274725274726e-05,
133
+ "loss": 1.6071,
134
+ "step": 5500
135
+ },
136
+ {
137
+ "epoch": 5.77,
138
+ "learning_rate": 8.791208791208792e-06,
139
+ "loss": 1.5796,
140
+ "step": 6000
141
+ },
142
+ {
143
+ "epoch": 6.0,
144
+ "eval_gen_len": 19.2078,
145
+ "eval_loss": 2.485071897506714,
146
+ "eval_rouge1": 26.07,
147
+ "eval_rouge2": 9.1429,
148
+ "eval_rougeL": 21.414,
149
+ "eval_rougeLsum": 21.7483,
150
+ "eval_runtime": 1301.5565,
151
+ "eval_samples_per_second": 7.958,
152
+ "step": 6240
153
+ },
154
+ {
155
+ "epoch": 6.25,
156
+ "learning_rate": 5.357142857142857e-06,
157
+ "loss": 1.5558,
158
+ "step": 6500
159
+ },
160
+ {
161
+ "epoch": 6.73,
162
+ "learning_rate": 1.9230769230769234e-06,
163
+ "loss": 1.5319,
164
+ "step": 7000
165
+ },
166
+ {
167
+ "epoch": 7.0,
168
+ "eval_gen_len": 19.2394,
169
+ "eval_loss": 2.5021677017211914,
170
+ "eval_rouge1": 26.1256,
171
+ "eval_rouge2": 9.2552,
172
+ "eval_rougeL": 21.4899,
173
+ "eval_rougeLsum": 21.8194,
174
+ "eval_runtime": 1250.4536,
175
+ "eval_samples_per_second": 8.283,
176
+ "step": 7280
177
+ },
178
+ {
179
+ "epoch": 7.0,
180
+ "step": 7280,
181
+ "total_flos": 1.5986426003154386e+18,
182
+ "train_runtime": 137981.946,
183
+ "train_samples_per_second": 0.053
184
+ },
185
+ {
186
+ "epoch": 7.0,
187
+ "eval_gen_len": 19.2394,
188
+ "eval_loss": 2.5021677017211914,
189
+ "eval_rouge1": 26.1256,
190
+ "eval_rouge2": 9.2552,
191
+ "eval_rougeL": 21.4899,
192
+ "eval_rougeLsum": 21.8194,
193
+ "eval_runtime": 1276.236,
194
+ "eval_samples_per_second": 8.116,
195
+ "step": 7280
196
+ },
197
+ {
198
+ "epoch": 7.0,
199
+ "eval_gen_len": 19.2463,
200
+ "eval_loss": 2.57672381401062,
201
+ "eval_rouge1": 25.8639,
202
+ "eval_rouge2": 8.911,
203
+ "eval_rougeL": 21.2426,
204
+ "eval_rougeLsum": 21.5859,
205
+ "eval_runtime": 1713.2216,
206
+ "eval_samples_per_second": 8.125,
207
+ "step": 7280
208
+ }
209
+ ],
210
+ "max_steps": 7280,
211
+ "num_train_epochs": 7,
212
+ "total_flos": 1.5986426003154386e+18,
213
+ "trial_name": null,
214
+ "trial_params": null
215
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90dff6dcad2ecaa16e932c89b750519a537b76cf7b09f64406a691cc86994327
3
+ size 2139