abdalrahmanshahrour committed on
Commit
11e8536
1 Parent(s): 230ce26

Upload 9 files

Browse files
README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - ar
4
+ tags:
5
+ - AraBERT
6
+ - BERT
7
+ - BERT2BERT
8
+ - MSA
9
+ - Arabic Text Summarization
10
+ - Arabic News Title Generation
11
+ - Arabic Paraphrasing
12
+ widget:
13
+ - text: "شهدت مدينة طرابلس، مساء أمس الأربعاء، احتجاجات شعبية وأعمال شغب لليوم الثالث على التوالي، وذلك بسبب تردي الوضع المعيشي والاقتصادي. واندلعت مواجهات عنيفة وعمليات كر وفر ما بين الجيش اللبناني والمحتجين استمرت لساعات، إثر محاولة فتح الطرقات المقطوعة، ما أدى إلى إصابة العشرات من الطرفين."
14
+ ---
15
+
16
+ # An Arabic abstractive text summarization model
17
+ A BERT2BERT-based model whose parameters are initialized with AraBERT weights and which has been fine-tuned on a dataset of 84,764 paragraph-summary pairs.
18
+
19
+ More details on the fine-tuning of this model will be released later.
20
+
21
+ The model can be used as follows:
22
+ ```python
23
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
24
+ from arabert.preprocess import ArabertPreprocessor
25
+
26
+ model_name="malmarjeh/bert2bert"
27
+ preprocessor = ArabertPreprocessor(model_name="")
28
+
29
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
30
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
31
+ pipeline = pipeline("text2text-generation",model=model,tokenizer=tokenizer)
32
+
33
+ text = "شهدت مدينة طرابلس، مساء أمس الأربعاء، احتجاجات شعبية وأعمال شغب لليوم الثالث على التوالي، وذلك بسبب تردي الوضع المعيشي والاقتصادي. واندلعت مواجهات عنيفة وعمليات كر وفر ما بين الجيش اللبناني والمحتجين استمرت لساعات، إثر محاولة فتح الطرقات المقطوعة، ما أدى إلى إصابة العشرات من الطرفين."
34
+ text = preprocessor.preprocess(text)
35
+
36
+ result = pipeline(text,
37
+ pad_token_id=tokenizer.eos_token_id,
38
+ num_beams=3,
39
+ repetition_penalty=3.0,
40
+ max_length=200,
41
+ length_penalty=1.0,
42
+ no_repeat_ngram_size = 3)[0]['generated_text']
43
+ result
44
+ >>> 'مواجهات في طرابلس لليوم الثالث على التوالي'
45
+ ```
46
+
47
+ ## Contact:
48
+ <banimarje@gmail.com>
config.json ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./drive/MyDrive/newarabert2arabert/checkpoint-3000",
3
+ "architectures": [
4
+ "EncoderDecoderModel"
5
+ ],
6
+ "decoder": {
7
+ "_name_or_path": "aubmindlab/bert-base-arabertv02",
8
+ "add_cross_attention": true,
9
+ "architectures": [
10
+ "BertForMaskedLM"
11
+ ],
12
+ "attention_probs_dropout_prob": 0.1,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": null,
15
+ "chunk_size_feed_forward": 0,
16
+ "decoder_start_token_id": null,
17
+ "diversity_penalty": 0.0,
18
+ "do_sample": false,
19
+ "early_stopping": false,
20
+ "encoder_no_repeat_ngram_size": 0,
21
+ "eos_token_id": null,
22
+ "finetuning_task": null,
23
+ "forced_bos_token_id": null,
24
+ "forced_eos_token_id": null,
25
+ "gradient_checkpointing": false,
26
+ "hidden_act": "gelu",
27
+ "hidden_dropout_prob": 0.1,
28
+ "hidden_size": 768,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 3072,
35
+ "is_decoder": true,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "layer_norm_eps": 1e-12,
42
+ "length_penalty": 1.0,
43
+ "max_length": 20,
44
+ "max_position_embeddings": 512,
45
+ "min_length": 0,
46
+ "model_type": "bert",
47
+ "no_repeat_ngram_size": 0,
48
+ "num_attention_heads": 12,
49
+ "num_beam_groups": 1,
50
+ "num_beams": 1,
51
+ "num_hidden_layers": 12,
52
+ "num_return_sequences": 1,
53
+ "output_attentions": false,
54
+ "output_hidden_states": false,
55
+ "output_scores": false,
56
+ "pad_token_id": 0,
57
+ "position_embedding_type": "absolute",
58
+ "prefix": null,
59
+ "pruned_heads": {},
60
+ "remove_invalid_values": false,
61
+ "repetition_penalty": 1.0,
62
+ "return_dict": true,
63
+ "return_dict_in_generate": false,
64
+ "sep_token_id": null,
65
+ "task_specific_params": null,
66
+ "temperature": 1.0,
67
+ "tie_encoder_decoder": false,
68
+ "tie_word_embeddings": true,
69
+ "tokenizer_class": null,
70
+ "top_k": 50,
71
+ "top_p": 1.0,
72
+ "torchscript": false,
73
+ "transformers_version": "4.5.1",
74
+ "type_vocab_size": 2,
75
+ "use_bfloat16": false,
76
+ "use_cache": true,
77
+ "vocab_size": 64000
78
+ },
79
+ "decoder_start_token_id": 2,
80
+ "encoder": {
81
+ "_name_or_path": "aubmindlab/bert-base-arabertv02",
82
+ "add_cross_attention": false,
83
+ "architectures": [
84
+ "BertForMaskedLM"
85
+ ],
86
+ "attention_probs_dropout_prob": 0.1,
87
+ "bad_words_ids": null,
88
+ "bos_token_id": null,
89
+ "chunk_size_feed_forward": 0,
90
+ "decoder_start_token_id": null,
91
+ "diversity_penalty": 0.0,
92
+ "do_sample": false,
93
+ "early_stopping": false,
94
+ "encoder_no_repeat_ngram_size": 0,
95
+ "eos_token_id": null,
96
+ "finetuning_task": null,
97
+ "forced_bos_token_id": null,
98
+ "forced_eos_token_id": null,
99
+ "gradient_checkpointing": false,
100
+ "hidden_act": "gelu",
101
+ "hidden_dropout_prob": 0.1,
102
+ "hidden_size": 768,
103
+ "id2label": {
104
+ "0": "LABEL_0",
105
+ "1": "LABEL_1"
106
+ },
107
+ "initializer_range": 0.02,
108
+ "intermediate_size": 3072,
109
+ "is_decoder": false,
110
+ "is_encoder_decoder": false,
111
+ "label2id": {
112
+ "LABEL_0": 0,
113
+ "LABEL_1": 1
114
+ },
115
+ "layer_norm_eps": 1e-12,
116
+ "length_penalty": 1.0,
117
+ "max_length": 20,
118
+ "max_position_embeddings": 512,
119
+ "min_length": 0,
120
+ "model_type": "bert",
121
+ "no_repeat_ngram_size": 0,
122
+ "num_attention_heads": 12,
123
+ "num_beam_groups": 1,
124
+ "num_beams": 1,
125
+ "num_hidden_layers": 12,
126
+ "num_return_sequences": 1,
127
+ "output_attentions": false,
128
+ "output_hidden_states": false,
129
+ "output_scores": false,
130
+ "pad_token_id": 0,
131
+ "position_embedding_type": "absolute",
132
+ "prefix": null,
133
+ "pruned_heads": {},
134
+ "remove_invalid_values": false,
135
+ "repetition_penalty": 1.0,
136
+ "return_dict": true,
137
+ "return_dict_in_generate": false,
138
+ "sep_token_id": null,
139
+ "task_specific_params": null,
140
+ "temperature": 1.0,
141
+ "tie_encoder_decoder": false,
142
+ "tie_word_embeddings": true,
143
+ "tokenizer_class": null,
144
+ "top_k": 50,
145
+ "top_p": 1.0,
146
+ "torchscript": false,
147
+ "transformers_version": "4.5.1",
148
+ "type_vocab_size": 2,
149
+ "use_bfloat16": false,
150
+ "use_cache": true,
151
+ "vocab_size": 64000
152
+ },
153
+ "eos_token_id": 3,
154
+ "is_encoder_decoder": true,
155
+ "max_length": 40,
156
+ "min_length": 5,
157
+ "model_type": "encoder-decoder",
158
+ "pad_token_id": 0,
159
+ "tie_encoder_decoder": true,
160
+ "vocab_size": 64000
161
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9de6d2e42231587a70b8b12937dab39e2feca662bb01a9b26159a7f80053338d
3
+ size 134
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:130af6f5d1268ea793878e199600dcf34989301fac8c60f4adf038b9c39a2b20
3
+ size 128
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "max_len": 512, "do_basic_tokenize": true, "never_split": ["[بريد]", "[مستخدم]", "[رابط]"], "special_tokens_map_file": null, "name_or_path": "aubmindlab/bert-base-arabertv02"}
trainer_state.json ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9546317788562986,
5
+ "global_step": 7000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.42,
12
+ "learning_rate": 3.3333333333333335e-05,
13
+ "loss": 6.0161,
14
+ "step": 1000
15
+ },
16
+ {
17
+ "epoch": 0.42,
18
+ "eval_loss": 3.9784154891967773,
19
+ "eval_rouge-1": 0.1824,
20
+ "eval_rouge-2": 0.046,
21
+ "eval_rouge-l": 0.1728,
22
+ "eval_runtime": 265.003,
23
+ "eval_samples_per_second": 15.751,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.84,
28
+ "learning_rate": 4.8873366381252815e-05,
29
+ "loss": 3.6793,
30
+ "step": 2000
31
+ },
32
+ {
33
+ "epoch": 0.84,
34
+ "eval_loss": 2.9924702644348145,
35
+ "eval_rouge-1": 0.3283,
36
+ "eval_rouge-2": 0.1385,
37
+ "eval_rouge-l": 0.3123,
38
+ "eval_runtime": 236.629,
39
+ "eval_samples_per_second": 17.639,
40
+ "step": 2000
41
+ },
42
+ {
43
+ "epoch": 1.27,
44
+ "learning_rate": 4.662009914375845e-05,
45
+ "loss": 2.7635,
46
+ "step": 3000
47
+ },
48
+ {
49
+ "epoch": 1.27,
50
+ "eval_loss": 2.58980655670166,
51
+ "eval_rouge-1": 0.3955,
52
+ "eval_rouge-2": 0.2061,
53
+ "eval_rouge-l": 0.3776,
54
+ "eval_runtime": 233.9394,
55
+ "eval_samples_per_second": 17.842,
56
+ "step": 3000
57
+ },
58
+ {
59
+ "epoch": 1.69,
60
+ "learning_rate": 4.4366831906264086e-05,
61
+ "loss": 2.2888,
62
+ "step": 4000
63
+ },
64
+ {
65
+ "epoch": 1.69,
66
+ "eval_loss": 2.4224066734313965,
67
+ "eval_rouge-1": 0.42,
68
+ "eval_rouge-2": 0.2296,
69
+ "eval_rouge-l": 0.4011,
70
+ "eval_runtime": 459.7714,
71
+ "eval_samples_per_second": 9.078,
72
+ "step": 4000
73
+ },
74
+ {
75
+ "epoch": 2.11,
76
+ "learning_rate": 4.211356466876972e-05,
77
+ "loss": 2.1166,
78
+ "step": 5000
79
+ },
80
+ {
81
+ "epoch": 2.11,
82
+ "eval_loss": 2.343660593032837,
83
+ "eval_rouge-1": 0.4335,
84
+ "eval_rouge-2": 0.2425,
85
+ "eval_rouge-l": 0.413,
86
+ "eval_runtime": 449.5335,
87
+ "eval_samples_per_second": 9.285,
88
+ "step": 5000
89
+ },
90
+ {
91
+ "epoch": 2.53,
92
+ "learning_rate": 3.986029743127535e-05,
93
+ "loss": 1.8642,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 2.53,
98
+ "eval_loss": 2.2839481830596924,
99
+ "eval_rouge-1": 0.4426,
100
+ "eval_rouge-2": 0.2535,
101
+ "eval_rouge-l": 0.4222,
102
+ "eval_runtime": 453.1272,
103
+ "eval_samples_per_second": 9.212,
104
+ "step": 6000
105
+ },
106
+ {
107
+ "epoch": 2.95,
108
+ "learning_rate": 3.760703019378098e-05,
109
+ "loss": 1.8616,
110
+ "step": 7000
111
+ },
112
+ {
113
+ "epoch": 2.95,
114
+ "eval_loss": 2.220813035964966,
115
+ "eval_rouge-1": 0.4475,
116
+ "eval_rouge-2": 0.262,
117
+ "eval_rouge-l": 0.4285,
118
+ "eval_runtime": 443.175,
119
+ "eval_samples_per_second": 9.418,
120
+ "step": 7000
121
+ }
122
+ ],
123
+ "max_steps": 23690,
124
+ "num_train_epochs": 10,
125
+ "total_flos": 3.3108670416384e+16,
126
+ "trial_name": null,
127
+ "trial_params": null
128
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0347bd88fb8a3c4e63750d7bc8f6e5f9cfbc597cc208370229a20121825f59f7
3
+ size 129
vocab.txt ADDED
The diff for this file is too large to render. See raw diff