malmarjeh committed
Commit cfa1e90
0 Parent(s)

Duplicate from malmarjeh/arabic-text-summarization


Co-authored-by: Mohammad Bani Almarjeh <malmarjeh@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,27 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Arabic Text Summarization
+ emoji: 👀
+ colorFrom: indigo
+ colorTo: blue
+ sdk: streamlit
+ sdk_version: 1.10.0
+ app_file: app.py
+ pinned: false
+ license: mpl-2.0
+ duplicated_from: malmarjeh/arabic-text-summarization
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/preprocess.cpython-310.pyc ADDED
Binary file (11.2 kB).
 
__pycache__/summarize.cpython-310.pyc ADDED
Binary file (3.43 kB).
 
app.py ADDED
@@ -0,0 +1,103 @@
+ from urllib.parse import unquote
+
+ import arabic_reshaper
+ import streamlit as st
+ from bidi.algorithm import get_display
+
+ st.set_page_config(
+     page_title="Arabic Text Summarization",
+     page_icon="📖",
+     initial_sidebar_state="expanded"
+     # layout="wide"
+ )
+
+ from summarize import get_results
+
+ rtl = lambda w: get_display(f"{arabic_reshaper.reshape(w)}")
+
+ st.title("تَلْخِيصُ اَلنُّصُوصِ بِاللُّغَةِ اَلْعَرَبِيَّةِ")
+
+ st.markdown(
+ """
+ <style>
+ @import url(https://fonts.googleapis.com/earlyaccess/scheherazade.css);
+ section.main {
+ background-color: beige;
+ }
+ .stMarkdown h1, .main .element-container.css-o7ulmj.e1tzin5v3 {
+ text-align: right;
+ }
+ .stMarkdown div.css-nlntq9.e16nr0p33 {
+ font-weight: bold;
+ }
+ textarea {
+ direction: rtl;
+ height: 140px;
+ }
+ .stTextArea .css-qrbaxs {
+ float: right;
+ font-size: 23px;
+ }
+ h1 {
+ font-family: 'Scheherazade', serif;
+ }
+
+ .main div.css-nlntq9.e16nr0p33 > p {
+ direction: rtl;
+ }
+ .main .stMarkdown div.css-nlntq9 p {
+ font-size: 22px;
+ }
+ .main .stMarkdown div.css-nlntq9 {
+ direction: rtl;
+ }
+ .main p, .main div, .main input, .main label {
+ text-align: right;
+ direction: rtl;
+ }
+ .main div>h1>div {
+ left: 0;
+ }
+ .main button {
+ font-size: 22px;
+ }
+
+ </style>
+ """,
+ unsafe_allow_html=True,
+ )
+
+ st.sidebar.write("Arabic Text Summarization")
+ st.sidebar.write("Contact: banimarje@gmail.com")
+ st.sidebar.write("\n")
+
+ model_selected = st.sidebar.selectbox(
+     'Select a model',
+     ('T5', 'BERT2BERT', 'GPT-2', 'mBERT2mBERT', 'Transformer'))
+ st.sidebar.write("\n")
+ num_beams = st.sidebar.slider(
+     "Number of beams", min_value=1, max_value=10, value=3, step=1
+ )
+
+ length_pe_slider_disabled = False
+ if model_selected == "GPT-2":
+     length_pe_slider_disabled = True
+
+ st.sidebar.write("\n")
+ length_penalty = st.sidebar.slider(
+     "Length penalty", min_value=0.1, max_value=3.0, value=1.0, step=0.1, disabled=length_pe_slider_disabled
+ )
+
+ txt = """يجري علماء في بريطانيا تجربة لاختبار فعالية عقار إيبوبروفين لمساعدة المصابين بفيروس كورونا. وذكرت هيئة الإذاعة البريطانية "بي بي سي" أن فريق مشترك من أطباء مستشفيات "جاي" و"سانت توماس" و"كينغز كوليدج" في لندن يعتقد أن إيبوبروفين، وهو مضاد للالتهابات ومسكن للألم، يمكن أن يعالج صعوبات التنفس.
+ ويأمل العلماء أن يساعد هذا العلاج المنخفض التكلفة المرضى في الاستغناء عن أجهزة التنفس الصناعي. وذكرت أنه خلال فترة الاختبار، سيحصل نصف المرضى على إيبوبروفين بالإضافة إلى الرعاية المعتادة، حيث سيتم استخدام تركيبة خاصة من إيبوبروفين بدلا من الأقراص العادية التي قد يشتريها الناس عادة."""
+ text = st.text_area("أدخل نص ليتم تلخيصه", value=txt)
+
+ run_query = st.button("لخّص")
+ if run_query:
+     # https://discuss.streamlit.io/t/showing-a-gif-while-st-spinner-runs/5084
+     with st.spinner("جاري التلخيص ..."):
+         result = get_results(text, model_selected, num_beams, length_penalty)
+     if len(result) > 0:
+         st.write(result)
+     else:
+         st.write("")
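For reference, the summarizer can also be driven without the Streamlit UI by reusing `get_results` exactly as app.py does. The sketch below is not part of the Space: the article string is a placeholder, and importing `summarize` loads all five checkpoints at import time.

```python
# Hypothetical standalone driver (assumption), mirroring the call made by app.py.
# Importing summarize downloads/loads BERT2BERT, mBERT2mBERT, T5, Transformer and GPT-2.
from summarize import get_results

article = "..."  # placeholder: any Arabic article text
summary = get_results(article, "BERT2BERT", num_beams=3, length_penalty=1.0)
print(summary)
```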
preprocess.py ADDED
@@ -0,0 +1,383 @@
+ import html
+ import logging
+ import re
+
+ import pyarabic.araby as araby
+
+ ACCEPTED_MODELS = [
+     "bert-base-arabertv01",
+     "bert-base-arabert",
+     "bert-base-arabertv02",
+     "bert-base-arabertv2",
+     "bert-large-arabertv02",
+     "bert-large-arabertv2",
+     "araelectra-base",
+     "araelectra-base-discriminator",
+     "araelectra-base-generator",
+     "aragpt2-base",
+     "aragpt2-medium",
+     "aragpt2-large",
+     "aragpt2-mega",
+ ]
+
+ SEGMENTED_MODELS = [
+     "bert-base-arabert",
+     "bert-base-arabertv2",
+     "bert-large-arabertv2",
+ ]
+
+
+ class ArabertPreprocessor:
+     """
+     A preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
+     It can also unprocess the output of the generated text.
+
+     Args:
+
+     model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Defaults to "bert-base-arabertv02". Currently accepted models are:
+
+         - :obj:`"bert-base-arabertv01"`: no farasa segmentation.
+         - :obj:`"bert-base-arabert"`: with farasa segmentation.
+         - :obj:`"bert-base-arabertv02"`: no farasa segmentation.
+         - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
+         - :obj:`"bert-large-arabertv02"`: no farasa segmentation.
+         - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
+         - :obj:`"araelectra-base"`: no farasa segmentation.
+         - :obj:`"araelectra-base-discriminator"`: no farasa segmentation.
+         - :obj:`"araelectra-base-generator"`: no farasa segmentation.
+         - :obj:`"aragpt2-base"`: no farasa segmentation.
+         - :obj:`"aragpt2-medium"`: no farasa segmentation.
+         - :obj:`"aragpt2-large"`: no farasa segmentation.
+         - :obj:`"aragpt2-mega"`: no farasa segmentation.
+
+     keep_emojis (:obj:`bool`): don't remove emojis while preprocessing. Defaults to False.
+
+     remove_html_markup (:obj:`bool`): whether to remove HTML artifacts; should be set to False when preprocessing TyDi QA. Defaults to True.
+
+     replace_urls_emails_mentions (:obj:`bool`): whether to replace URLs, emails and mentions with special tokens. Defaults to True.
+
+     strip_tashkeel (:obj:`bool`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA).
+
+     strip_tatweel (:obj:`bool`): remove tatweel '\\u0640'.
+
+     insert_white_spaces (:obj:`bool`): insert whitespace before and after any character that is not an Arabic or English digit, an Arabic or English letter, or one of the two brackets, then insert whitespace between words and numbers or numbers and words.
+
+     remove_elongation (:obj:`bool`): replace repetitions of more than 2 non-digit characters with 2 of that character.
+
+
+     Returns:
+
+         ArabertPreprocessor: the preprocessor class
+
+     Example:
+
+         from preprocess import ArabertPreprocessor
+
+         arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")
+
+         arabert_prep.preprocess("SOME ARABIC TEXT")
+     """
+
+     def __init__(
+         self,
+         model_name,
+         keep_emojis=False,
+         remove_html_markup=True,
+         replace_urls_emails_mentions=True,
+         strip_tashkeel=True,
+         strip_tatweel=True,
+         insert_white_spaces=True,
+         remove_elongation=True,
+     ):
+         """
+         model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Defaults to "bert-base-arabertv02". Currently accepted models are:
+
+         - :obj:`"bert-base-arabertv01"`: no farasa segmentation.
+         - :obj:`"bert-base-arabert"`: with farasa segmentation.
+         - :obj:`"bert-base-arabertv02"`: no farasa segmentation.
+         - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
+         - :obj:`"bert-large-arabertv02"`: no farasa segmentation.
+         - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
+         - :obj:`"araelectra-base"`: no farasa segmentation.
+         - :obj:`"araelectra-base-discriminator"`: no farasa segmentation.
+         - :obj:`"araelectra-base-generator"`: no farasa segmentation.
+         - :obj:`"aragpt2-base"`: no farasa segmentation.
+         - :obj:`"aragpt2-medium"`: no farasa segmentation.
+         - :obj:`"aragpt2-large"`: no farasa segmentation.
+         - :obj:`"aragpt2-mega"`: no farasa segmentation.
+
+         keep_emojis (:obj:`bool`): don't remove emojis while preprocessing. Defaults to False.
+
+         remove_html_markup (:obj:`bool`): whether to remove HTML artifacts; should be set to False when preprocessing TyDi QA. Defaults to True.
+
+         replace_urls_emails_mentions (:obj:`bool`): whether to replace URLs, emails and mentions with special tokens. Defaults to True.
+
+         strip_tashkeel (:obj:`bool`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA).
+
+         strip_tatweel (:obj:`bool`): remove tatweel '\\u0640'.
+
+         insert_white_spaces (:obj:`bool`): insert whitespace before and after any character that is not an Arabic or English digit, an Arabic or English letter, or one of the two brackets, then insert whitespace between words and numbers or numbers and words.
+
+         remove_elongation (:obj:`bool`): replace repetitions of more than 2 non-digit characters with 2 of that character.
+
+         """
+         model_name = model_name.replace("aubmindlab/", "")
+
+         if model_name not in ACCEPTED_MODELS:
+             logging.warning(
+                 "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
+             )
+             self.model_name = "bert-base-arabertv02"
+         else:
+             self.model_name = model_name
+
+
+         self.keep_emojis = keep_emojis
+
+         self.remove_html_markup = remove_html_markup
+         self.replace_urls_emails_mentions = replace_urls_emails_mentions
+         self.strip_tashkeel = strip_tashkeel
+         self.strip_tatweel = strip_tatweel
+         self.insert_white_spaces = insert_white_spaces
+         self.remove_elongation = remove_elongation
+
+     def preprocess(self, text):
+         """
+         Preprocess takes an input text line and applies the same preprocessing used in AraBERT
+         pretraining.
+
+         Args:
+
+             text (:obj:`str`): input text string
+
+         Returns:
+
+             string: a preprocessed string depending on which model was selected
+         """
+
+
+         text = str(text)
+         text = html.unescape(text)
+         if self.strip_tashkeel:
+             text = araby.strip_tashkeel(text)
+         if self.strip_tatweel:
+             text = araby.strip_tatweel(text)
+
+         if self.replace_urls_emails_mentions:
+             # replace all possible URLs
+             for reg in url_regexes:
+                 text = re.sub(reg, " [رابط] ", text)
+             # replace emails with [بريد]
+             for reg in email_regexes:
+                 text = re.sub(reg, " [بريد] ", text)
+             # replace mentions with [مستخدم]
+             text = re.sub(user_mention_regex, " [مستخدم] ", text)
+
+         if self.remove_html_markup:
+             # remove html line breaks
+             text = re.sub("<br />", " ", text)
+             # remove html markup
+             text = re.sub("</?[^>]+>", " ", text)
+
+         # remove repeated characters >2
+         if self.remove_elongation:
+             text = self._remove_elongation(text)
+
+         # insert whitespace before and after all non Arabic digits or English digits and alphabet and the 2 brackets
+         if self.insert_white_spaces:
+             text = re.sub(
+                 "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
+                 r" \1 ",
+                 text,
+             )
+
+             # insert whitespace between words and numbers or numbers and words
+             text = re.sub(
+                 "(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
+             )
+             text = re.sub(
+                 "([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
+             )
+
+
+         text = re.sub(rejected_chars_regex, " ", text)
+
+         # remove extra spaces
+         text = " ".join(text.replace("\uFE0F", "").split())
+
+         # all the other models don't require Farasa segmentation
+         return text
+
+     def unpreprocess(self, text, desegment=True):
+         """Re-formats the text to a classic format where punctuation, brackets and parentheses are not separated by whitespace.
+         The objective is to make the generated text of any model appear natural and not preprocessed.
+
+         Args:
+             text (str): input text to be un-preprocessed
+             desegment (bool, optional): whether or not to remove Farasa pre-segmentation first. Defaults to True.
+
+         Returns:
+             str: the unpreprocessed (and possibly Farasa-desegmented) text.
+         """
+
+         # remove the spaces around quotation marks, e.g. i " ate " an apple --> i "ate" an apple
+         # https://stackoverflow.com/a/53436792/5381220
+         text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text)
+         text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text)
+         text = re.sub(white_spaced_back_quotation_regex, "`" + r"\1" + "`", text)
+         text = re.sub(white_spaced_em_dash, "—" + r"\1" + "—", text)
+
+         # during generation, sometimes the models don't put a space after the dot, this handles it
+         text = text.replace(".", " . ")
+         text = " ".join(text.split())
+
+         # handle decimals
+         text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
+         text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)
+
+         text = re.sub(left_and_right_spaced_chars, r"\1", text)
+         text = re.sub(left_spaced_chars, r"\1", text)
+         text = re.sub(right_spaced_chars, r"\1", text)
+
+         return text
+
+
+     def _remove_elongation(self, text):
+         """
+         :param text: the input text to remove elongation from
+         :return: de-elongated text
+         """
+         # loop over the number of times the regex matched the text
+         for index_ in range(len(re.findall(regex_tatweel, text))):
+             elongation = re.search(regex_tatweel, text)
+             if elongation:
+                 elongation_pattern = elongation.group()
+                 elongation_replacement = elongation_pattern[0]
+                 elongation_pattern = re.escape(elongation_pattern)
+                 text = re.sub(
+                     elongation_pattern, elongation_replacement, text, flags=re.MULTILINE
+                 )
+             else:
+                 break
+         return text
+
+     def _remove_redundant_punct(self, text):
+         text_ = text
+         result = re.search(redundant_punct_pattern, text)
+         dif = 0
+         while result:
+             sub = result.group()
+             sub = sorted(set(sub), key=sub.index)
+             sub = " " + "".join(list(sub)) + " "
+             text = "".join(
+                 (text[: result.span()[0] + dif], sub, text[result.span()[1] + dif :])
+             )
+             text_ = "".join(
+                 (text_[: result.span()[0]], text_[result.span()[1] :])
+             ).strip()
+             dif = abs(len(text) - len(text_))
+             result = re.search(redundant_punct_pattern, text_)
+         text = re.sub(r"\s+", " ", text)
+         return text.strip()
+
+
+ prefix_list = [
+     "ال",
+     "و",
+     "ف",
+     "ب",
+     "ك",
+     "ل",
+     "لل",
+     "\u0627\u0644",
+     "\u0648",
+     "\u0641",
+     "\u0628",
+     "\u0643",
+     "\u0644",
+     "\u0644\u0644",
+     "س",
+ ]
+ suffix_list = [
+     "ه",
+     "ها",
+     "ك",
+     "ي",
+     "هما",
+     "كما",
+     "نا",
+     "كم",
+     "هم",
+     "هن",
+     "كن",
+     "ا",
+     "ان",
+     "ين",
+     "ون",
+     "وا",
+     "ات",
+     "ت",
+     "ن",
+     "ة",
+     "\u0647",
+     "\u0647\u0627",
+     "\u0643",
+     "\u064a",
+     "\u0647\u0645\u0627",
+     "\u0643\u0645\u0627",
+     "\u0646\u0627",
+     "\u0643\u0645",
+     "\u0647\u0645",
+     "\u0647\u0646",
+     "\u0643\u0646",
+     "\u0627",
+     "\u0627\u0646",
+     "\u064a\u0646",
+     "\u0648\u0646",
+     "\u0648\u0627",
+     "\u0627\u062a",
+     "\u062a",
+     "\u0646",
+     "\u0629",
+ ]
+ other_tokens = ["[رابط]", "[مستخدم]", "[بريد]"]
+
+ # the never_split list is used with the transformers library
+ prefix_symbols = [x + "+" for x in prefix_list]
+ suffix_symbols = ["+" + x for x in suffix_list]
+ never_split_tokens = list(set(prefix_symbols + suffix_symbols + other_tokens))
+
+ url_regexes = [
+     r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
+     r"@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS",
+     r"http[s]?://[a-zA-Z0-9_\-./~\?=%&]+",
+     r"www[a-zA-Z0-9_\-?=%&/.~]+",
+     r"[a-zA-Z]+\.com",
+     r"(?=http)[^\s]+",
+     r"(?=www)[^\s]+",
+     r"://",
+ ]
+ user_mention_regex = r"@[\w\d]+"
+ email_regexes = [r"[\w-]+@([\w-]+\.)+[\w-]+", r"\S+@\S+"]
+ redundant_punct_pattern = (
+     r"([!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ【»؛\s+«–…‘]{2,})"
+ )
+ regex_tatweel = r"(\D)\1{2,}"
+ rejected_chars_regex = r"[^0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘]"
+
+ regex_url_step1 = r"(?=http)[^\s]+"
+ regex_url_step2 = r"(?=www)[^\s]+"
+ regex_url = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
+ regex_mention = r"@[\w\d]+"
+ regex_email = r"\S+@\S+"
+
+ chars_regex = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘"
+
+ white_spaced_double_quotation_regex = r'\"\s+([^"]+)\s+\"'
+ white_spaced_single_quotation_regex = r"\'\s+([^']+)\s+\'"
+ white_spaced_back_quotation_regex = r"\`\s+([^`]+)\s+\`"
+ white_spaced_em_dash = r"\—\s+([^—]+)\s+\—"
+
+ left_spaced_chars = r" ([\]!#\$%\),\.:;\?}٪’،؟”؛…»·])"
+ right_spaced_chars = r"([\[\(\{“«‘*\~]) "
+ left_and_right_spaced_chars = r" ([\+\-\<\=\>\@\\\^\_\|\–]) "
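A minimal usage sketch of the preprocessor, expanded from the class docstring above; the input string is an illustrative assumption and not part of the repo.

```python
# Usage sketch based on the docstring example; input text is illustrative only.
from preprocess import ArabertPreprocessor

arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

# With the default flags, diacritics and tatweel are stripped, HTML markup is dropped,
# and URLs, emails and mentions are replaced by the special tokens [رابط], [بريد] and [مستخدم].
clean = arabert_prep.preprocess("زُر موقعنا www.example.com وتابعنا على @example")
print(clean)
```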
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ altair
+ pandas
+ streamlit
+ transformers[sentencepiece]
+ tokenizers
+ arabic-reshaper==2.1.3
+ python-bidi==0.4.2
+ PyArabic
+ torch
+ codetiming==1.3.0
summarize.py ADDED
@@ -0,0 +1,150 @@
+ import logging
+ import os
+ import re
+ from functools import lru_cache
+ from urllib.parse import unquote
+
+ import streamlit as st
+ from codetiming import Timer
+ from transformers import pipeline
+ from preprocess import ArabertPreprocessor
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
+ from transformers import GPT2TokenizerFast, BertTokenizer
+ import tokenizers
+
+ logger = logging.getLogger(__name__)
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ logger.info("Loading models...")
+ reader_time = Timer("loading", text="Time: {:.2f}", logger=logging.info)
+ reader_time.start()
+ #####
+
+ @st.cache(ttl=24*3600, hash_funcs={AutoModelForSeq2SeqLM: lambda _: None})
+ def load_seq2seqLM_model(model_path):  # this function is not used
+     return AutoModelForSeq2SeqLM.from_pretrained(model_path)
+ @st.cache(ttl=24*3600, hash_funcs={AutoModelForCausalLM: lambda _: None})
+ def load_causalLM_model(model_path):
+     return AutoModelForCausalLM.from_pretrained(model_path)
+
+ @st.cache(ttl=24*3600, hash_funcs={tokenizers.Tokenizer: lambda _: None})
+ def load_autotokenizer_model(tokenizer_path):
+     return AutoTokenizer.from_pretrained(tokenizer_path)
+ @st.cache(ttl=24*3600, hash_funcs={BertTokenizer: lambda _: None})
+ def load_berttokenizer_model(tokenizer_path):
+     return BertTokenizer.from_pretrained(tokenizer_path)
+ @st.cache(ttl=24*3600, hash_funcs={GPT2TokenizerFast: lambda _: None})
+ def load_gpt2tokenizer_model(tokenizer_path):
+     return GPT2TokenizerFast.from_pretrained(tokenizer_path)
+
+ @st.cache(ttl=24*3600, allow_output_mutation=True, hash_funcs={pipeline: lambda _: None, tokenizers.Tokenizer: lambda _: None})
+ def load_generation_pipeline(model_path):
+     if model_path == "malmarjeh/mbert2mbert-arabic-text-summarization":
+         tokenizer = load_berttokenizer_model(model_path)
+     else:
+         tokenizer = load_autotokenizer_model(model_path)
+     # model = load_seq2seqLM_model(model_path)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
+     return pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+
+ @st.cache(ttl=24*3600, hash_funcs={ArabertPreprocessor: lambda _: None})
+ def load_preprocessor():
+     return ArabertPreprocessor(model_name="")
+
+ tokenizer = load_autotokenizer_model("malmarjeh/bert2bert")
+ generation_pipeline = load_generation_pipeline("malmarjeh/bert2bert")
+ logger.info("BERT2BERT is loaded")
+
+ tokenizer_mbert = load_berttokenizer_model("malmarjeh/mbert2mbert-arabic-text-summarization")
+ generation_pipeline_mbert = load_generation_pipeline("malmarjeh/mbert2mbert-arabic-text-summarization")
+ logger.info("mBERT2mBERT is loaded")
+
+ tokenizer_t5 = load_autotokenizer_model("malmarjeh/t5-arabic-text-summarization")
+ generation_pipeline_t5 = load_generation_pipeline("malmarjeh/t5-arabic-text-summarization")
+ logger.info("T5 is loaded")
+
+ tokenizer_transformer = load_autotokenizer_model("malmarjeh/transformer")
+ generation_pipeline_transformer = load_generation_pipeline("malmarjeh/transformer")
+ logger.info("Transformer is loaded")
+
+ tokenizer_gpt2 = load_gpt2tokenizer_model("aubmindlab/aragpt2-base")
+ model_gpt2 = load_causalLM_model("malmarjeh/gpt2")
+ logger.info("GPT-2 is loaded")
+
+ reader_time.stop()
+
+ preprocessor = load_preprocessor()
+
+ logger.info("Finished loading the models...")
+ logger.info(f"Time spent loading: {reader_time.last}")
+
+ @lru_cache(maxsize=200)
+ def get_results(text, model_selected, num_beams, length_penalty):
+     logger.info("\n=================================================================")
+     logger.info(f"Text: {text}")
+     logger.info(f"model_selected: {model_selected}")
+     logger.info(f"length_penalty: {length_penalty}")
+     reader_time = Timer("summarize", text="Time: {:.2f}", logger=logging.info)
+     reader_time.start()
+     if model_selected == 'GPT-2':
+         number_of_tokens_limit = 80
+     else:
+         number_of_tokens_limit = 150
+     text = preprocessor.preprocess(text)
+     logger.info(f"input length: {len(text.split())}")
+     text = ' '.join(text.split()[:number_of_tokens_limit])
+
+     if model_selected == 'Transformer':
+         result = generation_pipeline_transformer(text,
+             pad_token_id=tokenizer_transformer.eos_token_id,
+             num_beams=num_beams,
+             repetition_penalty=3.0,
+             max_length=200,
+             length_penalty=length_penalty,
+             no_repeat_ngram_size=3)[0]['generated_text']
+         logger.info('Transformer')
+     elif model_selected == 'GPT-2':
+         text_processed = '\n النص: ' + text + ' \n الملخص: \n '
+         tokenizer_gpt2.add_special_tokens({'pad_token': '<pad>'})
+         text_tokens = tokenizer_gpt2.batch_encode_plus([text_processed], return_tensors='pt', padding='max_length', max_length=100)
+         output_ = model_gpt2.generate(input_ids=text_tokens['input_ids'], repetition_penalty=3.0, num_beams=num_beams, max_length=140, pad_token_id=2, eos_token_id=0, bos_token_id=10611)
+         result = tokenizer_gpt2.decode(output_[0][100:], skip_special_tokens=True).strip()
+         logger.info('GPT-2')
+     elif model_selected == 'mBERT2mBERT':
+         result = generation_pipeline_mbert(text,
+             pad_token_id=tokenizer_mbert.eos_token_id,
+             num_beams=num_beams,
+             repetition_penalty=3.0,
+             max_length=200,
+             length_penalty=length_penalty,
+             no_repeat_ngram_size=3)[0]['generated_text']
+         logger.info('mBERT')
+     elif model_selected == 'T5':
+         result = generation_pipeline_t5(text,
+             pad_token_id=tokenizer_t5.eos_token_id,
+             num_beams=num_beams,
+             repetition_penalty=3.0,
+             max_length=200,
+             length_penalty=length_penalty,
+             no_repeat_ngram_size=3)[0]['generated_text']
+         logger.info('t5')
+     elif model_selected == 'BERT2BERT':
+         result = generation_pipeline(text,
+             pad_token_id=tokenizer.eos_token_id,
+             num_beams=num_beams,
+             repetition_penalty=3.0,
+             max_length=200,
+             length_penalty=length_penalty,
+             no_repeat_ngram_size=3)[0]['generated_text']
+         logger.info('bert2bert')
+     else:
+         result = "الرجاء اختيار نموذج"
+
+     reader_time.stop()
+     logger.info(f"Time spent summarizing: {reader_time.last}")
+
+     return result
+
+
+ if __name__ == "__main__":
+     results_dict = ""
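Since the Space wraps plain transformers checkpoints, roughly the same generation call can be reproduced outside Streamlit. The sketch below mirrors the T5 branch of `get_results` above; the input string is a placeholder and the whole snippet is an assumption rather than part of the commit.

```python
# Standalone sketch (assumption): drive one checkpoint loaded above without the caching wrappers.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

model_name = "malmarjeh/t5-arabic-text-summarization"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Generation settings copied from the T5 branch of get_results.
summary = summarizer(
    "نص عربي للتلخيص",  # placeholder Arabic input
    pad_token_id=tokenizer.eos_token_id,
    num_beams=3,
    repetition_penalty=3.0,
    max_length=200,
    length_penalty=1.0,
    no_repeat_ngram_size=3,
)[0]["generated_text"]
print(summary)
```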