Spaces:
Running
Running
import html | |
import logging | |
import re | |
from typing import List | |
from farasa.segmenter import FarasaSegmenter | |
import emoji | |
import pyarabic.araby as araby | |
# Models that go through the second-generation preprocessing pipeline.
SECOND_GEN_MODELS = [
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "araelectra-base-artydiqa",
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]

# Every model name the preprocessor recognises: the two first-generation
# AraBERT checkpoints followed by all second-generation models.
ACCEPTED_MODELS = ["bert-base-arabertv01", "bert-base-arabert"] + SECOND_GEN_MODELS

# Models whose input is Farasa pre-segmented by default.
SEGMENTED_MODELS = [
    "bert-base-arabert",
    "bert-base-arabertv2",
    "bert-large-arabertv2",
]
# Module-level Farasa segmenter shared by all preprocessor instances. It is
# created once, in interactive mode, when this module is imported; the
# preprocessing methods call its `segment` method.
farasa_segmenter = FarasaSegmenter(interactive=True)
class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
    It can also un-preprocess the text output of the generative models.

    Args:
        model_name (:obj:`str`): model name from the HuggingFace Models page without
            the aubmindlab tag. Will default to a base Arabic preprocessor if model name was not found.
            Current accepted models are:
                - "bert-base-arabertv01": No farasa segmentation.
                - "bert-base-arabert": with farasa segmentation.
                - "bert-base-arabertv02": No farasa segmentation.
                - "bert-base-arabertv2": with farasa segmentation.
                - "bert-large-arabertv02": No farasa segmentation.
                - "bert-large-arabertv2": with farasa segmentation.
                - "araelectra-base": No farasa segmentation.
                - "araelectra-base-discriminator": No farasa segmentation.
                - "araelectra-base-generator": No farasa segmentation.
                - "araelectra-base-artydiqa": No farasa segmentation.
                - "aragpt2-base": No farasa segmentation.
                - "aragpt2-medium": No farasa segmentation.
                - "aragpt2-large": No farasa segmentation.
                - "aragpt2-mega": No farasa segmentation.
        keep_emojis(:obj:`bool`, `optional`, defaults to :obj:`False`): don't remove emojis while preprocessing.
        remove_html_markup(:obj: `bool`, `optional`, defaults to :obj:`True`): Whether to remove html artifacts,
            should be set to False when preprocessing TyDi QA.
        replace_urls_emails_mentions(:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to replace emails, urls
            and mentions by special tokens.
        strip_tashkeel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA,
            KASRA, SUKUN, SHADDA).
        strip_tatweel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'.
        insert_white_spaces(:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and after all non Arabic digits
            or English digits or Arabic and English Alphabet or the 2 brackets, then inserts whitespace
            between words and numbers or numbers and words.
        remove_non_digit_repetition(:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetition of more than 2 non-digit character with
            2 of this character.
        replace_slash_with_dash(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True for
            every model in ``SECOND_GEN_MODELS`` (AraBERTv02/v2, AraELECTRA and AraGPT2).
            Set to False to force disable, and True to force enable. Replaces the "/" with "-",
            since "/" is missing from AraBERTv2, AraELECTRA and ARAGPT2 vocabulary.
        map_hindi_numbers_to_arabic(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True for
            every model in ``SECOND_GEN_MODELS``. Set to False to force disable, and True to force enable.
            Replaces hindi numbers with the corresponding Arabic one. ex: "١٩٩٥" --> "1995".
            This behavior is present by default in AraBERTv1 and v2 (with pre-segmentation),
            and fixes an issue caused by a bug when inserting white spaces.
        apply_farasa_segmentation(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in
            AraBERTv2, and AraBERTv1. Set to False to force disable, and True to force enable.

    Returns:
        ArabertPreprocessor: A preprocessor instance

    Example:
        from preprocess import ArabertPreprocessor
        arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")
        arabert_prep.preprocess("SOME ARABIC TEXT")
    """

    def __init__(
        self,
        model_name: str,
        keep_emojis: bool = False,
        remove_html_markup: bool = True,
        replace_urls_emails_mentions: bool = True,
        strip_tashkeel: bool = True,
        strip_tatweel: bool = True,
        insert_white_spaces: bool = True,
        remove_non_digit_repetition: bool = True,
        replace_slash_with_dash: bool = None,
        map_hindi_numbers_to_arabic: bool = None,
        apply_farasa_segmentation: bool = None,
    ):
        """Resolve the model name and store the preprocessing switches.

        Options left as ``None`` are derived from the resolved model name;
        see the class docstring for the meaning of each argument.
        """
        # Accept fully-qualified hub names by dropping the known owner prefixes.
        model_name = model_name.replace("aubmindlab/", "").replace("wissamantoun/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        needs_segmentation = self.model_name in SEGMENTED_MODELS
        if apply_farasa_segmentation is None:
            self.apply_farasa_segmentation = needs_segmentation
        else:
            if not apply_farasa_segmentation and needs_segmentation:
                logging.warning(
                    "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
                )
            self.apply_farasa_segmentation = apply_farasa_segmentation

        self.keep_emojis = keep_emojis
        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_non_digit_repetition = remove_non_digit_repetition

        # Both options default to "on" for second-generation models only.
        is_second_gen = self.model_name in SECOND_GEN_MODELS
        if replace_slash_with_dash is None:
            self.replace_slash_with_dash = is_second_gen
        else:
            self.replace_slash_with_dash = replace_slash_with_dash

        if map_hindi_numbers_to_arabic is None:
            self.map_hindi_numbers_to_arabic = is_second_gen
        else:
            self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic

    def preprocess(self, text: str) -> str:
        """
        Preprocess takes an input text line and applies the same preprocessing used in AraBERT
        pretraining, or according to settings

        Args:
            text (:obj:`str`): input text string

        Returns:
            string: A preprocessed string depending on which model was selected
        """
        if self.model_name in ("bert-base-arabert", "bert-base-arabertv01"):
            return self._preprocess_v1(
                text,
                do_farasa_tokenization=self.apply_farasa_segmentation,
            )

        if self.model_name in SECOND_GEN_MODELS:
            return self._preprocess_v2(text)

        # NOTE(review): with the model lists defined in this module every
        # accepted name is routed to v1 or v2 above, so this fallback looks
        # unreachable unless `model_name` is reassigned after construction.
        return self._preprocess_v3(text)

    def unpreprocess(self, text: str, desegment: bool = True) -> str:
        """Re-formats the text to a classic format where punctuations, brackets, parenthesis are not separated by whitespaces.
        The objective is to make the generated text of any model appear natural and not preprocessed.

        Args:
            text (:obj:`str`): input text to be un-preprocessed
            desegment (:obj:`bool`, optional): whether or not to remove farasa pre-segmentation
                first. Only has an effect when Farasa segmentation is enabled on this instance.

        Returns:
            str: The unpreprocessed (and possibly Farasa-desegmented) text.
        """
        if self.apply_farasa_segmentation and desegment:
            text = self.desegment(text)

        # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
        # https://stackoverflow.com/a/53436792/5381220
        text = re.sub(white_spaced_double_quotation_regex, r'"\1"', text)
        text = re.sub(white_spaced_single_quotation_regex, r"'\1'", text)
        # The replacement templates are written without a leading backslash:
        # `re` leaves unknown escapes such as "\`" untouched, which would
        # inject a literal backslash into the output.
        text = re.sub(white_spaced_back_quotation_regex, r"`\1`", text)
        # Em dashes have their own pattern; reusing the back-quote one here
        # would rewrite text between two back-quoted spans.
        text = re.sub(white_spaced_em_dash, r"—\1—", text)

        # during generation, sometimes the models don't put a space after the dot, this handles it
        text = text.replace(".", " . ")
        text = " ".join(text.split())

        # handle decimals
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

        text = re.sub(left_and_right_spaced_chars, r"\1", text)
        text = re.sub(left_spaced_chars, r"\1", text)
        text = re.sub(right_spaced_chars, r"\1", text)

        return text

    def desegment(self, text: str) -> str:
        """
        Use this function if sentence tokenization was done using
        `from arabert.preprocess_arabert import preprocess` with Farasa enabled
        AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
        and before the '+' for suffixes

        Example:
        >>> desegment('ال+ دراس +ات')
        الدراسات
        """
        # Glue the affixes back onto their stem, then clean each word.
        text = text.replace("+ ", "+")
        text = text.replace(" +", "+")
        return " ".join(self._desegmentword(word) for word in text.split(" "))

    def _desegmentword(self, orig_word: str) -> str:
        """
        Word segmentor that takes a Farasa Segmented Word and removes the '+' signs

        Example:
        >>> _desegmentword("ال+يومي+ة")
        اليومية
        """
        word = orig_word.replace("ل+ال+", "لل")
        if "ال+ال" not in orig_word:
            word = word.replace("ل+ال", "لل")
        word = word.replace("+", "")
        word = word.replace("للل", "لل")
        return word

    def _apply_shared_cleaning(self, text: str) -> str:
        """Run the cleaning steps shared by `_preprocess_v2` and `_preprocess_v3`."""
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(hindi_to_arabic_map)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        return text

    def _drop_rejected_chars(
        self, text: str, allowed_chars: str, rejected_regex: str
    ) -> str:
        """Blank out characters outside the allowed set and collapse whitespace.

        `allowed_chars` is the body of a character class and is only used when
        emojis are kept (the emoji characters are appended to it);
        `rejected_regex` is the ready-made negated class used otherwise.
        """
        if self.keep_emojis:
            emoji_chars = "".join(emoji.UNICODE_EMOJI["en"])
            text = re.sub("[^%s%s]" % (allowed_chars, emoji_chars), " ", text)
        else:
            text = re.sub(rejected_regex, " ", text)

        # remove the variation selector and any extra spaces
        return " ".join(text.replace("\uFE0F", "").split())

    def _segment_with_farasa(self, text: str) -> str:
        """Farasa-segment `text`, leaving emojis untouched when they are kept."""
        if self.keep_emojis:
            known_emojis = emoji.UNICODE_EMOJI["en"]
            text = " ".join(
                word if word in known_emojis else farasa_segmenter.segment(word)
                for word in text.split()
            )
        else:
            text = farasa_segmenter.segment(text)
        return self._farasa_segment(text)

    def _preprocess_v3(self, text: str) -> str:
        """Third preprocessing pipeline; segments only when the instance flag is set."""
        text = self._apply_shared_cleaning(text)

        # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z ])",
                r" \1 ",
                text,
            )

            # re-fix brackets
            text = text.replace("[ رابط ]", "[رابط]")
            text = text.replace("[ بريد ]", "[بريد]")
            text = text.replace("[ مستخدم ]", "[مستخدم]")

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                r"(\d+)([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)",
                r" \1 \2 ",
                text,
            )
            text = re.sub(
                r"([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)(\d+)",
                r" \1 \2 ",
                text,
            )

        # remove unwanted characters
        text = self._drop_rejected_chars(text, chars_regexv2, rejected_chars_regexv2)

        if self.apply_farasa_segmentation:
            return self._segment_with_farasa(text)

        # All the other models don't require Farasa Segmentation
        return text

    def _preprocess_v2(self, text: str) -> str:
        """Preprocessing pipeline used for the models in ``SECOND_GEN_MODELS``."""
        text = self._apply_shared_cleaning(text)

        # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                r"(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
            )
            text = re.sub(
                r"([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
            )

        if self.replace_slash_with_dash:
            text = text.replace("/", "-")

        # remove unwanted characters
        text = self._drop_rejected_chars(text, chars_regex, rejected_chars_regex)

        # Honour the instance flag rather than the model name so that an
        # explicit `apply_farasa_segmentation` override takes effect. By
        # default the flag is True exactly for the pre-segmented v2 models.
        if self.apply_farasa_segmentation:
            return self._segment_with_farasa(text)

        # All the other models don't require Farasa Segmentation
        return text

    def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str:
        """
        AraBERTv1 preprocessing Function
        """
        text = str(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(regex_url_step1, "[رابط]", text)
            text = re.sub(regex_url_step2, "[رابط]", text)
            text = re.sub(regex_url, "[رابط]", text)
            text = re.sub(regex_email, "[بريد]", text)
            text = re.sub(regex_mention, "[مستخدم]", text)

        # NOTE(review): r"\." is not a recognised escape in a replacement
        # template, so `re` emits a backslash followed by a dot. Kept as-is to
        # stay faithful to the original v1 preprocessing - confirm whether a
        # plain "." was intended.
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            # restore placeholders whose brackets picked up spaces
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text)

        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        if self.insert_white_spaces:
            text = re.sub(
                r"([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        return " ".join(text.split())

    def _is_bracketed_placeholder(self, words: List[str], index: int) -> bool:
        """Tell whether ``words[index]`` is a placeholder name preceded by a bracket token.

        The first word has no predecessor; without the ``index > 0`` guard the
        lookup would wrap around to the last token of the line.
        """
        return (
            index > 0
            and words[index] in ["رابط", "بريد", "مستخدم"]
            and words[index - 1] in ["[", "]"]
        )

    def _farasa_segment(self, text: str) -> str:
        """Re-attach placeholder brackets and split Farasa output into affix tokens."""
        line_farasa = text.split()
        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if self._is_bracketed_placeholder(line_farasa, index):
                segmented_line.append("[" + word + "]")
                continue
            if "+" not in word:
                segmented_line.append(word)
                continue
            segmented_line.extend(self._split_farasa_output(word))

        return " ".join(segmented_line)

    def _split_farasa_output(self, word: str) -> List[str]:
        """Split one '+'-joined Farasa word into prefix+, stem and +suffix tokens.

        Returns:
            list of str: the pieces in order, prefixes carrying a trailing '+'
            and suffixes a leading '+'.
        """
        segmented_word = []
        temp_token = ""
        for i, c in enumerate(word):
            if c != "+":
                temp_token += c
                continue

            # if the token is KAF, it could be a suffix or prefix
            if temp_token == "ك":
                # if we are at the second token, then KAF is surely a prefix
                if i == 1:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                # If the KAF token is between 2 tokens
                elif word[i - 2] == "+":
                    # if the previous token is prefix, then this KAF must be a prefix
                    # (endswith also copes with an empty previous piece)
                    if segmented_word[-1].endswith("+"):
                        segmented_word.append(temp_token + "+")
                    # else it is a suffix, this KAF could not be a second suffix
                    else:
                        segmented_word.append("+" + temp_token)
                    temp_token = ""
                # if Kaf is at the end, this is handled with the statement after the loop
            elif temp_token in prefix_list:
                segmented_word.append(temp_token + "+")
                temp_token = ""
            elif temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
                temp_token = ""
            else:
                segmented_word.append(temp_token)
                temp_token = ""

        if temp_token != "":
            if temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
            else:
                segmented_word.append(temp_token)
        return segmented_word

    def _tokenize_arabic_words_farasa(self, line_input: str) -> str:
        """Farasa-segment a line for AraBERTv1 and mark prefixes and suffixes with '+'."""
        if self.keep_emojis:
            # segment word by word so that emojis are passed through untouched
            known_emojis = emoji.UNICODE_EMOJI["en"]
            line_farasa = [
                word if word in known_emojis else farasa_segmenter.segment(word)
                for word in line_input.split()
            ]
        else:
            line_farasa = farasa_segmenter.segment(line_input).split()

        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if self._is_bracketed_placeholder(line_farasa, index):
                segmented_line.append("[" + word + "]")
                continue
            for token in word.split("+"):
                # a token found in both lists is treated as a prefix
                if token in prefix_list:
                    segmented_line.append(token + "+")
                elif token in suffix_list:
                    segmented_line.append("+" + token)
                else:
                    segmented_line.append(token)
        return " ".join(segmented_line)

    def _remove_non_digit_repetition(self, text: str) -> str:
        """
        :param text: the input text to remove elongation
        :return: delongated text
        """
        # Every run matched by `multiple_char_pattern` is cut down to two
        # copies of the repeated character.
        return multiple_char_pattern.sub(r"\1\1", text)

    def _remove_redundant_punct(self, text: str) -> str:
        """Collapse runs matched by `redundant_punct_pattern` to their distinct characters.

        Each run is replaced by its unique characters (first-seen order) padded
        with spaces; whitespace is then normalised.
        """
        # `text_` is a shadow copy with the already-handled runs removed; it is
        # what gets searched, while `dif` maps its offsets back onto `text`.
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            start, end = result.span()
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(sub) + " "
            text = "".join((text[: start + dif], sub, text[end + dif :]))
            text_ = "".join((text_[:start], text_[end:])).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()
# Clitics that Farasa may split off the front of a word. The list carries the
# Arabic literals, then \u-escaped spellings, then one extra prefix.
prefix_list = [
    "ال", "و", "ف", "ب", "ك", "ل", "لل",
    "\u0627\u0644", "\u0648", "\u0641", "\u0628", "\u0643", "\u0644", "\u0644\u0644",
    "س",
]

# Clitics that Farasa may split off the end of a word: Arabic literals
# followed by \u-escaped spellings.
suffix_list = [
    "ه", "ها", "ك", "ي", "هما", "كما", "نا", "كم", "هم", "هن",
    "كن", "ا", "ان", "ين", "ون", "وا", "ات", "ت", "ن", "ة",
    "\u0647", "\u0647\u0627", "\u0643", "\u064a", "\u0647\u0645\u0627",
    "\u0643\u0645\u0627", "\u0646\u0627", "\u0643\u0645", "\u0647\u0645", "\u0647\u0646",
    "\u0643\u0646", "\u0627", "\u0627\u0646", "\u064a\u0646", "\u0648\u0646",
    "\u0648\u0627", "\u0627\u062a", "\u062a", "\u0646", "\u0629",
]

# Placeholder tokens substituted for URLs, user mentions and e-mails.
other_tokens = ["[رابط]", "[مستخدم]", "[بريد]"]

# the never_split list is used with the transformers library
prefix_symbols = [prefix + "+" for prefix in prefix_list]
suffix_symblos = ["+" + suffix for suffix in suffix_list]
never_split_tokens = list(set(prefix_symbols + suffix_symblos + other_tokens))
# URL patterns applied one after another by the v2/v3 pipelines; every match
# is replaced by the link placeholder, so the order of the list matters.
# NOTE(review): the second entry carries PCRE-style `@...@iS` delimiters, which
# Python's `re` treats as literal text - confirm whether it can ever match.
url_regexes = [
    r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
    r"@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS",
    r"http[s]?://[a-zA-Z0-9_\-./~\?=%&]+",
    r"www[a-zA-Z0-9_\-?=%&/.~]+",
    r"[a-zA-Z]+\.com",
    r"(?=http)[^\s]+",
    r"(?=www)[^\s]+",
    r"://",
]
# "@name" style user mentions (v2/v3 pipelines).
user_mention_regex = r"@[\w\d]+"
# E-mail patterns (v2/v3 pipelines), applied in order.
email_regexes = [r"[\w-]+@([\w-]+\.)+[\w-]+", r"\S+@\S+"]
# Runs of two or more punctuation/whitespace characters, collapsed by
# `_remove_redundant_punct` in the v1 pipeline.
redundant_punct_pattern = (
    r"([!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ【»؛\s+«–…‘]{2,})"
)
# A non-digit character repeated three or more times. `regex_tatweel` is not
# used by any live code visible in this file; `multiple_char_pattern` is the
# compiled form used by `_remove_non_digit_repetition`.
regex_tatweel = r"(\D)\1{2,}"
multiple_char_pattern = re.compile(r"(\D)\1{2,}", re.DOTALL)
# Negated character classes: anything they match is replaced by a space.
# `rejected_chars_regex` is used by the v2 pipeline, `rejected_chars_regexv2`
# (which additionally keeps "/") by the v3 pipeline.
rejected_chars_regex = r"[^0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘]"
rejected_chars_regexv2 = r"[^0-9\u0621-\u063A\u0641-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/]"
# URL, mention and e-mail patterns used by the v1 pipeline.
regex_url_step1 = r"(?=http)[^\s]+"
regex_url_step2 = r"(?=www)[^\s]+"
regex_url = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
regex_mention = r"@[\w\d]+"
regex_email = r"\S+@\S+"
# Bodies of the "allowed characters" classes (no surrounding brackets). When
# emojis are kept, the emoji characters are appended and the result is wrapped
# in "[^...]". `chars_regex` serves v2, `chars_regexv2` (with "/") serves v3.
chars_regex = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘"
chars_regexv2 = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/"
# Patterns for `unpreprocess`: a quoted/dashed span whose content (group 1) is
# padded with whitespace inside the delimiters.
white_spaced_double_quotation_regex = r'\"\s+([^"]+)\s+\"'
white_spaced_single_quotation_regex = r"\'\s+([^']+)\s+\'"
white_spaced_back_quotation_regex = r"\`\s+([^`]+)\s+\`"
white_spaced_em_dash = r"\—\s+([^—]+)\s+\—"
# Patterns for `unpreprocess`: characters (group 1) that should lose the space
# before them, after them, or on both sides.
left_spaced_chars = r" ([\]!#\$%\),\.:;\?}٪’،؟”؛…»·])"
right_spaced_chars = r"([\[\(\{“«‘*\~]) "
left_and_right_spaced_chars = r" ([\+\-\<\=\>\@\\\^\_\|\–]) "
# Translation table mapping each digit in `hindi_nums` to the digit at the
# same position in `arabic_nums`, for use with `str.translate`.
hindi_nums = "٠١٢٣٤٥٦٧٨٩"
arabic_nums = "0123456789"
hindi_to_arabic_map = str.maketrans(hindi_nums, arabic_nums)