|
import html |
|
import logging |
|
import re |
|
from typing import List |
|
|
|
import pyarabic.araby as araby |
|
|
|
ACCEPTED_MODELS = [ |
|
"bert-base-arabertv01", |
|
"bert-base-arabert", |
|
"bert-base-arabertv02", |
|
"bert-base-arabertv2", |
|
"bert-large-arabertv02", |
|
"bert-large-arabertv2", |
|
"araelectra-base", |
|
"araelectra-base-discriminator", |
|
"araelectra-base-generator", |
|
"araelectra-base-artydiqa", |
|
"aragpt2-base", |
|
"aragpt2-medium", |
|
"aragpt2-large", |
|
"aragpt2-mega", |
|
] |
|
|
|
SEGMENTED_MODELS = [ |
|
"bert-base-arabert", |
|
"bert-base-arabertv2", |
|
"bert-large-arabertv2", |
|
] |
|
|
|
SECOND_GEN_MODELS = [ |
|
"bert-base-arabertv02", |
|
"bert-base-arabertv2", |
|
"bert-large-arabertv02", |
|
"bert-large-arabertv2", |
|
"araelectra-base", |
|
"araelectra-base-discriminator", |
|
"araelectra-base-generator", |
|
"araelectra-base-artydiqa", |
|
"aragpt2-base", |
|
"aragpt2-medium", |
|
"aragpt2-large", |
|
"aragpt2-mega", |
|
] |
|
|
|
|
|
class ArabertPreprocessor: |
|
""" |
|
A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo. |
|
It also can unprocess the text ouput of the generated text |
|
|
|
Args: |
|
|
|
model_name (:obj:`str`): model name from the HuggingFace Models page without |
|
the aubmindlab tag. Will default to a base Arabic preprocessor if model name was not found. |
|
Current accepted models are: |
|
|
|
- "bert-base-arabertv01": No farasa segmentation. |
|
- "bert-base-arabert": with farasa segmentation. |
|
- "bert-base-arabertv02": No farasas egmentation. |
|
- "bert-base-arabertv2": with farasa segmentation. |
|
- "bert-large-arabertv02": No farasas egmentation. |
|
- "bert-large-arabertv2": with farasa segmentation. |
|
- "araelectra-base": No farasa segmentation. |
|
- "araelectra-base-discriminator": No farasa segmentation. |
|
- "araelectra-base-generator": No farasa segmentation. |
|
- "aragpt2-base": No farasa segmentation. |
|
- "aragpt2-medium": No farasa segmentation. |
|
- "aragpt2-large": No farasa segmentation. |
|
- "aragpt2-mega": No farasa segmentation. |
|
|
|
|
|
keep_emojis(:obj:`bool`, `optional`, defaults to :obj:`False`): don't remove emojis while preprocessing. |
|
|
|
remove_html_markup(:obj: `bool`, `optional`, defaults to :obj:`True`): Whether to remove html artfacts, |
|
should be set to False when preprocessing TyDi QA. |
|
|
|
replace_urls_emails_mentions(:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to replace email urls |
|
and mentions by special tokens. |
|
|
|
strip_tashkeel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, |
|
KASRA, SUKUN, SHADDA). |
|
|
|
strip_tatweel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'. |
|
|
|
insert_white_spaces(:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and after all non Arabic digits |
|
or English digits or Arabic and English Alphabet or the 2 brackets, then inserts whitespace |
|
between words and numbers or numbers and words. |
|
|
|
remove_non_digit_repetition(:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetition of more than 2 non-digit character with |
|
2 of this character. |
|
|
|
replace_slash_with_dash(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in AraBERTv02, |
|
AraELECTRA and AraGPT2. |
|
Set to False to force disable, and True to force enable. Replaces the "/" with "-", |
|
since "/" is missing from AraBERTv2, AraELECTRA and ARAGPT2 vocabulary. |
|
|
|
map_hindi_numbers_to_arabic(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in |
|
AraBERTv02, AraELECTRA and AraGPT2.Set to False to force disable, and True to force enable. |
|
Replaces hindi numbers with the corresponding Arabic one. ex: "١٩٩٥" --> "1995". |
|
This is behavior is present by default in AraBERTv1 and v2 (with pre-segmentation), |
|
and fixes the issue of caused by a bug when inserting white spaces. |
|
|
|
apply_farasa_segmentation(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in |
|
AraBERTv2, and AraBERTv1. Set to False to force disable, and True to force enable. |
|
|
|
|
|
|
|
Returns: |
|
|
|
ArabertPreprocessor: A preprocessor instance |
|
|
|
Example: |
|
|
|
from preprocess import ArabertPreprocessor |
|
|
|
arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2") |
|
|
|
arabert_prep.preprocess("SOME ARABIC TEXT") |
|
""" |
|
|
|
def __init__( |
|
self, |
|
model_name: str, |
|
keep_emojis: bool = False, |
|
remove_html_markup: bool = True, |
|
replace_urls_emails_mentions: bool = True, |
|
strip_tashkeel: bool = True, |
|
strip_tatweel: bool = True, |
|
insert_white_spaces: bool = True, |
|
remove_non_digit_repetition: bool = True, |
|
replace_slash_with_dash: bool = None, |
|
map_hindi_numbers_to_arabic: bool = None, |
|
apply_farasa_segmentation: bool = None, |
|
): |
|
""" |
|
A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo. |
|
It also can unprocess the text ouput of the generated text |
|
|
|
Args: |
|
|
|
model_name (:obj:`str`): model name from the HuggingFace Models page without |
|
the aubmindlab tag. Will default to a base Arabic preprocessor if model name was not found. |
|
Current accepted models are: |
|
|
|
- :obj:`"bert-base-arabertv01"`: No farasa segmentation. |
|
- :obj:`"bert-base-arabert"`: with farasa segmentation. |
|
- :obj:`"bert-base-arabertv02"`: No farasas egmentation. |
|
- :obj:`"bert-base-arabertv2"`: with farasa segmentation. |
|
- :obj:`"bert-large-arabertv02"`: No farasas egmentation. |
|
- :obj:`"bert-large-arabertv2"`: with farasa segmentation. |
|
- :obj:`"araelectra-base"`: No farasa segmentation. |
|
- :obj:`"araelectra-base-discriminator"`: No farasa segmentation. |
|
- :obj:`"araelectra-base-generator"`: No farasa segmentation. |
|
- :obj:`"aragpt2-base"`: No farasa segmentation. |
|
- :obj:`"aragpt2-medium"`: No farasa segmentation. |
|
- :obj:`"aragpt2-large"`: No farasa segmentation. |
|
- :obj:`"aragpt2-mega"`: No farasa segmentation. |
|
|
|
keep_emojis(:obj:`bool`, `optional`, defaults to :obj:`False`): don't remove emojis while preprocessing. |
|
|
|
remove_html_markup(:obj: `bool`, `optional`, defaults to :obj:`True`): Whether to remove html artfacts, |
|
should be set to False when preprocessing TyDi QA. |
|
|
|
replace_urls_emails_mentions(:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to replace email urls |
|
and mentions by special tokens. |
|
|
|
strip_tashkeel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, |
|
KASRA, SUKUN, SHADDA). |
|
|
|
strip_tatweel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'. |
|
|
|
insert_white_spaces(:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and after all non Arabic digits |
|
or English digits or Arabic and English Alphabet or the 2 brackets, then inserts whitespace |
|
between words and numbers or numbers and words. |
|
|
|
remove_non_digit_repetition(:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetition of more than 2 non-digit character with |
|
2 of this character. |
|
|
|
replace_slash_with_dash(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in AraBERTv02, |
|
AraELECTRA and AraGPT2. |
|
Set to False to force disable, and True to force enable. Replaces the "/" with "-", |
|
since "/" is missing from AraBERTv2, AraELECTRA and ARAGPT2 vocabulary. |
|
|
|
map_hindi_numbers_to_arabic(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in |
|
AraBERTv02, AraELECTRA and AraGPT2.Set to False to force disable, and True to force enable. |
|
Replaces hindi numbers with the corresponding Arabic one. ex: "١٩٩٥" --> "1995". |
|
This is behavior is present by default in AraBERTv1 and v2 (with pre-segmentation), |
|
and fixes the issue of caused by a bug when inserting white spaces. |
|
|
|
apply_farasa_segmentation(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in |
|
AraBERTv2, and AraBERTv1. Set to False to force disable, and True to force enable. |
|
|
|
Returns: |
|
|
|
ArabertPreprocessor: A preprocessor instance |
|
|
|
Example: |
|
|
|
from preprocess import ArabertPreprocessor |
|
|
|
arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2") |
|
|
|
arabert_prep.preprocess("SOME ARABIC TEXT") |
|
|
|
""" |
|
model_name = model_name.replace("aubmindlab/", "").replace("wissamantoun/", "") |
|
|
|
if model_name not in ACCEPTED_MODELS: |
|
logging.warning( |
|
"""Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor""" |
|
) |
|
self.model_name = "bert-base-arabertv02" |
|
else: |
|
self.model_name = model_name |
|
|
|
if apply_farasa_segmentation is None: |
|
if self.model_name in SEGMENTED_MODELS: |
|
self.apply_farasa_segmentation = True |
|
else: |
|
self.apply_farasa_segmentation = False |
|
else: |
|
if apply_farasa_segmentation == False and self.apply_farasa_segmentation: |
|
logging.warning( |
|
"The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!" |
|
) |
|
|
|
self.apply_farasa_segmentation = apply_farasa_segmentation |
|
|
|
if self.apply_farasa_segmentation: |
|
try: |
|
from farasa.segmenter import FarasaSegmenter |
|
|
|
self.farasa_segmenter = FarasaSegmenter(interactive=True) |
|
except ModuleNotFoundError: |
|
logging.error( |
|
"farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy" |
|
) |
|
|
|
self.keep_emojis = keep_emojis |
|
if self.keep_emojis: |
|
import emoji |
|
|
|
self.emoji = emoji |
|
if self.apply_farasa_segmentation: |
|
logging.warning( |
|
"Keeping tweets with Farasa Segmentation is 10 times slower" |
|
) |
|
|
|
self.remove_html_markup = remove_html_markup |
|
self.replace_urls_emails_mentions = replace_urls_emails_mentions |
|
self.strip_tashkeel = strip_tashkeel |
|
self.strip_tatweel = strip_tatweel |
|
self.insert_white_spaces = insert_white_spaces |
|
self.remove_non_digit_repetition = remove_non_digit_repetition |
|
|
|
if replace_slash_with_dash is None: |
|
if self.model_name in SECOND_GEN_MODELS: |
|
self.replace_slash_with_dash = True |
|
else: |
|
self.replace_slash_with_dash = False |
|
else: |
|
self.replace_slash_with_dash = replace_slash_with_dash |
|
|
|
if map_hindi_numbers_to_arabic is None: |
|
if self.model_name in SECOND_GEN_MODELS: |
|
self.map_hindi_numbers_to_arabic = True |
|
else: |
|
self.map_hindi_numbers_to_arabic = False |
|
else: |
|
self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic |
|
|
|
def preprocess(self, text: str) -> str: |
|
""" |
|
Preprocess takes an input text line an applies the same preprocessing used in AraBERT |
|
pretraining, or according to settings |
|
|
|
Args: |
|
|
|
text (:obj:`str`): inout text string |
|
|
|
Returns: |
|
|
|
string: A preprocessed string depending on which model was selected |
|
""" |
|
if ( |
|
self.model_name == "bert-base-arabert" |
|
or self.model_name == "bert-base-arabertv01" |
|
): |
|
return self._preprocess_v1( |
|
text, |
|
do_farasa_tokenization=self.apply_farasa_segmentation, |
|
) |
|
|
|
if self.model_name in SECOND_GEN_MODELS: |
|
return self._preprocess_v2(text) |
|
|
|
return self._preprocess_v3(text) |
|
|
|
def unpreprocess(self, text: str, desegment: bool = True) -> str: |
|
"""Re-formats the text to a classic format where punctuations, brackets, parenthesis are not seperated by whitespaces. |
|
The objective is to make the generated text of any model appear natural and not preprocessed. |
|
|
|
Args: |
|
text (:obj:`str`): input text to be un-preprocessed |
|
desegment (:obj:`bool`, optional): [whether or not to remove farasa pre-segmentation before].. |
|
|
|
Returns: |
|
str: The unpreprocessed (and possibly Farasa-desegmented) text. |
|
""" |
|
|
|
if self.apply_farasa_segmentation and desegment: |
|
text = self.desegment(text) |
|
|
|
|
|
|
|
text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text) |
|
text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text) |
|
text = re.sub(white_spaced_back_quotation_regex, "\`" + r"\1" + "\`", text) |
|
text = re.sub(white_spaced_back_quotation_regex, "\—" + r"\1" + "\—", text) |
|
|
|
|
|
text = text.replace(".", " . ") |
|
text = " ".join(text.split()) |
|
|
|
|
|
text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text) |
|
text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text) |
|
|
|
text = re.sub(left_and_right_spaced_chars, r"\1", text) |
|
text = re.sub(left_spaced_chars, r"\1", text) |
|
text = re.sub(right_spaced_chars, r"\1", text) |
|
|
|
return text |
|
|
|
def desegment(self, text: str) -> str: |
|
""" |
|
Use this function if sentence tokenization was done using |
|
`from arabert.preprocess_arabert import preprocess` with Farasa enabled |
|
AraBERT segmentation using Farasa adds a space after the '+' for prefixes, |
|
and after before the '+' for suffixes |
|
|
|
Example: |
|
>>> desegment('ال+ دراس +ات') |
|
الدراسات |
|
""" |
|
text = text.replace("+ ", "+") |
|
text = text.replace(" +", "+") |
|
text = " ".join([self._desegmentword(word) for word in text.split(" ")]) |
|
return text |
|
|
|
def _desegmentword(self, orig_word: str) -> str: |
|
""" |
|
Word segmentor that takes a Farasa Segmented Word and removes the '+' signs |
|
|
|
Example: |
|
>>> _desegmentword("ال+يومي+ة") |
|
اليومية |
|
""" |
|
word = orig_word.replace("ل+ال+", "لل") |
|
if "ال+ال" not in orig_word: |
|
word = word.replace("ل+ال", "لل") |
|
word = word.replace("+", "") |
|
word = word.replace("للل", "لل") |
|
return word |
|
|
|
def _preprocess_v3(self, text: str) -> str: |
|
text = str(text) |
|
text = html.unescape(text) |
|
if self.strip_tashkeel: |
|
text = araby.strip_tashkeel(text) |
|
if self.strip_tatweel: |
|
text = araby.strip_tatweel(text) |
|
|
|
if self.replace_urls_emails_mentions: |
|
|
|
for reg in url_regexes: |
|
text = re.sub(reg, " [رابط] ", text) |
|
|
|
for reg in email_regexes: |
|
text = re.sub(reg, " [بريد] ", text) |
|
|
|
text = re.sub(user_mention_regex, " [مستخدم] ", text) |
|
|
|
if self.remove_html_markup: |
|
|
|
text = re.sub("<br />", " ", text) |
|
|
|
text = re.sub("</?[^>]+>", " ", text) |
|
|
|
if self.map_hindi_numbers_to_arabic: |
|
text = text.translate(hindi_to_arabic_map) |
|
|
|
|
|
if self.remove_non_digit_repetition: |
|
text = self._remove_non_digit_repetition(text) |
|
|
|
|
|
if self.insert_white_spaces: |
|
text = re.sub( |
|
"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z ])", |
|
r" \1 ", |
|
text, |
|
) |
|
|
|
|
|
text = text.replace("[ رابط ]", "[رابط]") |
|
text = text.replace("[ بريد ]", "[بريد]") |
|
text = text.replace("[ مستخدم ]", "[مستخدم]") |
|
|
|
|
|
text = re.sub( |
|
"(\d+)([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)", |
|
r" \1 \2 ", |
|
text, |
|
) |
|
text = re.sub( |
|
"([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)(\d+)", |
|
r" \1 \2 ", |
|
text, |
|
) |
|
|
|
|
|
if self.keep_emojis: |
|
emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys())) |
|
rejected_chars_regex2 = "[^%s%s]" % (chars_regexv2, emoji_regex) |
|
text = re.sub(rejected_chars_regex2, " ", text) |
|
else: |
|
text = re.sub(rejected_chars_regexv2, " ", text) |
|
|
|
|
|
text = " ".join(text.replace("\uFE0F", "").split()) |
|
|
|
if self.apply_farasa_segmentation: |
|
if self.keep_emojis: |
|
new_text = [] |
|
for word in text.split(): |
|
if word in list(self.emoji.UNICODE_EMOJI["en"].keys()): |
|
new_text.append(word) |
|
else: |
|
new_text.append(self.farasa_segmenter.segment(word)) |
|
text = " ".join(new_text) |
|
else: |
|
text = self.farasa_segmenter.segment(text) |
|
return self._farasa_segment(text) |
|
|
|
|
|
return text |
|
|
|
def _preprocess_v2(self, text: str) -> str: |
|
text = str(text) |
|
text = html.unescape(text) |
|
if self.strip_tashkeel: |
|
text = araby.strip_tashkeel(text) |
|
if self.strip_tatweel: |
|
text = araby.strip_tatweel(text) |
|
|
|
if self.replace_urls_emails_mentions: |
|
|
|
for reg in url_regexes: |
|
text = re.sub(reg, " [رابط] ", text) |
|
|
|
for reg in email_regexes: |
|
text = re.sub(reg, " [بريد] ", text) |
|
|
|
text = re.sub(user_mention_regex, " [مستخدم] ", text) |
|
|
|
if self.remove_html_markup: |
|
|
|
text = re.sub("<br />", " ", text) |
|
|
|
text = re.sub("</?[^>]+>", " ", text) |
|
|
|
if self.map_hindi_numbers_to_arabic: |
|
text = text.translate(hindi_to_arabic_map) |
|
|
|
|
|
if self.remove_non_digit_repetition: |
|
text = self._remove_non_digit_repetition(text) |
|
|
|
|
|
if self.insert_white_spaces: |
|
text = re.sub( |
|
"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])", |
|
r" \1 ", |
|
text, |
|
) |
|
|
|
|
|
text = re.sub( |
|
"(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text |
|
) |
|
text = re.sub( |
|
"([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text |
|
) |
|
|
|
if self.replace_slash_with_dash: |
|
text = text.replace("/", "-") |
|
|
|
|
|
if self.keep_emojis: |
|
emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys())) |
|
rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex) |
|
text = re.sub(rejected_chars_regex2, " ", text) |
|
else: |
|
text = re.sub(rejected_chars_regex, " ", text) |
|
|
|
|
|
text = " ".join(text.replace("\uFE0F", "").split()) |
|
|
|
if ( |
|
self.model_name == "bert-base-arabertv2" |
|
or self.model_name == "bert-large-arabertv2" |
|
): |
|
if self.keep_emojis: |
|
new_text = [] |
|
for word in text.split(): |
|
if word in list(self.emoji.UNICODE_EMOJI["en"].keys()): |
|
new_text.append(word) |
|
else: |
|
new_text.append(self.farasa_segmenter.segment(word)) |
|
text = " ".join(new_text) |
|
else: |
|
text = self.farasa_segmenter.segment(text) |
|
return self._farasa_segment(text) |
|
|
|
|
|
return text |
|
|
|
def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str: |
|
""" |
|
AraBERTv1 preprocessing Function |
|
""" |
|
text = str(text) |
|
if self.strip_tashkeel: |
|
text = araby.strip_tashkeel(text) |
|
|
|
text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text) |
|
text = re.sub("ـ", "", text) |
|
text = re.sub("[«»]", ' " ', text) |
|
|
|
if self.replace_urls_emails_mentions: |
|
|
|
text = re.sub(regex_url_step1, "[رابط]", text) |
|
text = re.sub(regex_url_step2, "[رابط]", text) |
|
text = re.sub(regex_url, "[رابط]", text) |
|
text = re.sub(regex_email, "[بريد]", text) |
|
text = re.sub(regex_mention, "[مستخدم]", text) |
|
text = re.sub("…", r"\.", text).strip() |
|
text = self._remove_redundant_punct(text) |
|
|
|
if self.replace_urls_emails_mentions: |
|
text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text) |
|
text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text) |
|
text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text) |
|
|
|
if self.remove_non_digit_repetition: |
|
text = self._remove_non_digit_repetition(text) |
|
|
|
if self.insert_white_spaces: |
|
text = re.sub( |
|
"([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])", |
|
r" \1 ", |
|
text, |
|
) |
|
if do_farasa_tokenization: |
|
text = self._tokenize_arabic_words_farasa(text) |
|
|
|
text = " ".join(text.split()) |
|
|
|
return text |
|
|
|
def _farasa_segment(self, text: str) -> str: |
|
line_farasa = text.split() |
|
segmented_line = [] |
|
for index, word in enumerate(line_farasa): |
|
if word in ["[", "]"]: |
|
continue |
|
if word in ["رابط", "بريد", "مستخدم"] and line_farasa[index - 1] in [ |
|
"[", |
|
"]", |
|
]: |
|
segmented_line.append("[" + word + "]") |
|
continue |
|
if "+" not in word: |
|
segmented_line.append(word) |
|
continue |
|
segmented_word = self._split_farasa_output(word) |
|
segmented_line.extend(segmented_word) |
|
|
|
return " ".join(segmented_line) |
|
|
|
def _split_farasa_output(self, word: str) -> str: |
|
segmented_word = [] |
|
temp_token = "" |
|
for i, c in enumerate(word): |
|
if c == "+": |
|
|
|
if temp_token == "ك": |
|
|
|
if i == 1: |
|
segmented_word.append(temp_token + "+") |
|
temp_token = "" |
|
|
|
elif word[i - 2] == "+": |
|
|
|
if segmented_word[-1][-1] == "+": |
|
segmented_word.append(temp_token + "+") |
|
temp_token = "" |
|
|
|
else: |
|
segmented_word.append("+" + temp_token) |
|
temp_token = "" |
|
|
|
elif temp_token in prefix_list: |
|
segmented_word.append(temp_token + "+") |
|
temp_token = "" |
|
elif temp_token in suffix_list: |
|
segmented_word.append("+" + temp_token) |
|
temp_token = "" |
|
else: |
|
segmented_word.append(temp_token) |
|
temp_token = "" |
|
continue |
|
temp_token += c |
|
if temp_token != "": |
|
if temp_token in suffix_list: |
|
segmented_word.append("+" + temp_token) |
|
else: |
|
segmented_word.append(temp_token) |
|
return segmented_word |
|
|
|
def _tokenize_arabic_words_farasa(self, line_input: str) -> str: |
|
|
|
if self.keep_emojis: |
|
|
|
line_farasa = [] |
|
for word in line_input.split(): |
|
if word in list(self.emoji.UNICODE_EMOJI["en"].keys()): |
|
line_farasa.append(word) |
|
else: |
|
line_farasa.append(self.farasa_segmenter.segment(word)) |
|
else: |
|
line_farasa = self.farasa_segmenter.segment(line_input).split() |
|
|
|
segmented_line = [] |
|
for index, word in enumerate(line_farasa): |
|
if word in ["[", "]"]: |
|
continue |
|
if word in ["رابط", "بريد", "مستخدم"] and line_farasa[index - 1] in [ |
|
"[", |
|
"]", |
|
]: |
|
segmented_line.append("[" + word + "]") |
|
continue |
|
segmented_word = [] |
|
for token in word.split("+"): |
|
if token in prefix_list: |
|
segmented_word.append(token + "+") |
|
elif token in suffix_list: |
|
segmented_word.append("+" + token) |
|
else: |
|
segmented_word.append(token) |
|
segmented_line.extend(segmented_word) |
|
return " ".join(segmented_line) |
|
|
|
def _remove_non_digit_repetition(self, text: str) -> str: |
|
""" |
|
:param text: the input text to remove elongation |
|
:return: delongated text |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
text = multiple_char_pattern.sub(r"\1\1", text) |
|
return text |
|
|
|
def _remove_redundant_punct(self, text: str) -> str: |
|
text_ = text |
|
result = re.search(redundant_punct_pattern, text) |
|
dif = 0 |
|
while result: |
|
sub = result.group() |
|
sub = sorted(set(sub), key=sub.index) |
|
sub = " " + "".join(list(sub)) + " " |
|
text = "".join( |
|
(text[: result.span()[0] + dif], sub, text[result.span()[1] + dif :]) |
|
) |
|
text_ = "".join( |
|
(text_[: result.span()[0]], text_[result.span()[1] :]) |
|
).strip() |
|
dif = abs(len(text) - len(text_)) |
|
result = re.search(redundant_punct_pattern, text_) |
|
text = re.sub(r"\s+", " ", text) |
|
return text.strip() |
|
|
|
|
|
prefix_list = [ |
|
"ال", |
|
"و", |
|
"ف", |
|
"ب", |
|
"ك", |
|
"ل", |
|
"لل", |
|
"\u0627\u0644", |
|
"\u0648", |
|
"\u0641", |
|
"\u0628", |
|
"\u0643", |
|
"\u0644", |
|
"\u0644\u0644", |
|
"س", |
|
] |
|
suffix_list = [ |
|
"ه", |
|
"ها", |
|
"ك", |
|
"ي", |
|
"هما", |
|
"كما", |
|
"نا", |
|
"كم", |
|
"هم", |
|
"هن", |
|
"كن", |
|
"ا", |
|
"ان", |
|
"ين", |
|
"ون", |
|
"وا", |
|
"ات", |
|
"ت", |
|
"ن", |
|
"ة", |
|
"\u0647", |
|
"\u0647\u0627", |
|
"\u0643", |
|
"\u064a", |
|
"\u0647\u0645\u0627", |
|
"\u0643\u0645\u0627", |
|
"\u0646\u0627", |
|
"\u0643\u0645", |
|
"\u0647\u0645", |
|
"\u0647\u0646", |
|
"\u0643\u0646", |
|
"\u0627", |
|
"\u0627\u0646", |
|
"\u064a\u0646", |
|
"\u0648\u0646", |
|
"\u0648\u0627", |
|
"\u0627\u062a", |
|
"\u062a", |
|
"\u0646", |
|
"\u0629", |
|
] |
|
other_tokens = ["[رابط]", "[مستخدم]", "[بريد]"] |
|
|
|
|
|
prefix_symbols = [x + "+" for x in prefix_list] |
|
suffix_symblos = ["+" + x for x in suffix_list] |
|
never_split_tokens = list(set(prefix_symbols + suffix_symblos + other_tokens)) |
|
|
|
url_regexes = [ |
|
r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", |
|
r"@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS", |
|
r"http[s]?://[a-zA-Z0-9_\-./~\?=%&]+", |
|
r"www[a-zA-Z0-9_\-?=%&/.~]+", |
|
r"[a-zA-Z]+\.com", |
|
r"(?=http)[^\s]+", |
|
r"(?=www)[^\s]+", |
|
r"://", |
|
] |
|
user_mention_regex = r"@[\w\d]+" |
|
email_regexes = [r"[\w-]+@([\w-]+\.)+[\w-]+", r"\S+@\S+"] |
|
redundant_punct_pattern = ( |
|
r"([!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ【»؛\s+«–…‘]{2,})" |
|
) |
|
|
|
regex_tatweel = r"(\D)\1{2,}" |
|
multiple_char_pattern = re.compile(r"(\D)\1{2,}", re.DOTALL) |
|
|
|
rejected_chars_regex = r"[^0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘]" |
|
rejected_chars_regexv2 = r"[^0-9\u0621-\u063A\u0641-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/]" |
|
|
|
regex_url_step1 = r"(?=http)[^\s]+" |
|
regex_url_step2 = r"(?=www)[^\s]+" |
|
regex_url = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)" |
|
regex_mention = r"@[\w\d]+" |
|
regex_email = r"\S+@\S+" |
|
|
|
chars_regex = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘" |
|
chars_regexv2 = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘/" |
|
|
|
white_spaced_double_quotation_regex = r'\"\s+([^"]+)\s+\"' |
|
white_spaced_single_quotation_regex = r"\'\s+([^']+)\s+\'" |
|
white_spaced_back_quotation_regex = r"\`\s+([^`]+)\s+\`" |
|
white_spaced_em_dash = r"\—\s+([^—]+)\s+\—" |
|
|
|
left_spaced_chars = r" ([\]!#\$%\),\.:;\?}٪’،؟”؛…»·])" |
|
right_spaced_chars = r"([\[\(\{“«‘*\~]) " |
|
left_and_right_spaced_chars = r" ([\+\-\<\=\>\@\\\^\_\|\–]) " |
|
|
|
hindi_nums = "٠١٢٣٤٥٦٧٨٩" |
|
arabic_nums = "0123456789" |
|
hindi_to_arabic_map = str.maketrans(hindi_nums, arabic_nums) |
|
|