import re

from itertools import product

from .constants import VALID_ARABIC

_whitespace_re = re.compile(r"\s+")


def collapse_whitespace(text):
    # Replace any run of whitespace with a single space.
    return _whitespace_re.sub(" ", text)


def basic_cleaners(text):
    # Collapse whitespace and trim leading/trailing spaces.
    text = collapse_whitespace(text)
    return text.strip()
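# Illustrative example (not from the original source): basic_cleaners only
# normalizes whitespace, e.g.
#   basic_cleaners("  hello   world ")  ->  "hello world"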


# Arabic diacritics (tashkeel) and letter groups used by the cleaners below.
harakat = ["\u0650", "\u064E", "\u064F"]  # kasra, fatha, damma
sukun = ["\u0652"]  # sukun

# Long-vowel letters that usually carry no harakah of their own.
mostly_saken = [
    "\u0627",  # alef
    "\u0648",  # waw
    "\u0649",  # alef maqsura
    "\u064A",  # yeh
]

# Letters that never take a harakah themselves.
always_saken = [
    "\u0627",  # alef
    "\u0649",  # alef maqsura
]

tnween_chars = [
    "\u064c",  # dammatan
    "\u064d",  # kasratan
    "\u064b",  # fathatan
]
shadda_chars = ["\u0651"]  # shadda
all_tashkeel = harakat + tnween_chars + sukun + shadda_chars

all_chars = list("إةابتثجحخدذرزسشصضطظعغفقكلمنهويىأءئؤ ")
prem_chars = harakat + sukun + mostly_saken + tnween_chars + shadda_chars + all_chars


def not_valid_tashkeel_comb(comb):
    # Pairs of diacritics that may not appear next to each other, in either
    # order: any two of (harakat | sukun | tanween), or shadda/sukun with
    # themselves or each other.
    all_comb = list(product(harakat + sukun + tnween_chars, repeat=2)) + list(
        product(shadda_chars + sukun, repeat=2)
    )
    return comb in all_comb or comb[::-1] in all_comb
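# Illustrative examples (not in the original source): two harakat in a row are
# rejected, while shadda followed by a harakah is allowed.
#   not_valid_tashkeel_comb(("\u064E", "\u0650"))  # fatha + kasra   -> True
#   not_valid_tashkeel_comb(("\u0651", "\u064E"))  # shadda + fatha  -> False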


def remove_tanween_on_alef(text):
    # Tanween fath (fathatan) belongs on the letter *before* a trailing alef /
    # alef maqsura; this pass moves it there and drops it from the alef.
    text_copy = ""
    for i in range(0, len(text)):
        # Letter (or shadda) followed by alef + fathatan: attach the fathatan
        # directly to the letter.
        if i < len(text) - 2 and text[i] in all_chars + shadda_chars and text[i + 1] in always_saken and text[i + 2] == tnween_chars[2]:
            text_copy += text[i] + tnween_chars[2]
        # Harakah before alef + fathatan: the fathatan replaces the harakah.
        elif i < len(text) - 2 and text[i] in harakat and text[i + 1] in always_saken and text[i + 2] == tnween_chars[2]:
            text_copy += tnween_chars[2]
        # Drop the fathatan that followed the alef (it was moved above).
        elif i > 0 and text[i] == tnween_chars[2] and text[i - 1] in always_saken:
            continue
        else:
            text_copy += text[i]
    return text_copy
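# Illustrative example (not in the original source): for beh + alef + fathatan
# the fathatan is moved onto the beh:
#   remove_tanween_on_alef("با\u064b")  ->  "ب\u064bا"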


def dont_start_by_harakah(text):
    # Drop any diacritics at the start of the string; a word cannot begin with
    # tashkeel. Returns "" if the string consists of diacritics only.
    text_copy = ""
    for i, char in enumerate(text):
        if char not in all_tashkeel:
            text_copy = text[i:]
            break
    return text_copy
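# Illustrative example (not in the original source): a leading fatha is removed.
#   dont_start_by_harakah("\u064Eبَ")  ->  "بَ"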


def valid_arabic_cleaners(text):
    # Iteratively clean the text (up to 5 passes) until it stops changing:
    # keep only VALID_ARABIC characters, normalize whitespace, and drop or
    # reorder diacritics that cannot appear in their current position.
    prev_text = text
    for _ in range(5):
        text = prev_text
        cleaned_text = ""
        # Keep only whitelisted characters, then normalize whitespace.
        text = "".join(char for char in text if char in VALID_ARABIC)
        text = collapse_whitespace(text)
        text = dont_start_by_harakah(text)
        text = text.strip()

        i = 0
        cnt = 0  # number of consecutive tashkeel characters seen so far
        len_text = len(text)
        while i < len_text:
            if text[i] in all_tashkeel:
                cnt += 1
            else:
                cnt = 0

            # At most two consecutive diacritics are allowed.
            if cnt > 2:
                i += 1
                continue

            # Drop a tanween/sukun when another tanween/sukun occurred two
            # positions earlier.
            if i > 1 and text[i] in tnween_chars + sukun and text[i - 2] in tnween_chars + sukun:
                i += 1
                continue

            # A harakah cannot be followed by tanween, sukun, or shadda.
            if i < len(text) - 1 and text[i] in harakat and text[i + 1] in tnween_chars + sukun + shadda_chars:
                i += 1
                continue

            # A diacritic cannot follow a space.
            if i > 0 and text[i] in all_tashkeel and text[i - 1] == " ":
                i += 1
                continue

            # Skip the second character of an invalid diacritic pair.
            if not_valid_tashkeel_comb((text[i], text[i - 1])):
                i += 1
                continue

            # A harakah after an always-saken letter (alef / alef maqsura) is
            # either dropped or moved in front of that letter.
            if i > 1 and text[i] in harakat and text[i - 1] in always_saken:
                if text[i - 2] in all_tashkeel:
                    i += 1  # drop the harakah
                    continue
                else:
                    # Swap the harakah and the letter; note this rebuilds
                    # cleaned_text from the raw string up to this point.
                    cleaned_text = text[: i - 1] + text[i] + text[i - 1]
                    i += 1

            if i < len(text):
                cleaned_text += text[i]
                i += 1

        cleaned_text = remove_tanween_on_alef(cleaned_text)
        cleaned_text = re.sub(r" +", " ", cleaned_text).strip()
        if prev_text == cleaned_text:
            break
        prev_text = cleaned_text
    return cleaned_text
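# Illustrative usage (not part of the original module): assuming this file is
# imported as part of its package, so that the relative `constants` import
# resolves, the main cleaner can be applied to a diacritized string, e.g.
#
#   cleaned = valid_arabic_cleaners("اَلسَّلَامُ عَلَيْكُمْ  ")
#
# The call keeps only VALID_ARABIC characters, collapses whitespace, and
# repeats the tashkeel-repair passes above until the text stops changing
# (at most 5 passes).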