File size: 4,247 Bytes
77a12fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import re
from .constants import VALID_ARABIC
from itertools import product, combinations
# Matches any run of one or more whitespace characters.
_whitespace_re = re.compile(r"\s+")


def collapse_whitespace(text):
    """Return ``text`` with every run of whitespace replaced by one space.

    Leading and trailing whitespace is collapsed to a single space as well;
    it is not stripped here.
    """
    return _whitespace_re.sub(" ", text)
def basic_cleaners(text):
    """Collapse whitespace runs in ``text`` and trim both ends.

    Delegates the collapsing to ``collapse_whitespace`` and then strips the
    leading/trailing whitespace that may remain.
    """
    collapsed = collapse_whitespace(text)
    trimmed = collapsed.strip()
    return trimmed
# def valid_arabic_cleaners(text):
# text = filter(lambda char: char in VALID_ARABIC, text)
# text = collapse_whitespace(''.join(list(text)))
# return text.strip()
# Short vowel marks, in order: kasra, fatha, damma.
harakat = list("\u0650\u064E\u064F")

# Sukun mark.
sukun = ["\u0652"]

# Letters, in order: alef, waw, alef maqsurah, ya'a.
mostly_saken = ["\u0627", "\u0648", "\u0649", "\u064A"]

# Letters, in order: alef, alef maqsurah.
always_saken = ["\u0627", "\u0649"]

# Tanween marks, in order: damma tanween, kasra tanween, fatha tanween.
# The order matters: remove_tanween_on_alef reads index 2 (fatha tanween).
tnween_chars = list("\u064c\u064d\u064b")

# Shadda mark.
shadda_chars = ["\u0651"]

# Every diacritic handled by the cleaners: harakat, tanween, sukun, shadda.
all_tashkeel = [*harakat, *tnween_chars, *sukun, *shadda_chars]

# Base letters followed by the space character.
all_chars = list("إةابتثجحخدذرزسشصضطظعغفقكلمنهويىأءئؤ ")

# All of the groups above chained into one list, in this order.
prem_chars = [
    *harakat,
    *sukun,
    *mostly_saken,
    *tnween_chars,
    *shadda_chars,
    *all_chars,
]
def not_valid_tashkeel_comb(comb):
    """Report whether ``comb`` is a disallowed pair of adjacent diacritics.

    A pair is disallowed when both members come from
    ``harakat + sukun + tnween_chars``, or both come from
    ``shadda_chars + sukun`` (a mark paired with itself included). The pair
    is checked in the given order and in reverse.

    Args:
        comb: A 2-tuple of single characters.

    Returns:
        True if the pair is disallowed, False otherwise.
    """
    single_marks = harakat + sukun + tnween_chars
    forbidden_pairs = [
        *product(single_marks, repeat=2),
        *product(shadda_chars + sukun, repeat=2),
    ]
    return comb in forbidden_pairs or comb[::-1] in forbidden_pairs
def remove_tanween_on_alef(text):
    """Move a fatha tanween that follows an alef so it precedes the alef.

    Scanning left to right, for each character:

    * a letter or shadda followed by alef/alef maqsurah and then fatha
      tanween is emitted with the tanween appended directly after it;
    * a harakah in that same position is replaced by the tanween;
    * a fatha tanween whose previous character is alef/alef maqsurah is
      dropped;
    * anything else is copied through unchanged.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string.
    """
    fatha_tanween = tnween_chars[2]
    carriers = all_chars + shadda_chars
    size = len(text)
    pieces = []
    for pos, symbol in enumerate(text):
        # True when the next two characters are alef/alef maqsurah + tanween.
        alef_then_tanween = (
            pos < size - 2
            and text[pos + 1] in always_saken
            and text[pos + 2] == fatha_tanween
        )
        if alef_then_tanween and symbol in carriers:
            pieces.append(symbol + fatha_tanween)
        elif alef_then_tanween and symbol in harakat:
            # The harakah is replaced by the tanween.
            pieces.append(fatha_tanween)
        elif pos > 0 and symbol == fatha_tanween and text[pos - 1] in always_saken:
            # Tanween sitting after the alef: drop it.
            continue
        else:
            pieces.append(symbol)
    return "".join(pieces)
def dont_start_by_harakah(text):
    """Drop any diacritics at the very start of ``text``.

    Returns the suffix of ``text`` beginning at the first character that is
    not in ``all_tashkeel``; returns an empty string when ``text`` is empty
    or consists solely of diacritics.
    """
    for start, symbol in enumerate(text):
        if symbol not in all_tashkeel:
            return text[start:]
    return ""
def valid_arabic_cleaners(text):
    """Keep only permitted characters in ``text`` and repair its tashkeel.

    Up to five cleaning passes are run; iteration stops early as soon as a
    pass returns exactly the string it was given. Each pass:

    1. keeps only characters found in ``VALID_ARABIC``, collapses
       whitespace, removes leading tashkeel (``dont_start_by_harakah``) and
       strips both ends;
    2. walks the text once, dropping diacritics that violate the rules
       commented inline below;
    3. rewrites tanween around alef (``remove_tanween_on_alef``), squeezes
       runs of spaces and strips both ends.

    Args:
        text: The raw input string.

    Returns:
        The string produced by the last pass that ran.
    """
    # Loop-invariant lookup lists, built once instead of per character.
    tanween_or_sukun = tnween_chars + sukun
    not_after_harakah = tnween_chars + sukun + shadda_chars

    prev_text = text
    for _ in range(5):
        text = "".join(char for char in prev_text if char in VALID_ARABIC)
        text = collapse_whitespace(text)
        text = dont_start_by_harakah(text)
        text = text.strip()

        cleaned_text = ""
        i = 0
        cnt = 0  # length of the current run of consecutive tashkeel marks
        len_text = len(text)
        while i < len_text:
            if text[i] in all_tashkeel:
                cnt += 1
            else:
                cnt = 0

            # don't allow three consecutive tashkeel
            if cnt > 2:
                i += 1
                continue

            # drop a tanween/sukun when the character two positions back is
            # also a tanween/sukun
            if i > 1 and text[i] in tanween_or_sukun and text[i - 2] in tanween_or_sukun:
                i += 1
                continue

            # don't allow a harakah directly followed by tanween, sukun or shaddah
            if i < len_text - 1 and text[i] in harakat and text[i + 1] in not_after_harakah:
                i += 1
                continue

            # don't allow tashkeel on a space
            if i > 0 and text[i] in all_tashkeel and text[i - 1] == " ":
                i += 1
                continue

            # only allow permissible combinations of adjacent marks
            # (at i == 0 the neighbour index -1 wraps to the last character,
            # as in the original implementation)
            if not_valid_tashkeel_comb((text[i], text[i - 1])):
                i += 1
                continue

            # don't allow a harakah on alef / alef maqsurah
            if i > 1 and text[i] in harakat and text[i - 1] in always_saken:
                if text[i - 2] in all_tashkeel:
                    # The letter before the alef already carries a mark, so
                    # this harakah is simply dropped. The index MUST advance
                    # here: continuing without it re-tests the same
                    # character forever.
                    i += 1
                    continue
                # No mark before the alef: move the harakah in front of it.
                # NOTE(review): the prefix is copied from the pass input
                # (`text`), not from `cleaned_text`, so marks dropped earlier
                # in this pass come back and are only removed by a later
                # pass. Kept as-is to preserve existing output.
                cleaned_text = text[: i - 1] + text[i] + text[i - 1]
                i += 1

            if i < len_text:
                cleaned_text += text[i]
                i += 1

        # only allow tanween before alef
        cleaned_text = remove_tanween_on_alef(cleaned_text)
        cleaned_text = re.sub(r" +", " ", cleaned_text).strip()
        if prev_text == cleaned_text:
            break
        prev_text = cleaned_text
    return cleaned_text