# Ashaar / poetry_diacritizer / util / text_cleaners.py
# aaaaaabbbbbbbdddddddduuuuulllll's picture
# Duplicate from arbml/Ashaar
# 77a12fd
import re
from .constants import VALID_ARABIC
from itertools import product, combinations
# Matches any run of one or more whitespace characters.
_whitespace_re = re.compile(r"\s+")


def collapse_whitespace(text):
    """Replace every run of whitespace in ``text`` with a single space.

    Leading and trailing whitespace is collapsed too, not removed.
    """
    return _whitespace_re.sub(" ", text)
def basic_cleaners(text):
    """Collapse whitespace runs in ``text`` and trim both ends."""
    return collapse_whitespace(text).strip()
# def valid_arabic_cleaners(text):
# text = filter(lambda char: char in VALID_ARABIC, text)
# text = collapse_whitespace(''.join(list(text)))
# return text.strip()
# Short-vowel marks.
harakat = [
    "\u0650",  # kasra
    "\u064E",  # fatha
    "\u064F",  # damma
]

sukun = ["\u0652"]  # sukun

# Letters listed as "mostly saken" by the original author.
mostly_saken = [
    "\u0627",  # alef
    "\u0648",  # waw
    "\u0649",  # alef maqsurah
    "\u064A",  # ya'a
]

# Letters listed as "always saken" by the original author.
always_saken = [
    "\u0627",  # alef
    "\u0649",  # alef maqsurah
]

# Tanween marks. The order matters: index 2 (fatha tanween) is looked up
# positionally by remove_tanween_on_alef.
tnween_chars = [
    "\u064c",  # damma tanween
    "\u064d",  # kasra tanween
    "\u064b",  # fatha tanween
]

shadda_chars = ["\u0651"]  # shadda

# Every diacritic handled by the cleaners in this module.
all_tashkeel = [*harakat, *tnween_chars, *sukun, *shadda_chars]

# Base letters plus the space character.
all_chars = list("إةابتثجحخدذرزسشصضطظعغفقكلمنهويىأءئؤ ")

# Diacritics and letters concatenated (mostly_saken letters appear twice,
# once here and once via all_chars).
prem_chars = [
    *harakat,
    *sukun,
    *mostly_saken,
    *tnween_chars,
    *shadda_chars,
    *all_chars,
]
def not_valid_tashkeel_comb(comb):
    """Tell whether ``comb`` is a disallowed pair of adjacent tashkeel.

    A pair is disallowed when both members come from
    ``harakat + sukun + tnween_chars``, or both come from
    ``shadda_chars + sukun``. The pair is checked in both orders.

    Args:
        comb: A 2-tuple of characters.

    Returns:
        True if the pair (in either order) is disallowed, otherwise False.
    """
    vowel_like_marks = harakat + sukun + tnween_chars
    forbidden_pairs = list(product(vowel_like_marks, repeat=2))
    forbidden_pairs += product(shadda_chars + sukun, repeat=2)
    return comb in forbidden_pairs or comb[::-1] in forbidden_pairs
def remove_tanween_on_alef(text):
    """Move fatha tanween from after an alef to the position before it.

    For each character followed by an ``always_saken`` letter and then fatha
    tanween (``tnween_chars[2]``):

    * a letter/space from ``all_chars`` or a shadda keeps its place and gets
      the tanween appended right after it;
    * a harakah is replaced by the tanween.

    Any fatha tanween that directly follows an ``always_saken`` letter is
    dropped. Every other character is copied unchanged.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string.
    """
    fathatan = tnween_chars[2]
    carriers = all_chars + shadda_chars
    window_limit = len(text) - 2  # positions below this have two successors
    pieces = []
    for pos, current in enumerate(text):
        alef_then_fathatan = (
            pos < window_limit
            and text[pos + 1] in always_saken
            and text[pos + 2] == fathatan
        )
        if alef_then_fathatan and current in carriers:
            # Letter (or shadda) + alef + tanween: put the tanween on the letter.
            pieces.append(current + fathatan)
        elif alef_then_fathatan and current in harakat:
            # Harakah + alef + tanween: the tanween takes the harakah's place.
            pieces.append(fathatan)
        elif pos > 0 and current == fathatan and text[pos - 1] in always_saken:
            # Tanween sitting right after the alef is discarded.
            continue
        else:
            pieces.append(current)
    return "".join(pieces)
def dont_start_by_harakah(text):
    """Return ``text`` from its first non-tashkeel character onwards.

    An empty string is returned when ``text`` is empty or consists only of
    characters from ``all_tashkeel``.
    """
    for start, symbol in enumerate(text):
        if symbol not in all_tashkeel:
            return text[start:]
    return ""
def valid_arabic_cleaners(text):
    """Reduce ``text`` to valid Arabic characters with well-formed tashkeel.

    The text is cleaned in at most five passes; iteration stops early as soon
    as a pass returns exactly the text it was given. Each pass:

    1. keeps only characters found in ``VALID_ARABIC``, collapses whitespace,
       removes leading tashkeel and strips surrounding whitespace;
    2. walks the result character by character and drops diacritics that
       violate the rules commented in the loop below;
    3. applies ``remove_tanween_on_alef`` and squeezes repeated spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    tanween_or_sukun = tnween_chars + sukun
    non_harakah_marks = tnween_chars + sukun + shadda_chars

    prev_text = text
    # The pass counter is unused; it must not share a name with the scan index.
    for _ in range(5):
        text = "".join(char for char in prev_text if char in VALID_ARABIC)
        text = collapse_whitespace(text)
        text = dont_start_by_harakah(text)
        text = text.strip()

        cleaned_text = ""
        i = 0
        cnt = 0  # length of the current run of consecutive tashkeel
        len_text = len(text)
        while i < len_text:
            char = text[i]
            if char in all_tashkeel:
                cnt += 1
            else:
                cnt = 0

            # Don't allow three consecutive tashkeel.
            if cnt > 2:
                i += 1
                continue

            # Remove a second tanween/sukun two positions after another one.
            if i > 1 and char in tanween_or_sukun and text[i - 2] in tanween_or_sukun:
                i += 1
                continue

            # Don't allow a harakah followed by tanween, sukun or shadda.
            if i < len_text - 1 and char in harakat and text[i + 1] in non_harakah_marks:
                i += 1
                continue

            # Don't allow tashkeel on a space.
            if i > 0 and char in all_tashkeel and text[i - 1] == " ":
                i += 1
                continue

            # Only allow permissible combinations with the previous character.
            # The ``i > 0`` guard stops ``text[i - 1]`` from wrapping around to
            # the last character when the text still starts with a tashkeel
            # (possible because the strip above runs after
            # dont_start_by_harakah).
            if i > 0 and not_valid_tashkeel_comb((char, text[i - 1])):
                i += 1
                continue

            # Don't allow a harakah on alef / alef maqsurah.
            if i > 1 and char in harakat and text[i - 1] in always_saken:
                if text[i - 2] in all_tashkeel:
                    # The letter before the alef already carries a tashkeel:
                    # drop this harakah. Advance explicitly instead of
                    # re-visiting the same index until ``cnt`` exceeds 2.
                    i += 1
                    continue
                # Otherwise move the harakah in front of the alef.
                # NOTE(review): this rebuilds the output from the unfiltered
                # prefix of ``text``, discarding drops made earlier in this
                # pass; later passes clean them again. Kept as in the original
                # to preserve output - confirm whether it is intentional.
                cleaned_text = text[: i - 1] + char + text[i - 1]
                i += 1

            # Copy the character at the (possibly just advanced) index.
            if i < len_text:
                cleaned_text += text[i]
                i += 1

        # Only allow tanween before alef.
        cleaned_text = remove_tanween_on_alef(cleaned_text)
        cleaned_text = re.sub(r" +", " ", cleaned_text).strip()
        if prev_text == cleaned_text:
            break
        prev_text = cleaned_text
    return cleaned_text