import re , string | |
from textacy.preprocessing.resources import (RE_EMAIL , RE_URL , RE_NUMBER , | |
RE_NUMBER , RE_EMOJI , RE_SHORT_URL , RE_PHONE_NUMBER | |
) | |
# Matches any single ASCII Latin letter (a-z, A-Z). Despite the name, digits,
# punctuation and accented/non-ASCII letters are NOT matched by this pattern.
NON_ARABIC_RE = re.compile(rf"[{string.ascii_letters}]")
def clean_text(text: str) -> str:
    """Strip unwanted tokens from *text* and return the cleaned string.

    Every match of each pattern below is replaced with the empty string,
    one pattern at a time, in the listed order. Surrounding whitespace is
    left untouched, so removed tokens may leave extra spaces behind.

    Args:
        text: The raw input string.

    Returns:
        The input with all pattern matches deleted.
    """
    # Order matters: the composite patterns (phone numbers, links) must run
    # before RE_NUMBER, otherwise the bare digit runs inside them are removed
    # first and the composite pattern no longer matches, leaving fragments
    # such as stray dashes or a truncated link. NON_ARABIC_RE stays last
    # because the e-mail/URL patterns need the Latin letters to still be there.
    patterns = (
        RE_EMAIL,
        RE_EMOJI,
        RE_PHONE_NUMBER,
        RE_SHORT_URL,
        RE_URL,
        RE_NUMBER,
        NON_ARABIC_RE,
    )
    for pattern in patterns:
        text = pattern.sub("", text)
    return text