# Source-page header (not code): MahmoudNasser - "Update Cleaning.py" - 28ee38b
from nltk.stem.isri import ISRIStemmer
from pyarabic.araby import strip_tashkeel, strip_tatweel
import numpy as np
import pandas as pd
import json
import re
import time
import os
import math
import random
# isristemmer = ISRIStemmer()
# def stemming(txt):
# return isristemmer.stem(txt)
def remove_singleCharacter(text):
    """Drop every one-character token from ``text``.

    The text is tokenized with ``pyarabic.araby.tokenize`` and the tokens
    whose length is not exactly 1 are joined back with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens separated by single spaces; an empty string
        when no token survives.
    """
    # Imported locally: the module-level imports only bring in
    # strip_tashkeel / strip_tatweel, and the previous code referenced an
    # ``ar`` alias that is not defined anywhere in this file (NameError).
    from pyarabic.araby import tokenize

    return ' '.join(token for token in tokenize(text) if len(token) != 1)
# remove_punctuations
def remove_punctuations(text):
    """Replace each listed punctuation character in ``text`` with a space.

    Every character found in the Latin or Arabic punctuation sets below is
    replaced one-for-one by a single space, so the length of the string is
    unchanged. Characters outside those sets (for example ``!`` and ``?``)
    are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every listed punctuation character turned into ' '.
    """
    # The backslash is escaped explicitly: a bare "\," is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    punc = '''()-[]{};:'"\\,<>./@#$%^&*،؛_~'''
    arabic_punctuations = '''`÷×؛_ـ،/:".,'~¦+|”…“–ـ=﴾﴿ ﹱ ﹹ ⸀˓• ב'''
    # One translation table and a single pass over the text instead of one
    # str.replace() call per punctuation character.
    to_space = str.maketrans(dict.fromkeys(punc + arabic_punctuations, ' '))
    return text.translate(to_space)
def normalize_text(txt):
    """Normalize ``txt`` and collapse runs of repeated characters.

    The text is passed through pyarabic's ``strip_tashkeel`` and then
    ``strip_tatweel``. Afterwards every run of identical consecutive
    characters is reduced to a single character; this applies to any
    character, so genuinely doubled letters are collapsed as well.

    Args:
        txt: The string to normalize.

    Returns:
        The normalized string.
    """
    stripped = strip_tatweel(strip_tashkeel(txt))
    # The first character (if any) is always kept; each later character is
    # kept only when it differs from the one immediately before it.
    kept = [stripped[:1]]
    kept.extend(
        current
        for previous, current in zip(stripped, stripped[1:])
        if current != previous
    )
    return ''.join(kept)
def remove_stopwords(txt, path="stopword.txt"):
    """Remove stop words from ``txt``.

    ``txt`` is split on single spaces, and every token that appears as a
    line in the stop-word file is dropped. The remaining tokens are joined
    back with single spaces.

    Args:
        txt: The string to filter.
        path: Path to a UTF-8 text file holding one stop word per line.

    Returns:
        The filtered text.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # The context manager closes the file handle; it was previously left open.
    with open(path, 'r', encoding='utf-8') as stop_file:
        # Split on '\n' (not splitlines) so a trailing newline still yields
        # an empty entry, which filters out the empty tokens produced by
        # consecutive spaces exactly as before. A set gives O(1) membership.
        stop_words = set(stop_file.read().split('\n'))
    return ' '.join(word for word in txt.split(' ') if word not in stop_words)
def Remove_unwanted(text):
    """Strip links, Latin letters and digits, then normalize Arabic letters.

    Steps, in order:
      1. Remove lines that start with a URL, then any ``http...`` token.
      2. Collapse whitespace, replace runs of ASCII letters with a space
         and trim the ends.
      3. Remove digits.
      4. Normalize letters: the alef variants become bare alef, alef
         maqsura becomes yeh, and hamza on waw / yeh becomes a standalone
         hamza.
      5. Collapse repeated spaces and trim the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, without leading or trailing spaces.
    """
    # --- links ---
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    # NOTE(review): `http?` makes the final "p" optional, so this also
    # matches lines starting with "htt://"; kept as-is to preserve behavior.
    text = re.sub(r'^http?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    # Also covers "https..." tokens, so no separate https pass is needed.
    text = re.sub(r"http\S+", " ", text)

    # --- whitespace and Latin letters ---
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[a-zA-Z]+', ' ', text)
    text = re.sub(r"^\s+|\s+$", "", text)

    # --- digits ---
    text = re.sub(r"(\s\d+)", " ", text)
    # The former leading `$\d+\W+` alternative could never match (nothing
    # follows the end anchor), so it is omitted.
    text = re.sub(r"\b\d+\b|\W+\d+$", " ", text)
    text = re.sub(r"\d+", " ", text)

    # --- letter normalization ---
    text = re.sub(r'[إأٱآا]', 'ا', text)
    # The replacement is the plain letter; it used to be the literal text
    # "[ي]", which inserted square brackets into the output.
    text = re.sub(r'ى', 'ي', text)
    # Pattern and replacement were swapped, which turned every standalone
    # hamza into the literal text "[ؤئ]".
    text = re.sub(r'[ؤئ]', 'ء', text)

    # The digit substitutions above can reintroduce spaces at the ends, so
    # trim again after collapsing repeated spaces.
    return re.sub(r' +', ' ', text).strip()
def txt_preprocess(text):
    """Run the full cleaning pipeline on ``text``.

    The steps are applied in this fixed order: ``normalize_text``,
    ``remove_stopwords`` (with its default stop-word file),
    ``remove_punctuations`` and finally ``Remove_unwanted``. Note that stop
    words are removed before punctuation is, so a stop word with
    punctuation attached to it is not filtered out.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Stemming is not part of the pipeline: the stemming helper is
    # commented out in this file.
    pipeline = (
        normalize_text,
        remove_stopwords,
        remove_punctuations,
        Remove_unwanted,
    )
    for step in pipeline:
        text = step(text)
    return text