from nltk.stem.isri import ISRIStemmer
from pyarabic.araby import strip_tashkeel, strip_tatweel, tokenize
import re

# isristemmer = ISRIStemmer()
# def stemming(txt):
#     return isristemmer.stem(txt)


def remove_singleCharacter(text):
    """Drop single-character tokens from the text."""
    text_tokenized = tokenize(text)
    clean_txt = ''
    for word in text_tokenized:
        if len(word) != 1:
            clean_txt = clean_txt + word + ' '
    return clean_txt[:-1]


def remove_punctuations(text):
    """Replace Latin and Arabic punctuation marks with spaces."""
    punc = '''()-[]{};:'"\\,<>./@#$%^&*،؛_~'''
    arabic_punctuations = '''`÷×؛_ـ،/:".,'~¦+|”…“–ـ=﴾﴿ ﹱ ﹹ ⸀˓• ב'''
    punctuations_list = punc + arabic_punctuations
    for x in punctuations_list:
        text = text.replace(x, ' ')
    return text


def normalize_text(txt):
    """Remove diacritics (tashkeel) and tatweel, and collapse repeated characters."""
    txt = strip_tashkeel(txt)
    txt = strip_tatweel(txt)
    # collapse runs of the same character (e.g. elongated letters) into one
    txt = ''.join(txt[i] for i in range(len(txt)) if i == 0 or txt[i - 1] != txt[i])
    return txt


def remove_stopwords(txt, path="stopword.txt"):
    """Remove Arabic stop words listed one per line in the given file."""
    text_tokenized = txt.split(' ')
    clean_txt = ''
    with open(path, 'r', encoding='utf-8') as arabic_stop_words_file:
        arabic_stop_words = arabic_stop_words_file.read().split('\n')
    for word in text_tokenized:
        if word not in arabic_stop_words:
            clean_txt = clean_txt + word + ' '
    return clean_txt[:-1]


def Remove_unwanted(text):
    """Remove links, Latin letters, digits and extra spacing; normalize Arabic letter variants."""
    # strip URLs (line-initial and inline)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    text = re.sub(r"https?\S+", " ", text)
    # strip Latin letters, digits and redundant whitespace
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[a-zA-Z]+', ' ', text)
    text = re.sub(r"^\s+|\s+$", "", text)
    text = re.sub(r"(\s\d+)", " ", text)
    text = re.sub(r"^\d+\W+|\b\d+\b|\W+\d+$", " ", text)
    text = re.sub(r"\d+", " ", text)
    # normalize alef variants, alef maqsura, and hamza-carrying letters
    text = re.sub(r'[إأٱآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'[ؤئ]', 'ء', text)
    text = re.sub(r' +', ' ', text)
    return text


def txt_preprocess(text):
    """Full preprocessing pipeline: normalize, then remove stop words, punctuation and noise."""
    text = normalize_text(text)
    # text = stemming(text)
    text = remove_stopwords(text)
    text = remove_punctuations(text)
    text = Remove_unwanted(text)
    return text
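
# Minimal usage sketch (assumptions: a "stopword.txt" file with one Arabic stop
# word per line exists next to this script, and the sample sentence below is
# illustrative only, not part of the original project).
if __name__ == "__main__":
    sample = "هذاا مثالٌ بسيط لتجربة المعالجة المسبقة للنص العربي http://example.com 123"
    print(txt_preprocess(sample))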