Spaces:

Na2seenWahedd
/

EmotionDetectionOrganized

Sleeping

App Files Files Community

EmotionDetectionOrganized / Cleaning.py

MahmoudNasser

Update Cleaning.py

28ee38b over 1 year ago

raw

history blame

2.46 kB

	from nltk.stem.isri import ISRIStemmer
	from pyarabic.araby import strip_tashkeel, strip_tatweel
	import numpy as np
	import pandas as pd
	import json
	import re
	import time
	import os
	import math
	import random

	# isristemmer = ISRIStemmer()
	# def stemming(txt):
	# return isristemmer.stem(txt)


	def remove_singleCharacter(text):
	text_tokenized = ar.tokenize(text)
	clean_txt = ''
	for word in text_tokenized:
	if len(word) != 1:
	clean_txt = clean_txt + word + ' '

	return clean_txt[:-1]

	# remove_punctuations
	def remove_punctuations(text):
	punc = '''()-[]{};:'"\,<>./@#$%^&*،؛_~'''
	arabic_punctuations = '''`÷×؛_ـ،/:".,'~¦+\|”…“–ـ=﴾﴿ ﹱ ﹹ ⸀˓• ב'''
	punctuations_list = punc + arabic_punctuations
	for x in punctuations_list:
	text = text.replace(x, ' ')
	return text


	def normalize_text(txt):
	txt = strip_tashkeel(txt)
	txt = strip_tatweel(txt)
	txt = ''.join(txt[i] for i in range(len(txt)) if i ==
	0 or txt[i-1] != txt[i]) # remove repeated characters
	return txt


	def remove_stopwords(txt, path="stopword.txt"):
	text_tokenized = txt.split(' ')
	clean_txt = ''
	# useful_words=[]
	# filtered_sentence=" "
	arabic_stop_words_file = open(path, 'r', encoding='utf-8')
	arabic_stop_words = arabic_stop_words_file.read().split('\n')
	for word in text_tokenized:
	if word not in arabic_stop_words:
	clean_txt = clean_txt + word + ' '

	return clean_txt[:-1]


	def Remove_unwanted(text):
	# removing the extra spacing and links

	text = re.sub(r'^https?:\/\/.[\r\n]', ' ', text, flags=re.MULTILINE)
	text = re.sub(r'^http?:\/\/.[\r\n]', ' ', text, flags=re.MULTILINE)
	text = re.sub(r"http\S+", " ", text)
	text = re.sub(r"https\S+", " ", text)
	text = re.sub(r'\s+', ' ', text)
	text = re.sub(r'[a-zA-Z]+', ' ', text)
	text = re.sub(r"^\s+\|\s+$", "", text)
	text = re.sub(r"(\s\d+)", " ", text)
	text = re.sub(r"$\d+\W+\|\b\d+\b\|\W+\d+$", " ", text)
	text = re.sub(r"\d+", " ", text)
	text = re.sub(r'[إأٱآا]', 'ا', text)
	text = re.sub(r'ى', '[ي]', text)
	text = re.sub(r'ء', '[ؤئ]', text)
	text = re.sub(r' +', ' ', text)
	return text


	def txt_preprocess(text):
	text = normalize_text(text)
	# text = stemming(text)
	text = remove_stopwords(text)
	text = remove_punctuations(text)
	text = Remove_unwanted(text)
	return text