File size: 2,462 Bytes
0901162
 
 
 
 
 
 
 
 
 
 
28ee38b
 
 
0901162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28ee38b
0901162
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from nltk.stem.isri import ISRIStemmer
from pyarabic.araby import strip_tashkeel, strip_tatweel
import numpy as np
import pandas as pd
import json
import re
import time
import os
import math
import random

# isristemmer = ISRIStemmer()
# def stemming(txt):
#     return isristemmer.stem(txt)


def remove_singleCharacter(text):
    """Drop every one-character token from *text*.

    The text is tokenized with ``pyarabic.araby.tokenize``; tokens whose
    length is not exactly 1 are kept and re-joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by one space each, or an empty string
        when no token survives.
    """
    # The module never binds the name ``ar`` that the previous version
    # called, so every call raised NameError. Import the tokenizer
    # explicitly from the pyarabic package the file already depends on.
    from pyarabic.araby import tokenize

    return ' '.join(word for word in tokenize(text) if len(word) != 1)

# remove_punctuations
# Every character that remove_punctuations blanks out: the Latin/common
# set followed by the Arabic/typographic set. The backslash is written as
# ``\\`` because ``\,`` is an invalid escape sequence.
_PUNCTUATION_CHARS = (
    '''()-[]{};:'"\\,<>./@#$%^&*،؛_~'''
    '''`÷×؛_ـ،/:".,'~¦+|”…“–ـ=﴾﴿ ﹱ ﹹ ⸀˓• ב'''
)

# Translation table mapping each punctuation code point to a single space.
_PUNCTUATION_TABLE = {ord(char): ' ' for char in _PUNCTUATION_CHARS}


def remove_punctuations(text):
    """Replace every listed punctuation character in *text* with a space.

    Each occurrence is replaced one-for-one, so the length of the string
    is unchanged and runs of punctuation become runs of spaces.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with each punctuation character turned into ' '.
    """
    # One pass over the string instead of one ``replace`` per character.
    return text.translate(_PUNCTUATION_TABLE)


def normalize_text(txt):
    """Strip tashkeel and tatweel, then collapse runs of a repeated character.

    After ``strip_tashkeel`` and ``strip_tatweel`` have been applied, any
    character equal to the one immediately before it is dropped, so each
    run of identical consecutive characters is reduced to one occurrence.
    """
    stripped = strip_tatweel(strip_tashkeel(txt))

    kept = []
    previous = None  # no character precedes the first one
    for char in stripped:
        if char != previous:
            kept.append(char)
        previous = char
    return ''.join(kept)


def remove_stopwords(txt, path="stopword.txt"):
    """Remove the stop words listed in a file from *txt*.

    The file at *path* is read as UTF-8 and split on ``'\\n'``; each
    resulting entry is a stop word. *txt* is split on single spaces and
    every token that exactly equals a stop word is dropped.

    Note that a trailing newline or a blank line in the file yields an
    empty-string entry, in which case empty tokens (produced by
    consecutive spaces in *txt*) are dropped as well.

    Args:
        txt: The space-separated text to filter.
        path: Path of the stop-word file, one word per line.

    Returns:
        The remaining tokens joined by single spaces.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # ``with`` guarantees the handle is closed; the previous version
    # leaked it on every call.
    with open(path, 'r', encoding='utf-8') as stop_words_file:
        # A set gives O(1) membership tests instead of scanning a list.
        stop_words = set(stop_words_file.read().split('\n'))

    return ' '.join(
        word for word in txt.split(' ') if word not in stop_words
    )


# Ordered (compiled pattern, replacement) steps applied by Remove_unwanted.
# The order matters: later steps rely on the whitespace collapsing and
# trimming done by earlier ones.
_UNWANTED_STEPS = tuple(
    (re.compile(pattern, flags), replacement)
    for pattern, replacement, flags in (
        # Drop whole lines that begin with a URL scheme.
        (r'^https?:\/\/.*[\r\n]*', ' ', re.MULTILINE),
        (r'^http?:\/\/.*[\r\n]*', ' ', re.MULTILINE),
        # Drop any remaining token starting with "http". This also covers
        # "https...", so no separate https pass is needed.
        (r"http\S+", " ", 0),
        # Collapse every whitespace run (including newlines) to one space.
        (r'\s+', ' ', 0),
        # Remove ASCII letters.
        (r'[a-zA-Z]+', ' ', 0),
        # Trim both ends before the digit rules so ``$`` sees the real end.
        (r"^\s+|\s+$", "", 0),
        # Remove numbers, plus non-word characters glued to a trailing one.
        (r"(\s\d+)", " ", 0),
        (r"\b\d+\b|\W+\d+$", " ", 0),
        (r"\d+", " ", 0),
        # Normalize alef variants to bare alef.
        (r'[إأٱآا]', 'ا', 0),
        # Normalize alef maksura to yeh.
        (r'ى', 'ي', 0),
        # Normalize hamza carried on waw/yeh to a standalone hamza.
        (r'[ؤئ]', 'ء', 0),
        # Collapse the spaces introduced by the replacements above.
        (r' +', ' ', 0),
    )
)


def Remove_unwanted(text):
    """Strip links, Latin letters, digits and extra spacing from *text*.

    Also normalizes some Arabic letter variants: the alef forms become
    bare alef, alef maksura becomes yeh, and hamza on waw/yeh becomes a
    standalone hamza.

    Fixes relative to the previous version:
      * The two letter-normalization steps used ``'[ي]'`` and ``'[ؤئ]'``
        as *replacement* strings, which inserted literal square brackets
        into the output (and the hamza step had pattern and replacement
        swapped).
      * The result is stripped at the end, so the spaces introduced by the
        digit rules can no longer remain at either end.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with single spaces between tokens and no
        leading or trailing whitespace.
    """
    for pattern, replacement in _UNWANTED_STEPS:
        text = pattern.sub(replacement, text)
    return text.strip()


def txt_preprocess(text):
    """Run the complete cleaning pipeline over *text* and return the result.

    The stages run in this order, each receiving the previous stage's
    output: ``normalize_text``, ``remove_stopwords`` (with its default
    stop-word file path), ``remove_punctuations``, ``Remove_unwanted``.
    Stemming is not part of the pipeline.
    """
    stages = (
        normalize_text,
        remove_stopwords,
        remove_punctuations,
        Remove_unwanted,
    )
    for stage in stages:
        text = stage(text)
    return text