File size: 881 Bytes
c423295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk


# Patterns used by text_preprocessing, compiled once at import time so that
# calling the function once per document does not repeat the pattern lookup.
_ESCAPED_NEWLINE_RE = re.compile(r"\\n")  # a literal backslash followed by "n"
_URL_RE = re.compile(r"http\S+|www\.\S+")  # dot escaped: "www." only
_MENTION_HASHTAG_RE = re.compile(r"[@#][A-Za-z0-9_]+")
_NON_LETTER_RE = re.compile(r"[^A-Za-z\s']")


def text_preprocessing(text: str, stop_words, lemmatizer) -> str:
    """Clean a raw text string and return its lemmatized tokens as one string.

    Steps, in order: case folding, replacement of literal ``\\n`` sequences,
    URL removal, mention/hashtag removal, removal of every character that is
    not an ASCII letter, whitespace or apostrophe, tokenization with
    ``word_tokenize``, stop-word filtering, and lemmatization.

    Args:
        text: The raw text to clean.
        stop_words: Collection of words to drop; only membership (``in``)
            is used, so a ``set`` is the cheapest choice.
        lemmatizer: Object exposing ``lemmatize(word)``; called once per
            remaining token.

    Returns:
        The surviving, lemmatized tokens joined by single spaces. An empty
        string is returned when nothing but whitespace is left after cleaning.
    """
    # Case folding
    cleaned = text.lower()

    # Literal "\n" sequences (backslash + n) become spaces. This runs before
    # URL removal so that a URL directly followed by an escaped newline does
    # not swallow the word after it.
    cleaned = _ESCAPED_NEWLINE_RE.sub(" ", cleaned)

    # URLs go first: stripping mentions/hashtags beforehand would cut a URL
    # containing "@" or "#" in two and leave its tail behind as a bogus token.
    cleaned = _URL_RE.sub(" ", cleaned)

    # Mention and hashtag removal
    cleaned = _MENTION_HASHTAG_RE.sub(" ", cleaned)

    # Drop everything that is not a letter, whitespace or an apostrophe
    cleaned = _NON_LETTER_RE.sub(" ", cleaned)

    # Nothing left to tokenize: skip the tokenizer entirely.
    if not cleaned.strip():
        return ""

    # Tokenization
    tokens = word_tokenize(cleaned)

    # Stopwords removal
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # Lemmatization instead of stemming
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Combining tokens
    return " ".join(tokens)