Spaces:
Sleeping
Sleeping
File size: 1,013 Bytes
e0c55bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
"""
// function.py //
This programme was created to store the function used through out this project.
"""
import re
from nltk.tokenize import word_tokenize
# Create A Function for Text Preprocessing
def text_preprocessing(text, lemmatizer, sw):
    """Clean and normalize a raw text string for NLP modeling.

    Pipeline: lowercase -> strip mentions/hashtags/newlines/URLs ->
    keep only letters, whitespace, and apostrophes -> tokenize ->
    remove stopwords -> lemmatize -> rejoin into a single string.

    Args:
        text: Raw input string (e.g. a tweet or review).
        lemmatizer: Object exposing ``lemmatize(word) -> str``
            (e.g. ``nltk.stem.WordNetLemmatizer``).
        sw: Collection of stopwords to drop (membership-tested per token).

    Returns:
        The preprocessed text as a single space-joined string.
    """
    # Case folding
    text = text.lower()
    # Mention removal
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
    # Hashtags removal
    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)
    # Newline removal: handle both real newline characters and the
    # two-character literal "\n" that appears in some CSV/JSON exports.
    # (The original r"\\n" pattern only matched the literal form, so
    # actual newlines were never removed.)
    text = re.sub(r"\\n|\n", " ", text)
    # URL removal (dot escaped so "www" must be followed by a literal '.')
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)
    # Non-letter removal (such as emoticon, symbol (like μ, $, 兀), etc.
    # Raw string avoids an invalid-escape DeprecationWarning for \s.
    text = re.sub(r"[^A-Za-z\s']", " ", text)
    # Whitespace removal — done after all substitutions so padding
    # introduced by them is trimmed too.
    text = text.strip()
    # Tokenization
    tokens = word_tokenize(text)
    # Stopwords removal
    tokens = [word for word in tokens if word not in sw]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Combining Tokens
    text = ' '.join(tokens)
    return text
|