Spaces:
Sleeping
Sleeping
""" | |
// function.py // | |
This programme was created to store the function used through out this project. | |
""" | |
import re
from nltk.tokenize import word_tokenize
# Create A Function for Text Preprocessing
def text_preprocessing(text, lemmatizer, sw):
    """Clean and normalize a raw text string for NLP modeling.

    Pipeline: case folding -> mention/hashtag removal -> newline removal ->
    URL removal -> non-letter removal -> tokenization -> stopword removal ->
    lemmatization -> re-join with single spaces.

    Args:
        text (str): raw input text (e.g. a tweet).
        lemmatizer: object exposing a ``lemmatize(word) -> str`` method
            (presumably nltk's WordNetLemmatizer — confirm at call site).
        sw: collection of stopwords; each token is membership-tested against it.

    Returns:
        str: the cleaned text, tokens joined by single spaces.
    """
    # Case folding
    text = text.lower()
    # Mention removal (@username)
    text = re.sub(r"@[A-Za-z0-9_]+", " ", text)
    # Hashtag removal (#tag)
    text = re.sub(r"#[A-Za-z0-9_]+", " ", text)
    # Newline removal: real newlines AND literal "\n" sequences left by
    # scraping. (The old pattern r"\\n" only matched the two-char literal,
    # so actual newline characters were never removed.)
    text = re.sub(r"\\n|\n", " ", text)
    # Whitespace removal
    text = text.strip()
    # URL removal
    text = re.sub(r"http\S+", " ", text)
    # Escaped the dot: the old r"www.\S+" let '.' match any character,
    # wrongly removing words like "wwwXfoo".
    text = re.sub(r"www\.\S+", " ", text)
    # Non-letter removal (emoticons, symbols such as mu, $, digits, etc.);
    # apostrophes are kept so contractions survive tokenization.
    text = re.sub(r"[^A-Za-z\s']", " ", text)
    # Tokenization
    tokens = word_tokenize(text)
    # Stopwords removal
    tokens = [word for word in tokens if word not in sw]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Combining tokens back into a single space-separated string
    return ' '.join(tokens)