# NOTE: this file was recovered from a web scrape; the original page banner
# ("Spaces: Sleeping") has been removed so the module parses cleanly.
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Create a function for text preprocessing
def text_preprocessing(text, stop_words, lemmatizer):
    """Clean and normalize a raw text string for NLP modelling.

    Pipeline: lowercase -> strip @mentions/#hashtags -> drop newlines ->
    remove URLs and non-letter characters -> tokenize -> remove stopwords ->
    lemmatize -> rejoin into a single space-separated string.

    Args:
        text: Raw input string (e.g. a tweet or comment).
        stop_words: Collection of lowercase stopwords to drop
            (membership-tested with ``in``).
        lemmatizer: Object exposing ``lemmatize(word)``
            (e.g. ``nltk.stem.WordNetLemmatizer``).

    Returns:
        The processed text as a single string of space-joined tokens.
    """
    # Case folding
    text = text.lower()
    # Mention and hashtag removal
    text = re.sub(r"@[A-Za-z0-9_]+|#[A-Za-z0-9_]+", " ", text)
    # Newline removal. BUGFIX: the original used r"\\n", which in a raw
    # string is the pattern \\n — a literal backslash followed by 'n' —
    # so actual newline characters were never removed. r"\n" matches a
    # real newline; the literal "\n" two-character sequence is also
    # handled, preserving the old behavior for escaped scraped text.
    text = re.sub(r"\n|\\n", " ", text)
    # URL and non-letter removal. BUGFIX: the dot in "www." is now
    # escaped so it matches a literal period rather than any character.
    text = re.sub(r"http\S+|www\.\S+|[^A-Za-z\s']", " ", text)
    # Tokenization
    tokens = word_tokenize(text)
    # Stopword removal (text is already lowercase after the case folding
    # above, so the original redundant word.lower() is dropped)
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization instead of stemming
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Combine tokens back into one string
    processed_text = ' '.join(tokens)
    return processed_text