Spaces:
Running
Running
File size: 881 Bytes
c423295 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
# Create A Function for Text Preprocessing
def text_preprocessing(text, stop_words, lemmatizer):
    """Clean and normalize raw (social-media style) text.

    Pipeline: lowercase -> strip @mentions and #hashtags -> remove
    newlines -> remove URLs and non-letter characters -> tokenize ->
    drop stopwords -> lemmatize -> re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw input text.
    stop_words : collection of str
        Lowercase stopwords to drop after tokenization
        (e.g. ``set(stopwords.words("english"))``).
    lemmatizer : object
        Anything exposing ``lemmatize(word) -> str``,
        e.g. ``nltk.stem.WordNetLemmatizer()``.

    Returns
    -------
    str
        The cleaned, space-joined tokens.
    """
    # Case folding
    text = text.lower()
    # Mention and Hashtags removal
    text = re.sub(r"@[A-Za-z0-9_]+|#[A-Za-z0-9_]+", " ", text)
    # Newline removal: handle both a real newline and the literal
    # two-character sequence "\n" that scraped/exported data often
    # contains. (The original pattern r"\\n" matched only the literal
    # backslash-n form and left real newlines untouched.)
    text = re.sub(r"\\n|\n", " ", text)
    # URL and Non-letter removal. The dot in "www\." is escaped —
    # the original "www." matched any character after "www".
    text = re.sub(r"http\S+|www\.\S+|[^A-Za-z\s']", " ", text)
    # Tokenization
    tokens = word_tokenize(text)
    # Stopwords removal (text is already lowercased above, so no
    # per-token .lower() call is needed)
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization instead of stemming
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Combining Tokens
    processed_text = ' '.join(tokens)
    return processed_text
|