# sentiment_analysis/preprocess.py
# Text-cleaning utilities (lowercasing, URL/punctuation removal,
# stopword filtering, lemmatization) for the sentiment-analysis pipeline.
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Ensure required NLTK corpora are available locally; nltk.data.find raises
# LookupError when a resource is missing, in which case we fetch it once.
for _pkg, _path in (("stopwords", "corpora/stopwords"),
                    ("wordnet", "corpora/wordnet")):
    try:
        nltk.data.find(_path)
    except LookupError:
        # quiet=True suppresses the download progress chatter on first run.
        nltk.download(_pkg, quiet=True)

# Module-level singletons shared by preprocess_text (built once at import).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Normalize a raw text string for sentiment analysis.

    Lowercases the input, strips URLs and all non-alphabetic characters,
    removes English stopwords, and lemmatizes the surviving tokens.

    Args:
        text: The input text. Any non-string value (e.g. NaN from a
            DataFrame) is treated as empty input.

    Returns:
        A single space-joined string of cleaned, lemmatized tokens
        ("" for non-string input or when nothing survives filtering).
    """
    if not isinstance(text, str):
        return ""
    # Lowercase before filtering so the character class below can rely on it.
    text = text.lower()
    # Remove URLs first so their letters are not partially kept by the
    # alphabetic filter on the next line.
    text = re.sub(r'http\S+', '', text)
    # Keep only letters and whitespace (digits and punctuation dropped).
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Plain whitespace split suffices after the character filtering above.
    words = text.split()
    clean_words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(clean_words)