import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from typing import List


class TextPreprocessor:
    def __init__(self):
        # Fetch required NLTK data once at construction; quiet=True
        # suppresses the download progress output.
        nltk.download('punkt', quiet=True)
        # Newer NLTK releases (3.9+) resolve word_tokenize via the
        # 'punkt_tab' tables; downloading both keeps this portable.
        nltk.download('punkt_tab', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text: str) -> str:
        """Clean and normalize text."""
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and numbers (keep letters and whitespace)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into words."""
        return word_tokenize(text)

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Remove English stop words from the token list."""
        return [token for token in tokens if token not in self.stop_words]

    def lemmatize(self, tokens: List[str]) -> List[str]:
        """Lemmatize tokens to their WordNet base forms."""
        return [self.lemmatizer.lemmatize(token) for token in tokens]

    def process(self, text: str) -> List[str]:
        """Complete preprocessing pipeline: clean, tokenize, filter, lemmatize."""
        cleaned_text = self.clean_text(text)
        tokens = self.tokenize(cleaned_text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.lemmatize(tokens)
        return tokens
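

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal example of running the full pipeline end to end. The sample
# sentence and the printed expectation below are assumptions added for
# demonstration; exact output can vary slightly with the NLTK stopword
# list shipped in your installed version.
if __name__ == "__main__":
    preprocessor = TextPreprocessor()
    sample = "The 3 quick brown foxes are jumping over the lazy dogs!"
    print(preprocessor.process(sample))
    # Roughly expected: ['quick', 'brown', 'fox', 'jumping', 'lazy', 'dog']
    # ('the', 'are', 'over' are dropped as stop words; '3' and '!' are
    # stripped by clean_text; 'foxes'/'dogs' lemmatize to 'fox'/'dog').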