|
import re |
|
import warnings |
|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
from bs4 import BeautifulSoup |
|
|
|
class TextCleaner:
    """Turn raw, possibly HTML-laden text into a string of unique lemmas.

    The cleaning pipeline strips markup, lower-cases the text, removes URLs,
    @mentions, #hashtags and every non-letter character, drops English stop
    words and one-character tokens, lemmatizes what remains, and finally
    removes repeated tokens while keeping first-occurrence order.
    """

    # URLs, @mentions, #hashtags, then any remaining non-letter character.
    # Compiled once at class creation rather than on every call.
    _NOISE_PATTERN = re.compile(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]')

    def __init__(self):
        """Load the English stop-word set and create the lemmatizer."""
        # NOTE: this installs a process-wide "ignore" filter, so warnings
        # raised anywhere after construction are silenced, not only the ones
        # triggered by this class.
        warnings.filterwarnings("ignore")
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def cleaning_text(self, text):
        """Return the cleaned, lemmatized, de-duplicated form of ``text``.

        Args:
            text: The raw input. Anything that is not a non-empty ``str``
                yields an empty string.

        Returns:
            A single space-separated string of unique lemmas in order of
            first appearance, or ``''`` when the input is unusable.
        """
        # Check the type before truthiness: bool() of an arbitrary object
        # (e.g. a multi-element array) may raise instead of answering.
        if not isinstance(text, str) or not text:
            return ''

        # A separator keeps words on either side of a tag apart
        # ("a<br>b" -> "a b"); without it they would fuse into one token.
        plain = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        plain = self._NOISE_PATTERN.sub(' ', plain.lower())

        # Stop words and single letters are filtered before lemmatization.
        lemmas = [
            self.lemmatizer.lemmatize(word)
            for word in plain.split()
            if len(word) > 1 and word not in self.stop_words
        ]

        # dict.fromkeys drops duplicates while preserving insertion order.
        return ' '.join(dict.fromkeys(lemmas))
|
|
|
if __name__ == "__main__":
    # Smoke test: clean one sample sentence and print the result.
    sample_sentence = "I feel bullied online."
    print(TextCleaner().cleaning_text(sample_sentence))
|
|