File size: 1,028 Bytes
bd9870c
 
 
 
 
35de82b
 
bd9870c
 
 
 
35de82b
ba852a7
bd9870c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import re
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import nltk


class TextCleaner:
    """Turn raw, possibly HTML-bearing text into a string of unique lemmas.

    Constructing an instance calls ``nltk.download`` for the ``stopwords``
    and ``wordnet`` resources, then builds the English stop-word set
    (``self.stop_words``) and a ``WordNetLemmatizer`` (``self.lemmatizer``).
    """

    # Everything replaced by a space before tokenizing: http(s) URLs,
    # "www." links, @mentions, #hashtags, and any character that is not an
    # ASCII letter. Compiled once instead of on every call.
    _NOISE_PATTERN = re.compile(r'https?://\S+|www\.\S+|@\w+|#\w+|[^a-zA-Z]')

    def __init__(self):
        # NOTE(review): this installs a process-wide "ignore" filter for all
        # warnings, not only ones raised by this class. Kept unchanged so
        # existing callers see the same behavior.
        warnings.filterwarnings("ignore")
        # quiet=True suppresses the downloader's progress output, which would
        # otherwise be printed each time a cleaner is constructed.
        nltk.download('stopwords', quiet=True)
        nltk.download('wordnet', quiet=True)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def cleaning_text(self, text):
        """Return ``text`` reduced to lowercase, de-duplicated lemmas.

        Steps: strip HTML markup, lowercase, blank out URLs / mentions /
        hashtags / non-letters, drop single-character tokens and English
        stop words, lemmatize what remains, and keep only the first
        occurrence of each lemma (original order preserved).

        Args:
            text: The value to clean. Anything that is not a non-empty
                ``str`` yields an empty string.

        Returns:
            The cleaned tokens joined by single spaces, or ``''``.
        """
        if not text or not isinstance(text, str):
            return ''

        # A space separator keeps words from adjacent tags apart; with the
        # default empty separator "<p>bad</p><p>person</p>" collapses into
        # the single bogus token "badperson".
        plain = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        plain = self._NOISE_PATTERN.sub(' ', plain.lower())

        lemmas = (
            self.lemmatizer.lemmatize(word)
            for word in plain.split()
            if len(word) > 1 and word not in self.stop_words
        )
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return ' '.join(dict.fromkeys(lemmas))

if __name__ == "__main__":
    # Demo when run as a script: clean one sample sentence and print it.
    sample_sentence = "I feel bullied online."
    print(TextCleaner().cleaning_text(sample_sentence))