Spaces:
Sleeping
Sleeping
File size: 1,111 Bytes
18fcef9 15c875a 18fcef9 e788caa 18fcef9 e788caa 18fcef9 15c875a 18fcef9 15c875a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import string
import nltk
import re
from nltk.stem.porter import PorterStemmer
import warnings
# Cleans one text
def clean_one_text(text: str) -> str:
    """
    Normalise a single text: strip punctuation, drop stopwords, stem tokens.

    All characters in ``string.punctuation`` except the apostrophe are
    removed. The result is tokenised with NLTK's ``TweetTokenizer``, tokens
    whose lowercase form is an English stopword are discarded, and the
    remaining tokens are stemmed with ``PorterStemmer``. Negation words
    (stopwords containing "n't", plus "against", "no", "nor" and "not")
    are deliberately NOT treated as stopwords and therefore survive.

    Requires the NLTK "stopwords" corpus to be available.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    # Strip every punctuation character except the apostrophe.
    punctuation_to_strip = string.punctuation.replace("'", "")
    stripped = text.translate(str.maketrans("", "", punctuation_to_strip))

    tokenizer = nltk.TweetTokenizer()
    english_stopwords = set(nltk.corpus.stopwords.words("english"))

    # Negations are kept in the text, so remove them from the stopword set.
    negation_pattern = re.compile(r"n't")
    negations = {word for word in english_stopwords if negation_pattern.findall(word)}
    negations |= {"against", "no", "nor", "not"}
    droppable = english_stopwords - negations

    stemmer = PorterStemmer()
    stemmed_tokens = [
        stemmer.stem(token)
        for token in tokenizer.tokenize(stripped)
        if token.lower() not in droppable
    ]
    return " ".join(stemmed_tokens)
def setup_nltk():
    """
    Make sure the NLTK "stopwords" corpus is available locally.

    The corpus is looked up in NLTK's data path first and downloaded only
    when it is missing, so repeated start-ups do not contact the download
    server once the corpus is installed.
    """
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        # Not installed yet: fetch it.
        nltk.download("stopwords")
def initialize():
    """
    Prepare the module for use.

    Installs an "ignore" filter for all warnings (process-wide) and then
    runs ``setup_nltk()`` to set up the NLTK resources.
    """
    warnings.filterwarnings(action="ignore")
    setup_nltk()
|