File size: 1,111 Bytes
18fcef9
 
 
15c875a
 
18fcef9
e788caa
18fcef9
 
e788caa
 
 
 
 
 
 
 
 
 
18fcef9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15c875a
18fcef9
 
 
 
15c875a
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import functools
import re
import string
import warnings

import nltk
from nltk.stem.porter import PorterStemmer


# Deletes every ASCII punctuation character except the apostrophe, which is
# kept so that contractions such as "don't" are not broken apart.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation.replace("'", ""))

# Negation words that stay in the text even though NLTK lists them as
# English stopwords.
_NEGATION_WORDS = frozenset(("against", "no", "nor", "not"))


@functools.lru_cache(maxsize=1)
def _removable_stopwords() -> frozenset:
    """Return the English stopwords that ``clean_one_text`` drops.

    The set is NLTK's English stopword list minus every entry containing
    ``n't`` and minus the words in ``_NEGATION_WORDS``, so negations are
    preserved in cleaned text.

    The result is cached so the stopword list is requested from NLTK once
    per process instead of on every call. A failed lookup raises and is not
    cached, so a later call retries.
    """
    return frozenset(
        word
        for word in nltk.corpus.stopwords.words("english")
        if "n't" not in word and word not in _NEGATION_WORDS
    )


def clean_one_text(text: str) -> str:
    """
    Clean one text: strip punctuation, drop stopwords, and stem the rest.

    Apostrophes are kept while other ASCII punctuation is removed. The text
    is tokenized with NLTK's ``TweetTokenizer``; tokens whose lowercase form
    is a non-negation English stopword are discarded, and the remaining
    tokens are stemmed with ``PorterStemmer``.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The stemmed tokens joined by single spaces (empty string when
            no token survives).

    Raises:
        LookupError: Raised by NLTK when the ``stopwords`` corpus is not
            installed (``setup_nltk`` downloads it).

    """
    without_punctuation = text.translate(_PUNCTUATION_TABLE)

    stopwords = _removable_stopwords()
    tokenizer = nltk.TweetTokenizer()
    stemmer = PorterStemmer()

    # Compare case-insensitively against the stopword set, but hand the
    # original token to the stemmer, exactly as before.
    kept_tokens = (
        token
        for token in tokenizer.tokenize(without_punctuation)
        if token.lower() not in stopwords
    )
    return " ".join(stemmer.stem(token) for token in kept_tokens)


def setup_nltk():
    """Download the NLTK ``stopwords`` resource via ``nltk.download``."""
    resource_name = "stopwords"
    nltk.download(resource_name)


def initialize():
    """Prepare the process for text cleaning.

    Installs a global "ignore" filter for all warnings, then runs
    ``setup_nltk`` to download the required NLTK data.
    """
    # The filter is installed first, so it is already active while
    # setup_nltk() runs.
    warnings.filterwarnings(action="ignore")
    setup_nltk()