# Utilities
import re
import string

import nltk

# Stopword corpus is needed at import time (see _build_stopwords below).
nltk.download("stopwords")

# Constants
# NOTE(review): the original paths began with a stray space (" models/...")
# which would make the files unresolvable — removed as an evident typo.
TASK_1_MODEL = "models/TASK_A_model_final.pkl"
TASK_2_MODEL = "models/TASK_B_model_final.pkl"

TASK_1_MAP = {
    0: "NAG - Non Aggressive Content",
    1: "CAG - Covertly Aggressive Content",
    2: "OAG - Overtly Aggressive Content",
}
TASK_2_MAP = {
    0: "NGEN - Non Misogynistic Content",
    1: "GEN - Misogynistic Content",
}

# Translation table stripping punctuation; apostrophes are kept so that
# contractions ("don't", "isn't") survive into tokenization.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation.replace("'", ""))

# Matches n't-style contractions within the stopword list.
_NT_RE = re.compile(r"n't")

# Built once at module load instead of on every call to clean_one_text.
_TOKENIZER = nltk.TweetTokenizer()
_STEMMER = nltk.stem.porter.PorterStemmer()


def _build_stopwords() -> set:
    """Return English stopwords minus negation words.

    Negations (any n't contraction plus "against", "no", "nor", "not")
    are removed from the stopword set so they remain in the cleaned text.
    """
    stops = set(nltk.corpus.stopwords.words("english"))
    # `search` is a plain boolean predicate here; the original used
    # `findall`, which builds a throwaway list per word.
    negations = {word for word in stops if _NT_RE.search(word)}
    negations.update(("against", "no", "nor", "not"))
    return stops - negations


_STOPWORDS = _build_stopwords()


def clean_one_text(text: str) -> str:
    """Clean one text and return it.

    Pipeline: strip punctuation (apostrophes kept), tokenize with
    TweetTokenizer, drop non-negation English stopwords
    (case-insensitively), and Porter-stem the surviving tokens.

    Args:
        text: raw input string (e.g. a tweet or comment).

    Returns:
        A single space-joined string of stemmed tokens.
    """
    stripped = text.translate(_PUNCT_TABLE)
    kept = [
        token
        for token in _TOKENIZER.tokenize(stripped)
        if token.lower() not in _STOPWORDS
    ]
    return " ".join(_STEMMER.stem(token) for token in kept)