# NOTE(review): removed non-Python page residue ("Spaces: / Sleeping / Sleeping")
# left over from scraping the hosting page; it was not part of the program.
# Utilities
import string
import nltk
import re

# The English stopword corpus is required by clean_one_text() below.
nltk.download("stopwords")

# Constants
# NOTE(review): the original paths carried a leading space (" models/...") —
# assumed to be extraction garbling and stripped here; confirm against the
# actual on-disk layout before shipping.
TASK_1_MODEL = "models/TASK_A_model_final.pkl"
TASK_2_MODEL = "models/TASK_B_model_final.pkl"

# Human-readable labels for Task A (aggression) class indices.
TASK_1_MAP = {
    0: "NAG - Non Aggressive Content",
    1: "CAG - Covertly Aggressive Content",
    2: "OAG - Overtly Aggressive Content",
}

# Human-readable labels for Task B (misogyny) class indices.
TASK_2_MAP = {
    0: "NGEN - Non Misogynistic Content",
    1: "GEN - Misogynistic Content",
}
def clean_one_text(text: str) -> str:
    """Normalize one raw text for model input.

    Pipeline: strip punctuation (apostrophes are kept so contractions
    survive), tokenize with NLTK's TweetTokenizer, drop English stopwords
    except negation-bearing ones (removing those would flip sentiment),
    then Porter-stem the surviving tokens.

    Args:
        text: Raw input string.

    Returns:
        A single space-joined string of stemmed, stopword-filtered tokens
        (empty string for empty/all-stopword input).
    """
    # Remove all punctuation except the apostrophe, so "don't" stays intact.
    punct = string.punctuation.replace("'", "")
    stripped = text.translate(str.maketrans("", "", punct))

    tokenizer = nltk.TweetTokenizer()
    stopwords = set(nltk.corpus.stopwords.words("english"))

    # Keep negation words: every stopword containing "n't" plus the explicit
    # negators below. They carry polarity the downstream classifier needs.
    negation_re = re.compile(r"n't")
    negations = set(filter(negation_re.findall, stopwords))
    negations.update(("against", "no", "nor", "not"))
    stopwords.difference_update(negations)

    stemmer = nltk.stem.porter.PorterStemmer()
    kept = [tok for tok in tokenizer.tokenize(stripped) if tok.lower() not in stopwords]
    return " ".join(stemmer.stem(tok) for tok in kept)