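"""Text preprocessing helpers for the hate speech classifier app."""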
import re
import string
from collections.abc import Iterable
import nltk
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from nltk.corpus import stopwords
nltk.download('stopwords')


# Apply regex-based cleaning, stopword removal, and stemming to a raw string.
def clean_text(words: str) -> str:
    words = str(words).lower()
    words = re.sub(r'\[.*?\]', '', words)                              # bracketed text
    words = re.sub(r'https?://\S+|www\.\S+', '', words)                # URLs
    words = re.sub(r'<.*?>+', '', words)                               # HTML tags
    words = re.sub(r'@\w+', '', words)                                 # @mentions
    words = re.sub('[%s]' % re.escape(string.punctuation), '', words)  # punctuation
    words = re.sub(r'\n', '', words)                                   # newlines
    words = re.sub(r'\w*\d\w*', '', words)                             # tokens containing digits
    stopword = set(stopwords.words('english'))
    words = ' '.join(
        [word for word in words.split(' ') if word not in stopword])
    stemmer = nltk.SnowballStemmer("english")
    words = ' '.join([stemmer.stem(word) for word in words.split(' ')])
    return words


def tokenize_and_pad(text_list: Iterable[str], tokenizer: Tokenizer, max_len: int) -> np.ndarray:
    # Convert texts to integer sequences and pad/truncate them to a fixed length.
    sequences = tokenizer.texts_to_sequences(text_list)
    sequences_matrix = pad_sequences(sequences, maxlen=max_len)
    return sequences_matrix
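

if __name__ == "__main__":
    # Minimal self-contained demo of the two helpers. The real app would use a
    # Tokenizer fitted on its training corpus and the model's expected sequence
    # length; the sample texts, vocabulary size, and max_len below are
    # illustrative assumptions only.
    sample_texts = [
        "Check out https://example.com, it's the #1 thread!",
        "I can't believe @someone posted <b>this</b> in 2023...",
    ]
    cleaned = [clean_text(text) for text in sample_texts]

    demo_tokenizer = Tokenizer(num_words=1000)
    demo_tokenizer.fit_on_texts(cleaned)

    padded = tokenize_and_pad(cleaned, demo_tokenizer, max_len=20)
    print(cleaned)
    print(padded.shape)  # (2, 20)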