import json
import pickle

import nltk
import pandas as pd
import regex as re
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus at import time so that
# ``stopwords.words('english')`` is available to ``Preprocess``.
nltk.download('stopwords')


class Preprocess:
    """Text-cleaning helpers: character filtering and stopword removal."""

    # Class-level defaults; ``genres`` is replaced by a per-instance list in
    # ``__init__``, ``y`` is not assigned anywhere in this class.
    genres = None
    y = None

    # Lazily-built cache of the English stopword set, shared by all instances
    # so the corpus is not reloaded on every ``remove_stopwords`` call.
    _stop_words = None

    def __init__(self) -> None:
        """Initialise the instance with an empty ``genres`` list."""
        self.genres = []

    def clean_text(self, text: str) -> str:
        """Return *text* reduced to lowercase ASCII letters and single spaces.

        Apostrophes are deleted outright (so ``"it's"`` becomes ``"its"``),
        every other non-letter character is replaced by a space, runs of
        whitespace are collapsed, and the result is lowercased.
        """
        # Drop apostrophes first so contractions stay joined as one word.
        text = re.sub(r"'", "", text)
        # Replace everything except ASCII letters with a space.
        text = re.sub(r"[^a-zA-Z]", " ", text)
        # Collapse whitespace runs and trim both ends.
        text = ' '.join(text.split())
        return text.lower()

    def remove_stopwords(self, text: str) -> str:
        """Return *text* with NLTK English stopwords removed.

        The text is split on whitespace and the remaining words are re-joined
        with single spaces. Matching is case-sensitive, so callers should
        lowercase the text first (as ``clean_text`` does).
        """
        if Preprocess._stop_words is None:
            # Build the lookup set once; loading the corpus per call is
            # wasteful when this runs over every row of a dataset.
            Preprocess._stop_words = frozenset(stopwords.words('english'))
        stop_words = Preprocess._stop_words
        return ' '.join(w for w in text.split() if w not in stop_words)

    def apply(self, plot) -> str:
        """Clean *plot* and strip stopwords from it.

        *plot* is converted with ``str()`` first, so non-string values
        (e.g. ``None`` or numbers) are processed as their string form.
        """
        clean_plot = self.clean_text(str(plot))
        return self.remove_stopwords(clean_plot)