import json
import pickle
import pandas as pd
import nltk
nltk.download('stopwords')
import regex as re
from nltk.corpus import stopwords

class Preprocess:
    """Text preprocessing helper: normalises raw text and strips stopwords.

    The pipeline entry point is :meth:`apply`, which runs :meth:`clean_text`
    followed by :meth:`remove_stopwords`.
    """

    # Class-level defaults; ``genres`` is replaced by a fresh list per instance
    # in ``__init__``. ``y`` is not assigned anywhere in this class.
    genres = None
    y = None

    # Lazily populated cache of the NLTK English stopword set, shared by all
    # instances so the set is not rebuilt on every ``remove_stopwords`` call.
    _stop_words = None

    def __init__(self) -> None:
        """Create a preprocessor with an empty ``genres`` list."""
        self.genres = []

    def clean_text(self, text):
        """Clean text by removing certain unwanted characters.

        Steps, in order: drop apostrophes, replace every character that is
        not an ASCII letter with a space, collapse runs of whitespace into a
        single space (also trimming the ends), and lowercase the result.

        Args:
            text: The string to clean.

        Returns:
            The cleaned, lowercased string containing only ``a-z`` and
            single spaces.
        """
        # Remove apostrophes outright so contractions stay one token
        # ("it's" -> "its") instead of being split by the next step.
        text = re.sub(r"'", "", text)
        # Replace everything except ASCII letters with a space.
        text = re.sub(r"[^a-zA-Z]", " ", text)
        # Collapse whitespace runs and strip leading/trailing whitespace.
        text = ' '.join(text.split())
        # Convert text to lowercase.
        return text.lower()

    def remove_stopwords(self, text):
        """Remove NLTK English stopwords from whitespace-separated text.

        Matching is exact and case-sensitive against the words returned by
        ``stopwords.words('english')``; tokens are produced by ``str.split``.

        Args:
            text: The string to filter.

        Returns:
            The remaining tokens joined by single spaces.
        """
        # Build the stopword set once and reuse it; constructing it per call
        # is wasted work when this method is applied to many documents.
        if Preprocess._stop_words is None:
            Preprocess._stop_words = frozenset(stopwords.words('english'))
        stop_words = Preprocess._stop_words
        return ' '.join(w for w in text.split() if w not in stop_words)

    def apply(self, plot):
        """Run the full preprocessing pipeline on a single value.

        Args:
            plot: The value to preprocess. It is coerced with ``str()``
                first, so non-string inputs are processed via their string
                representation.

        Returns:
            The cleaned text with stopwords removed.
        """
        clean_plot = self.clean_text(str(plot))
        # ``clean_text`` already returns a ``str``, so no further coercion
        # is needed before filtering stopwords.
        return self.remove_stopwords(clean_plot)