File size: 1,097 Bytes
086c6d8
 
 
 
2e09508
086c6d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d89a1db
 
086c6d8
d89a1db
086c6d8
d89a1db
086c6d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import json
import pickle
import pandas as pd
import nltk
nltk.download('stopwords')
import regex as re
from nltk.corpus import stopwords

class Preprocess:
    """Text-cleaning helper: keep letters only, lowercase, drop stopwords.

    ``apply`` is the entry point; it chains ``clean_text`` and
    ``remove_stopwords``.
    """

    # Neither attribute is read by the methods in this class.
    # NOTE(review): presumably populated by training/inference code that
    # lives elsewhere -- confirm before removing.
    genres = None
    y = None

    # English stopword set, loaded on first use and shared by all instances
    # so the corpus is not re-read for every document.
    _stop_words = None

    def __init__(self) -> None:
        # Per-instance list so instances never share one mutable object.
        self.genres = []

    def clean_text(self, text: str) -> str:
        """Return ``text`` reduced to lowercase ASCII words.

        Args:
            text: The string to clean.

        Returns:
            The input with apostrophes deleted, every other non-letter
            character turned into a word break, whitespace collapsed to
            single spaces, and all letters lowercased.
        """
        # Delete apostrophes outright (rather than replacing them with a
        # space) so contractions stay one token: "don't" -> "dont".
        text = re.sub(r"'", "", text)
        # Anything that is not an ASCII letter becomes a separator.
        text = re.sub(r"[^a-zA-Z]", " ", text)
        # split()/join collapses runs of whitespace and trims both ends.
        text = ' '.join(text.split())
        return text.lower()

    def remove_stopwords(self, text: str) -> str:
        """Return ``text`` with English stopwords removed.

        Matching is exact and case-sensitive against whitespace-separated
        tokens; the NLTK list is lowercase, so lowercase the text first.

        Args:
            text: Whitespace-separated words.

        Returns:
            The remaining words joined by single spaces.
        """
        stop_words = self._stop_words
        if stop_words is None:
            # Load once and cache on the class: this method runs once per
            # document, and the stopword list does not change between calls.
            stop_words = frozenset(stopwords.words('english'))
            type(self)._stop_words = stop_words
        kept = [word for word in text.split() if word not in stop_words]
        return ' '.join(kept)

    def apply(self, plot: object) -> str:
        """Clean ``plot`` and strip its stopwords.

        Args:
            plot: The value to preprocess. It is converted with ``str()``
                first, so non-string inputs are processed as their string
                form (e.g. ``None`` is treated as the text ``"None"``).

        Returns:
            The cleaned, lowercase, stopword-free text.
        """
        clean_plot = self.clean_text(str(plot))
        return self.remove_stopwords(str(clean_plot))