movie-genre /
A-M-S's picture
raw history blame
No virus
1.07 kB
import json
import pickle
import pandas as pd
import nltk
import regex as re
from nltk.corpus import stopwords
class Preprocess:
    """Normalise free-text plots before they are passed on for further use.

    The pipeline implemented by :meth:`apply` is:

    1. :meth:`clean_text` - strip apostrophes, replace every non-ASCII-letter
       character with a space, collapse whitespace and lowercase.
    2. :meth:`remove_stopwords` - drop NLTK's English stop words.
    """

    # Class-level defaults. ``genres`` is replaced by a per-instance list in
    # ``__init__``; ``y`` is not touched by any method of this class.
    genres = None
    y = None

    # Lazily-filled cache of the English stop-word set, shared by all
    # instances so the NLTK corpus is read at most once per process instead
    # of once per ``remove_stopwords`` call.
    _stop_words = None

    def __init__(self) -> None:
        """Create a preprocessor with an empty ``genres`` list."""
        self.genres = []

    def clean_text(self, text: str) -> str:
        """Return *text* reduced to lowercase ASCII words separated by one space.

        Apostrophes are deleted outright (so ``"it's"`` becomes ``"its"``
        rather than ``"it s"``); every other character outside ``a-z``/``A-Z``
        is turned into a space before whitespace is collapsed.
        """
        # Drop apostrophes first so contractions stay a single token.
        text = text.replace("'", "")
        # Anything that is not an ASCII letter becomes a separator.
        text = re.sub(r"[^a-zA-Z]", " ", text)
        # split()/join collapses runs of whitespace and trims both ends.
        text = " ".join(text.split())
        return text.lower()

    def remove_stopwords(self, text: str) -> str:
        """Return *text* with English stop words removed.

        Tokens are obtained by whitespace splitting and compared verbatim
        against NLTK's ``stopwords.words('english')`` list, so callers should
        lowercase the text first (as :meth:`apply` does via
        :meth:`clean_text`).

        Any error raised by NLTK while loading the corpus propagates to the
        caller; nothing is cached in that case, so a later call retries.
        """
        cls = type(self)
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
        stop_words = cls._stop_words
        return " ".join(w for w in text.split() if w not in stop_words)

    def apply(self, plot) -> str:
        """Run the full cleaning pipeline on *plot* and return the result.

        *plot* is coerced with ``str()`` first, so non-string values are
        processed via their string representation.
        """
        clean_plot = self.clean_text(str(plot))
        return self.remove_stopwords(clean_plot)