movie-genre / preprocess.py
A-M-S's picture
Updated app.py
d89a1db
raw history blame
No virus
1.07 kB
import json
import pickle
import pandas as pd
import nltk
import regex as re
from nltk.corpus import stopwords
class Preprocess:
    """Normalise free text: keep ASCII letters only, lowercase, drop stopwords.

    The public entry point is :meth:`apply`; :meth:`clean_text` and
    :meth:`remove_stopwords` are the two stages it chains together.
    """

    # Class-level placeholders; ``genres`` is shadowed by an instance list in
    # ``__init__``. Neither attribute is read by the methods of this class.
    genres = None
    y = None

    # English stopword set, loaded on first use and shared by all instances so
    # the NLTK corpus is not re-read for every text that is processed.
    _stop_words = None

    def __init__(self) -> None:
        """Create a preprocessor with an empty ``genres`` list."""
        self.genres = []

    @classmethod
    def _english_stop_words(cls):
        """Return the cached set of NLTK English stopwords, loading it once."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
        return cls._stop_words

    def clean_text(self, text):
        """Return ``text`` reduced to lowercase ASCII words separated by spaces.

        Apostrophes are deleted outright (so ``don't`` becomes ``dont``), every
        other character outside ``a-z``/``A-Z`` becomes a space, runs of
        whitespace collapse to a single space, and the result is lowercased.
        """
        # Delete apostrophes first so contractions stay a single word instead
        # of being split by the non-letter replacement below.
        text = text.replace("'", "")
        # Replace everything that is not an ASCII letter with a space.
        text = re.sub(r"[^a-zA-Z]", " ", text)
        # Collapse whitespace runs and trim both ends.
        text = ' '.join(text.split())
        # Lowercase last: the letter filter above is case-insensitive by range.
        return text.lower()

    def remove_stopwords(self, text):
        """Return ``text`` with English stopwords removed.

        ``text`` is split on whitespace; tokens are compared to the stopword
        set exactly as given (no case folding happens here), and the surviving
        tokens are re-joined with single spaces.
        """
        stop_words = Preprocess._english_stop_words()
        return ' '.join(w for w in text.split() if w not in stop_words)

    def apply(self, plot):
        """Run the full pipeline on ``plot`` and return the cleaned string.

        ``plot`` is converted with ``str()`` first, so non-string values
        are processed as their string representation.
        """
        clean_plot = self.clean_text(str(plot))
        return self.remove_stopwords(clean_plot)