# imdb_recommender / imdb_recommender.py
import re
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize  # requires the NLTK 'punkt' data
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# English SnowballStemmer, created at module level so tokenize_and_stem() can use it
stemmer_en = SnowballStemmer("english")

def tokenize_and_stem(text):
    # Tokenize by sentence, then by word
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    # Filter out raw tokens to remove noise (keep only tokens containing letters or digits)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z0-9]', token)]
    # Stem the filtered tokens
    stems = [stemmer_en.stem(ft) for ft in filtered_tokens]
    return stems
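
# For illustration: a call such as tokenize_and_stem("Dramas about running a war")
# should return stems along the lines of ['drama', 'about', 'run', 'a', 'war'].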

def main(movie_name):
    # Search IMDb for the movie name given by the user
    search_url = 'https://www.imdb.com/find'
    payload = {'q': movie_name}
    soup_search = BeautifulSoup(requests.get(search_url, params=payload).text, 'html.parser')
    movie_found = soup_search.select("a[href*='/title/tt']")
    if not movie_found:
        raise SystemExit(f'Movie "{movie_name}" not found on IMDb. Please check the title and try again.')

    # Assumes that the first result of the IMDb search is the correct one.
    # The href looks like "/title/tt0000000/", so strip the prefix and the trailing slash.
    imdb_id_given_by_user = movie_found[0].attrs['href'].replace("/title/", "")[:-1]
    df = pd.read_csv('imdb_top_1000.csv')
    # df.head()

    ### Is the movie chosen by the user already in the Top 1000?
    # If we don't find the movie in the DataFrame, get its data from imdb.com
    if df[df['imdb_id'] == imdb_id_given_by_user].shape[0] == 0:
        genres = []
        url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        title = soup.find('h1').text
        plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
        director = soup.select("a[href*='/name/nm']")[0].text
        year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
        genres_soup = soup.select("a[href*='/search/title?genres=']")
        for genre in genres_soup:
            genres.append(genre.span.text)
        genres = ", ".join(genres)
        rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)
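        # Note: these selectors rely on IMDb's current HTML (e.g. the data-testid
        # attributes) and may need updating if the page layout changes.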
        # Adding the new movie to the DataFrame
        new_movie = pd.DataFrame([{
            'imdb_id': imdb_id_given_by_user,
            'title': title,
            'rate': rate,
            'year': year,
            'director': director,
            'genres': genres,
            'plot': plot}])
        df = pd.concat([df, new_movie], ignore_index=True)
    # df.tail()
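    # Appending the scraped movie gives it a row (and index) in the DataFrame, so the
    # TF-IDF matrix and the similarity lookup below also cover movies outside the Top 1000.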

    ### Concatenating 'title', 'director' and 'genres' with 'plot' to tokenize later
    df['combined_features'] = df['title'] + " " + df['director'] + " " + \
        df['genres'].apply(lambda x: x.replace("'", "")) + \
        " " + df['plot']
    # df['combined_features'].tail()

    ### Tokenizing and stemming the text
    # We need to tokenize to perform the cosine similarity analysis. We also stem the text
    # to reduce words to their root form, so words like "run" and "running" are grouped
    # in the same token. tokenize_and_stem() uses the module-level SnowballStemmer.
    # Testing the function:
    # print( tokenize_and_stem("[Drama, Romance, War] At a U.S. Army base at 1945") )

    ### Vectorizing the movie plot and genres with TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=50000, stop_words='english',
                                       tokenizer=tokenize_and_stem, ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)
    # print(tfidf_matrix.shape)
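    # Each row of tfidf_matrix is the TF-IDF vector of one movie's combined features,
    # built from unigrams and bigrams of the stemmed tokens (English stop words removed).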

    ### Calculating the cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix)
    # print(cosine_sim[0:4, 0:4], cosine_sim.shape)

    movie_index = df[df['imdb_id'] == imdb_id_given_by_user].index
    # Cosine similarity between the chosen movie and every movie in df, keeping each movie's index in the tuple
    similar_movies = list(enumerate(cosine_sim[movie_index, :][0]))
    # Sort the result in descending order of similarity
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    # sorted_similar_movies[:5]
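    # The first entry is the chosen movie compared with itself (similarity 1.0),
    # so it appears at the top of the sorted list.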

    ### Collecting the data of the 15 movies most similar to the user's choice
    top_15_indexes = [m[0] for m in sorted_similar_movies[:15]]
    top_15_scores = [m[1] for m in sorted_similar_movies[:15]]
    top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
    top_15_sim['similarity'] = top_15_scores
    return top_15_sim
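

# A minimal usage sketch, assuming the script is run directly with the movie name
# passed on the command line (the example title below is only illustrative):
#   python imdb_recommender.py "The Godfather"
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        raise SystemExit("Usage: python imdb_recommender.py <movie name>")
    recommendations = main(" ".join(sys.argv[1:]))
    # 'title', 'year', 'rate' and 'similarity' are columns of the returned DataFrame
    print(recommendations[['title', 'year', 'rate', 'similarity']].to_string(index=False))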