import numpy as np
import pandas as pd
import re
import nltk
nltk.download('punkt')
# Newer NLTK releases load the sentence tokenizer from 'punkt_tab' instead;
# fetching both should cover either version
nltk.download('punkt_tab')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup
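
# IMDb tends to reject requests that send the default python-requests
# User-Agent, so a browser-like header is assumed here; this header is an
# addition, not part of the original code.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'}

# The stemmer must live at module scope: tokenize_and_stem() below refers to
# it, and the TfidfVectorizer calls that function from inside main(), where a
# local stemmer would not be visible.
stemmer_en = SnowballStemmer("english")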

def tokenize_and_stem(text):
    # Tokenize by sentence, then by word
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    # Keep only tokens containing at least one alphanumeric character,
    # dropping punctuation-only noise
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z0-9]', token)]
    # Stem the filtered tokens down to their root form
    stems = [stemmer_en.stem(ft) for ft in filtered_tokens]
    return stems
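
# For illustration (output is approximate): the stemmer lowercases tokens and
# strips inflection, so tokenize_and_stem("Running in the wars") should yield
# something like ['run', 'in', 'the', 'war'].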

def main(movie_name):
    search_url = 'https://www.imdb.com/find'
    payload = {'q': movie_name}
    soup_search = BeautifulSoup(requests.get(search_url, params=payload, headers=HEADERS).text, 'html.parser')
    movie_found = soup_search.select("a[href*='/title/tt']")
    if not movie_found:
        raise SystemExit(f'Movie "{movie_name}" not found on IMDb. Please check the title and try again.')
    # Assume the first result of the IMDb search is the correct one, and pull
    # the "tt<digits>" id out of its href
    imdb_id_given_by_user = re.search(r'tt\d+', movie_found[0].attrs['href']).group()
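    # e.g. a search-hit href such as "/title/tt0068646/?ref_=fn_al_tt_1"
    # (an illustrative value, not one read at runtime) reduces to "tt0068646"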
    df = pd.read_csv('imdb_top_1000.csv')
    # df.head()

    ### The user chose a movie to compare. Is this movie in the Top 1000?
    # If the movie is not in the DataFrame, scrape its data from imdb.com
    if df[df['imdb_id'] == imdb_id_given_by_user].shape[0] == 0:
        genres = []
        url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
        soup = BeautifulSoup(requests.get(url, headers=HEADERS).text, 'html.parser')
        # The selectors below depend on IMDb's current markup and may break
        # when the page layout changes
        title = soup.find('h1').text
        plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
        director = soup.select("a[href*='/name/nm']")[0].text
        year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
        genres_soup = soup.select("a[href*='/search/title?genres=']")
        for genre in genres_soup:
            genres.append(genre.span.text)
        genres = ", ".join(genres)
        rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)
        # Add the new movie to the DataFrame
        new_movie = pd.DataFrame([{
            'imdb_id': imdb_id_given_by_user,
            'title': title,
            'rate': rate,
            'year': year,
            'director': director,
            'genres': genres,
            'plot': plot}])
        df = pd.concat([df, new_movie], ignore_index=True)
    # df.tail()

    ### Concatenate 'title', 'director', 'genres' and 'plot' into one text field to tokenize later
    df['combined_features'] = df['title'] + " " + df['director'] + " " + \
        df['genres'].apply(lambda x: x.replace('\'', '')) + \
        " " + df['plot']
    # df['combined_features'].tail()
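    # Illustrative shape of one combined row (values assumed here, not read
    # from the dataset): "The Godfather Francis Ford Coppola Crime, Drama
    # The aging patriarch of an organized crime dynasty ..."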

    ### Tokenize-and-stem function
    # We tokenize so we can run the cosine similarity analysis, and we stem to
    # reduce words to their root form, so that words like "run" and "running"
    # are grouped in the same token. The function and its SnowballStemmer are
    # defined at module scope above.
    # Testing the function:
    # print(tokenize_and_stem("[Drama, Romance, War] At a U.S. Army base at 1945"))

    ### Vectorize the combined plot/genres text with TfidfVectorizer
    # token_pattern=None silences the warning sklearn emits when a custom
    # tokenizer is supplied alongside the default token pattern
    tfidf_vectorizer = TfidfVectorizer(max_features=50000, stop_words='english',
                                       tokenizer=tokenize_and_stem, token_pattern=None,
                                       ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)
    # print(tfidf_matrix.shape)
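    # Background on the weighting: sklearn scores term t in document d as
    # tf(t, d) * idf(t), where with the default smooth_idf=True
    # idf(t) = ln((1 + n) / (1 + df(t))) + 1, so terms that appear in almost
    # every plot contribute very little.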

    ### Calculate the cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix)
    # print(cosine_sim[0:4, 0:4], cosine_sim.shape)
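    # For two TF-IDF row vectors a and b, cosine_similarity computes
    # cos(a, b) = (a . b) / (||a|| * ||b||): 1.0 for identical term profiles,
    # 0.0 for texts that share no terms.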
    movie_index = df[df['imdb_id'] == imdb_id_given_by_user].index[0]
    # Pair every movie in df with its cosine similarity to the chosen movie,
    # keeping the DataFrame index in the tuple
    similar_movies = list(enumerate(cosine_sim[movie_index]))
    # Sort the result in descending order of similarity
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    # sorted_similar_movies[:5]

    ### Collect the data for the 15 movies most similar to the user's choice
    # (the first entry is the chosen movie itself, with similarity 1.0)
    top_15_indexes = [m[0] for m in sorted_similar_movies[:15]]
    top_15_scores = [m[1] for m in sorted_similar_movies[:15]]
    top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
    top_15_sim['similarity'] = top_15_scores
    return top_15_sim
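
# Minimal usage sketch, assuming 'imdb_top_1000.csv' sits in the working
# directory; the title below is only an example query.
if __name__ == '__main__':
    recommendations = main('The Godfather')
    print(recommendations[['title', 'similarity']].to_string(index=False))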