Spaces:
Runtime error
Runtime error
File size: 4,705 Bytes
998ecb0 fe4f68b 998ecb0 5716fc9 998ecb0 5716fc9 998ecb0 5716fc9 998ecb0 5716fc9 998ecb0 5716fc9 998ecb0 5716fc9 2e720e3 5716fc9 2e720e3 5716fc9 2e720e3 5716fc9 2e720e3 5716fc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup
def tokenize_and_stem(text):
    """Tokenize *text* and return the stem of every alphanumeric token.

    Sentences are split with NLTK's ``sent_tokenize``, each sentence is
    then split into word tokens, tokens containing no letter or digit
    (pure punctuation) are dropped, and the survivors are reduced to
    their English Snowball stems so that e.g. "run" and "running"
    collapse onto the same term.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list[str]
        Stemmed tokens, in order of appearance.
    """
    # NOTE(review): the original body referenced sent_tokenize,
    # word_tokenize and a global `stemmer_en` without importing or
    # defining any of them at module level, so every call raised
    # NameError. Import and build the stemmer locally so the function
    # is self-contained.
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")

    # Tokenize by sentence, then by word.
    tokens = [word for sent in sent_tokenize(text)
              for word in word_tokenize(sent)]
    # Keep only tokens that contain at least one letter or digit.
    alnum_tokens = [t for t in tokens if re.search('[a-zA-Z0-9]', t)]
    # Stem the surviving tokens.
    return [stemmer.stem(t) for t in alnum_tokens]
def main(movie_name):
    """Recommend the 15 movies most similar to *movie_name*.

    Resolves the title to an IMDb id via IMDb's search page, makes sure
    the movie is present in the local Top-1000 DataFrame (scraping its
    data from imdb.com when it is not), then ranks every movie by
    TF-IDF cosine similarity computed over a combined
    title/director/genres/plot text field.

    Parameters
    ----------
    movie_name : str
        Movie title to search for on IMDb.

    Returns
    -------
    pandas.DataFrame
        The 15 most similar movies (the queried movie itself included,
        with similarity 1.0), with a ``similarity`` column appended and
        the internal ``combined_features`` column dropped.

    Raises
    ------
    SystemExit
        If the IMDb search returns no title links.
    """
    # --- Resolve the movie name to an IMDb title id -------------------
    search_url = 'https://www.imdb.com/find'
    payload = {'q': movie_name}
    soup_search = BeautifulSoup(requests.get(search_url, params=payload).text, 'html.parser')
    movie_found = soup_search.select("a[href*='/title/tt']")
    if not movie_found:  # idiomatic truthiness test (was `== []`)
        raise SystemExit(f'Filme \"{movie_name}\" não encontrado no Imdb. Verifique e tente novamente.')
    # Assumes the first search result is the intended movie.
    # href looks like "/title/tt0111161/" -> keep only the "tt..." id.
    imdb_id_given_by_user = movie_found[0].attrs['href'].replace("/title/", "")[:-1]

    df = pd.read_csv('imdb_top_1000.csv')

    # --- Ensure the chosen movie is in the DataFrame ------------------
    # If the movie is not in the Top 1000, scrape its data from imdb.com.
    if df[df['imdb_id'] == imdb_id_given_by_user].empty:
        url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        title = soup.find('h1').text
        plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
        director = soup.select("a[href*='/name/nm']")[0].text
        year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
        genres = ", ".join(
            g.span.text for g in soup.select("a[href*='/search/title?genres=']"))
        rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)
        # Add the scraped movie to the DataFrame.
        new_movie = pd.DataFrame([{
            'imdb_id': imdb_id_given_by_user,
            'title': title,
            'rate': rate,
            'year': year,
            'director': director,
            'genres': genres,
            'plot': plot}])
        df = pd.concat([df, new_movie], ignore_index=True)

    # --- Build the text field that will be vectorized -----------------
    # Concatenate title, director, genres (single quotes stripped) and
    # plot; this combined text feeds the TF-IDF vectorizer below.
    df['combined_features'] = (df['title'] + " " + df['director'] + " "
                               + df['genres'].apply(lambda g: g.replace('\'', ''))
                               + " " + df['plot'])

    # Stemmer used by tokenize_and_stem(). The original assigned it to a
    # *local* variable, which the module-level tokenize_and_stem could
    # never see -> NameError during fit_transform. Publish it as a
    # module global instead. nltk was never imported at module level,
    # hence the local import.
    from nltk.stem.snowball import SnowballStemmer
    global stemmer_en
    stemmer_en = SnowballStemmer("english")

    # --- Vectorize and rank by cosine similarity ----------------------
    tfidf_vectorizer = TfidfVectorizer(max_features=50000, stop_words='english',
                                       tokenizer=tokenize_and_stem, ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Row of the similarity matrix belonging to the user's movie.
    movie_index = df.index[df['imdb_id'] == imdb_id_given_by_user][0]
    # Pair every movie's DataFrame position with its similarity score,
    # then sort in descending order of similarity.
    similar_movies = list(enumerate(cosine_sim[movie_index]))
    sorted_similar_movies = sorted(similar_movies, key=lambda pair: pair[1], reverse=True)

    # --- Collect the 15 movies most similar to the user's choice ------
    top_15_indexes = [m[0] for m in sorted_similar_movies[:15]]
    top_15_scores = [m[1] for m in sorted_similar_movies[:15]]
    top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
    top_15_sim['similarity'] = top_15_scores
    return top_15_sim