# imdb_recommender / imdb_recommender.py
import re
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize  # requires the NLTK 'punkt' data
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# English SnowballStemmer, created at module level so tokenize_and_stem() can use it
stemmer_en = SnowballStemmer("english")

def tokenize_and_stem(text):
    # Tokenize by sentence, then by word
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    # Filter out raw tokens to remove noise (keep only tokens containing letters or digits)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z0-9]', token)]
    # Stem the filtered tokens
    stems = [stemmer_en.stem(ft) for ft in filtered_tokens]
    return stems
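
# For illustration: a call such as tokenize_and_stem("Dramas about running a war")
# should return stems along the lines of ['drama', 'about', 'run', 'a', 'war'].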

def main(movie_name):
    # Search IMDb for the movie name given by the user
    search_url = 'https://www.imdb.com/find'
    payload = {'q': movie_name}
    soup_search = BeautifulSoup(requests.get(search_url, params=payload).text, 'html.parser')
    movie_found = soup_search.select("a[href*='/title/tt']")
    if not movie_found:
        raise SystemExit(f'Movie "{movie_name}" not found on IMDb. Please check the title and try again.')

    # Assumes that the first result of the IMDb search is the correct one.
    # The href looks like "/title/tt0000000/", so strip the prefix and the trailing slash.
    imdb_id_given_by_user = movie_found[0].attrs['href'].replace("/title/", "")[:-1]
    df = pd.read_csv('imdb_top_1000.csv')
    # df.head()

    ### Is the movie chosen by the user already in the Top 1000?
    # If we don't find the movie in the DataFrame, get its data from imdb.com
    if df[df['imdb_id'] == imdb_id_given_by_user].shape[0] == 0:
        genres = []
        url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        title = soup.find('h1').text
        plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
        director = soup.select("a[href*='/name/nm']")[0].text
        year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
        genres_soup = soup.select("a[href*='/search/title?genres=']")
        for genre in genres_soup:
            genres.append(genre.span.text)
        genres = ", ".join(genres)
        rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)
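        # Note: these selectors rely on IMDb's current HTML (e.g. the data-testid
        # attributes) and may need updating if the page layout changes.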
        # Adding the new movie to the DataFrame
        new_movie = pd.DataFrame([{
            'imdb_id': imdb_id_given_by_user,
            'title': title,
            'rate': rate,
            'year': year,
            'director': director,
            'genres': genres,
            'plot': plot}])
        df = pd.concat([df, new_movie], ignore_index=True)
    # df.tail()
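    # Appending the scraped movie gives it a row (and index) in the DataFrame, so the
    # TF-IDF matrix and the similarity lookup below also cover movies outside the Top 1000.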

    ### Concatenating 'title', 'director' and 'genres' with 'plot' to tokenize later
    df['combined_features'] = df['title'] + " " + df['director'] + " " + \
        df['genres'].apply(lambda x: x.replace("'", "")) + \
        " " + df['plot']
    # df['combined_features'].tail()

    ### Tokenizing and stemming the text
    # We need to tokenize to perform the cosine similarity analysis. We also stem the text
    # to reduce words to their root form, so words like "run" and "running" are grouped
    # in the same token. tokenize_and_stem() uses the module-level SnowballStemmer.
    # Testing the function:
    # print( tokenize_and_stem("[Drama, Romance, War] At a U.S. Army base at 1945") )

    ### Vectorizing the movie plot and genres with TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=50000, stop_words='english',
                                       tokenizer=tokenize_and_stem, ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)
    # print(tfidf_matrix.shape)
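    # Each row of tfidf_matrix is the TF-IDF vector of one movie's combined features,
    # built from unigrams and bigrams of the stemmed tokens (English stop words removed).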

    ### Calculating the cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix)
    # print(cosine_sim[0:4, 0:4], cosine_sim.shape)

    movie_index = df[df['imdb_id'] == imdb_id_given_by_user].index
    # Cosine similarity between the chosen movie and every movie in df, keeping each movie's index in the tuple
    similar_movies = list(enumerate(cosine_sim[movie_index, :][0]))
    # Sort the result in descending order of similarity
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    # sorted_similar_movies[:5]
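    # The first entry is the chosen movie compared with itself (similarity 1.0),
    # so it appears at the top of the sorted list.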

    ### Collecting the data of the 15 movies most similar to the user's choice
    top_15_indexes = [m[0] for m in sorted_similar_movies[:15]]
    top_15_scores = [m[1] for m in sorted_similar_movies[:15]]
    top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
    top_15_sim['similarity'] = top_15_scores
    return top_15_sim
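

# A minimal usage sketch, assuming the script is run directly with the movie name
# passed on the command line (the example title below is only illustrative):
#   python imdb_recommender.py "The Godfather"
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        raise SystemExit("Usage: python imdb_recommender.py <movie name>")
    recommendations = main(" ".join(sys.argv[1:]))
    # 'title', 'year', 'rate' and 'similarity' are columns of the returned DataFrame
    print(recommendations[['title', 'year', 'rate', 'similarity']].to_string(index=False))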