import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NLTK's tokenizers require the 'punkt' models: nltk.download('punkt')

# Create an English-language SnowballStemmer object at module level so that
# tokenize_and_stem() can find it when the TfidfVectorizer calls it later.
stemmer_en = SnowballStemmer("english")


def tokenize_and_stem(text):
    # Tokenize by sentence, then by word
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    # Filter out tokens with no alphanumeric characters to remove noise
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z0-9]', token)]
    # Stem the filtered tokens
    stems = [stemmer_en.stem(ft) for ft in filtered_tokens]
    return stems


def main(movie_name):
    search_url = 'https://www.imdb.com/find'
    payload = {'q': movie_name}
    # IMDb may block the default requests user agent, so send a browser-like one
    headers = {'User-Agent': 'Mozilla/5.0'}
    soup_search = BeautifulSoup(
        requests.get(search_url, params=payload, headers=headers).text,
        'html.parser'
    )
    movie_found = soup_search.select("a[href*='/title/tt']")

    if not movie_found:
        raise SystemExit(f'Movie "{movie_name}" not found on IMDb. Please check the name and try again.')

    # Assumes that the first result of the IMDb search is the correct one
    imdb_id_given_by_user = movie_found[0].attrs['href'].replace('/title/', '')[:-1]

    df = pd.read_csv('imdb_top_1000.csv')
    # df.head()

    ### The user chose a movie to compare. Is this movie in the Top 1000?
    # If we didn't find the movie in the DataFrame -> get its data from imdb.com.
    # Note: these selectors depend on IMDb's current page layout and may break
    # if the site changes.
    if df[df['imdb_id'] == imdb_id_given_by_user].shape[0] == 0:
        genres = []
        url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
        soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')

        title = soup.find('h1').text
        plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
        director = soup.select("a[href*='/name/nm']")[0].text
        year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
        genres_soup = soup.select("a[href*='/search/title?genres=']")
        for genre in genres_soup:
            genres.append(genre.span.text)
        genres = ", ".join(genres)
        rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)

        # Adding the new movie to the DataFrame
        new_movie = pd.DataFrame([{
            'imdb_id': imdb_id_given_by_user,
            'title': title,
            'rate': rate,
            'year': year,
            'director': director,
            'genres': genres,
            'plot': plot
        }])
        df = pd.concat([df, new_movie], ignore_index=True)
        # df.tail()

    ### Concatenating 'title', 'director' and 'genres' with 'plot' to tokenize later
    df['combined_features'] = df['title'] + " " + df['director'] + " " + \
        df['genres'].apply(lambda x: x.replace('\'', '')) + \
        " " + df['plot']
    # df['combined_features'].tail()

    ### Tokenizing and stemming the text
    # We need to tokenize to perform the cosine similarity analysis. We also stem
    # the text to reduce each word to its root form, so that words like "run" and
    # "running" are grouped into the same token.
    # Testing the function:
    # print(tokenize_and_stem("[Drama, Romance, War] At a U.S. Army base in 1945"))
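    # A minimal illustration of what tokenize_and_stem() produces (the outputs
    # below are an assumption based on SnowballStemmer's usual behaviour, not a
    # captured run):
    #   tokenize_and_stem("running runs ran")   ->  ['run', 'run', 'ran']
    #   tokenize_and_stem("[Drama, Romance]")   ->  ['drama', 'romanc']
    # Punctuation-only tokens such as "[" and "," are dropped by the alphanumeric
    # filter, and the stemmer lowercases each word before stemming it.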
    ### Vectorizing the movie plot and genres with TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(
        max_features=50000,
        stop_words='english',
        tokenizer=tokenize_and_stem,
        ngram_range=(1, 2)
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)
    # print(tfidf_matrix.shape)

    ### Calculating the cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix)
    # print(cosine_sim[0:4, 0:4], cosine_sim.shape)

    movie_index = df[df['imdb_id'] == imdb_id_given_by_user].index[0]
    # Pair each movie with its cosine similarity to the chosen movie,
    # keeping the DataFrame index in the tuple
    similar_movies = list(enumerate(cosine_sim[movie_index]))
    # Sort the result in descending order of similarity
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    # sorted_similar_movies[:5]

    ### Collecting the data of the 15 movies most similar to the user's choice
    top_15_indexes = [m[0] for m in sorted_similar_movies[:15]]
    top_15_scores = [m[1] for m in sorted_similar_movies[:15]]
    top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
    top_15_sim['similarity'] = top_15_scores
    return top_15_sim
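
# A minimal usage sketch. The movie name below is a hypothetical example, and the
# script assumes a file 'imdb_top_1000.csv' with the columns imdb_id, title,
# rate, year, director, genres and plot is present in the working directory.
# The first row of the result is the chosen movie itself (similarity 1.0).
if __name__ == '__main__':
    recommendations = main('The Godfather')
    print(recommendations[['title', 'similarity']].head(10))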