File size: 4,705 Bytes
998ecb0
 
 
fe4f68b
998ecb0
 
 
 
 
 
5716fc9
998ecb0
 
 
 
 
 
 
 
 
 
 
5716fc9
 
 
 
 
 
 
 
 
998ecb0
5716fc9
 
 
 
998ecb0
5716fc9
 
 
 
 
 
998ecb0
5716fc9
 
 
 
 
 
 
 
 
998ecb0
5716fc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e720e3
 
5716fc9
2e720e3
 
5716fc9
 
 
 
 
2e720e3
 
5716fc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e720e3
 
5716fc9
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tokenize_and_stem(text, stemmer=None):
    """Tokenize *text* into word tokens and reduce each one to its stem.

    The text is split into sentences with NLTK's ``sent_tokenize`` and each
    sentence into words with ``word_tokenize``; tokens that contain no
    alphanumeric character (pure punctuation) are discarded before stemming,
    so that inflected forms like "run"/"running" collapse to one token.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stemmer : object, optional
        Any object exposing ``.stem(token)``. Defaults to a fresh English
        ``SnowballStemmer``. (The previous version read a module global
        ``stemmer_en`` that was only ever defined as a local inside
        ``main`` and therefore raised ``NameError`` when called.)

    Returns
    -------
    list[str]
        The stemmed tokens, in their original order.
    """
    if stemmer is None:
        stemmer = SnowballStemmer("english")

    # Tokenize by sentence, then by word
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]

    # Filter out raw tokens to remove noise (punctuation-only tokens)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z0-9]', token)]

    # Stem the filtered tokens
    return [stemmer.stem(token) for token in filtered_tokens]

def main(movie_name):
    """Recommend the 15 movies most similar to *movie_name*.

    Searches IMDb for the title, makes sure the movie is present in the local
    ``imdb_top_1000.csv`` DataFrame (scraping its IMDb page when it is not),
    then ranks every movie by TF-IDF cosine similarity computed over a
    combined title/director/genres/plot text field.

    Parameters
    ----------
    movie_name : str
        Movie title to look up on IMDb.

    Returns
    -------
    pandas.DataFrame
        The 15 most similar movies (the queried movie itself is included,
        with similarity 1.0) plus a ``similarity`` column.

    Raises
    ------
    SystemExit
        When the IMDb search returns no match for *movie_name*.
    """
    # --- Resolve the user's title to an IMDb id (first search hit wins) ---
    search_url = 'https://www.imdb.com/find'
    payload = {'q': movie_name}
    response = requests.get(search_url, params=payload, timeout=30)
    soup_search = BeautifulSoup(response.text, 'html.parser')
    movie_found = soup_search.select("a[href*='/title/tt']")

    if not movie_found:
        raise SystemExit(f'Filme \"{movie_name}\" não encontrado no Imdb. Verifique e tente novamente.')

    # Assumes that the first result of the imdb search is the correct one.
    # href looks like "/title/tt0111161/"; rstrip('/') is safer than chopping
    # the last character, which corrupted the id when no trailing slash was
    # present.
    imdb_id_given_by_user = movie_found[0].attrs['href'].replace("/title/", "").rstrip("/")
    df = pd.read_csv('imdb_top_1000.csv')

    ### The user chose a movie to compare. Is this movie in the Top 1000?
    # If not, scrape its data from imdb.com and append it to the DataFrame.
    if not (df['imdb_id'] == imdb_id_given_by_user).any():
        url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
        soup = BeautifulSoup(requests.get(url, timeout=30).text, 'html.parser')

        # NOTE(review): these selectors depend on IMDb's current page markup;
        # any of them may break when the site layout changes.
        title = soup.find('h1').text
        plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
        director = soup.select("a[href*='/name/nm']")[0].text
        year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
        genres = ", ".join(genre.span.text
                           for genre in soup.select("a[href*='/search/title?genres=']"))
        rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)

        # Adding the new movie to the DataFrame
        new_movie = pd.DataFrame([{
                        'imdb_id': imdb_id_given_by_user,
                        'title': title,
                        'rate': rate,
                        'year': year,
                        'director': director,
                        'genres': genres,
                        'plot': plot }])
        df = pd.concat([df, new_movie], ignore_index=True)

    ### Concatenate title, director, genres and plot into one text field
    # that is tokenized and vectorized below.
    df['combined_features'] = df['title'] + " " + df['director'] + " " + \
                        df['genres'].apply(lambda x: x.replace('\'', '')) + \
                        " " + df['plot']

    # Stemming groups inflected forms ("run"/"running") under one token.
    stemmer_en = SnowballStemmer("english")

    def _tokenize_and_stem(text):
        # Tokenize by sentence, then by word; drop tokens with no
        # alphanumeric character, then stem what remains.
        tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
        filtered_tokens = [t for t in tokens if re.search('[a-zA-Z0-9]', t)]
        return [stemmer_en.stem(t) for t in filtered_tokens]

    ### Vectorize the combined text with TfidfVectorizer.
    # The tokenizer is a local closure so the stemmer it needs is guaranteed
    # to be in scope (the module-level tokenize_and_stem read a global
    # stemmer that this function only defined locally, raising NameError
    # at fit_transform time).
    tfidf_vectorizer = TfidfVectorizer(max_features=50000, stop_words='english',
                                       tokenizer=_tokenize_and_stem, ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)

    ### Cosine similarity between the chosen movie and every other movie.
    cosine_sim = cosine_similarity(tfidf_matrix)
    movie_index = df.index[df['imdb_id'] == imdb_id_given_by_user][0]

    # Pair each row index with its similarity to the chosen movie, then sort
    # in descending order of similarity (the movie itself ranks first, 1.0).
    similar_movies = list(enumerate(cosine_sim[movie_index]))
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)

    ### Collect the data of the 15 movies most similar to the user's choice.
    top_15 = sorted_similar_movies[:15]
    top_15_indexes = [m[0] for m in top_15]
    top_15_scores = [m[1] for m in top_15]
    top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
    top_15_sim['similarity'] = top_15_scores

    return top_15_sim