ronirigoni committed · Commit 5716fc9 · Parent(s): 998ecb0

app.py hello world version
Files changed:
- app.py (+11, -0)
- imdb_recommender.py (+83, -78)
app.py ADDED

@@ -0,0 +1,11 @@
+import gradio as gr
+#import imdb_recommender
+
+def greet(name):
+    return "Hello " + name + "!!"
+
+iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+iface.launch()
+
+#iface = gr.Interface(fn=imdb_recommender.main, inputs="text", outputs="text")
+#iface.launch()
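Note: this first version only smoke-tests the Space with a Gradio hello-world; the recommender wiring is left commented out. A minimal sketch of how those commented lines could be enabled once imdb_recommender.main is in place (assumption: since main returns a pandas DataFrame, a "dataframe" output component is used here instead of the "text" output in the commented lines):

import gradio as gr
import imdb_recommender

# Hypothetical wiring (not part of this commit): pass the user's movie name
# straight to the recommender and render the returned DataFrame as a table.
iface = gr.Interface(fn=imdb_recommender.main, inputs="text", outputs="dataframe")
iface.launch()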
imdb_recommender.py CHANGED

@@ -8,63 +8,8 @@ from sklearn.metrics.pairwise import cosine_similarity
 import requests
 from bs4 import BeautifulSoup
 
-movie_name_given_by_user = 'O poderoso chefão'
-
-search_url = 'https://www.imdb.com/find'
-payload = {'q': movie_name_given_by_user}
-soup_search = BeautifulSoup(requests.get(search_url, params=payload).text, 'html.parser')
-movie_found = soup_search.select("a[href*='/title/tt']")
-
-if movie_found == []:
-    raise SystemExit(f'Filme \"{movie_name_given_by_user}\" não encontrado no Imdb. Verifique e tente novamente.')
-
-# Assumes that the first result of the imdb search is the correct one
-imdb_id_given_by_user = str.replace(movie_found[0].attrs['href'], "/title/", "")[:-1]
-df = pd.read_csv('imdb_top_1000.csv')
-df.head()
-### The user choose a movie to compare. This movie is in the Top 1000?
-# If we didn't find the movie in the DataFrame -> Get movie data from imdb.com
-if df[ df['imdb_id'] == imdb_id_given_by_user ].shape[0] == 0:
-    genres = []
-    url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
-    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
-
-    title = soup.find('h1').text
-    plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
-    director = soup.select("a[href*='/name/nm']")[0].text
-    year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
-    genres_soup = soup.select("a[href*='/search/title?genres=']")
-    for genre in genres_soup:
-        genres.append(genre.span.text)
-    genres = ", ".join(genres)
-    rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)
-
-    # Adding the new movie to the DataFrame
-    new_movie = pd.DataFrame([{
-        'imdb_id': imdb_id_given_by_user,
-        'title': title,
-        'rate': rate,
-        'year': year,
-        'director': director,
-        'genres': genres,
-        'plot': plot }])
-
-    df = pd.concat([df, new_movie], ignore_index=True)
-
-df.tail()
-### Concatenating 'genres' and 'director' with 'plot' to tokenize later
-# Here we dupplicate the director name so it has more weight comparing two movies
-df['combined_features'] = df['title'] + " " + df['director'] + " " + \
-    df['genres'].apply(lambda x: str.replace(x, '\'', '')) + \
-    " " + df['plot']
-df['combined_features'].tail()
-### Function that tokenize and stem a text
-We need to tokenize to perform the cosine distance analisys. We also stem the text to transform the words into their root form, so words like "run" and "running" are grouped in the same token.
-# Create an English language SnowballStemmer object
-stemmer_en = SnowballStemmer("english")
-
 def tokenize_and_stem(text):
-
+
     # Tokenize by sentence, then by word
     tokens = [ word for sent in sent_tokenize(text) for word in word_tokenize(sent) ]
 
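Note: the hunk above elides the unchanged middle of tokenize_and_stem (lines 15-20). For reference, a minimal sketch of what such a tokenize-and-stem helper looks like, assuming the NLTK tokenizers and the SnowballStemmer already imported at the top of the module; the exact filtering done in the elided lines is an assumption:

import re
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

stemmer_en = SnowballStemmer("english")

def tokenize_and_stem(text):
    # Tokenize by sentence, then by word
    tokens = [ word for sent in sent_tokenize(text) for word in word_tokenize(sent) ]
    # Assumed step: keep only tokens that contain at least one letter
    filtered = [ t for t in tokens if re.search('[a-zA-Z]', t) ]
    # Stem, so that "run" and "running" collapse into the same token
    stems = [ stemmer_en.stem(t) for t in filtered ]
    return stems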
@@ -76,29 +21,89 @@ def tokenize_and_stem(text):
 
     return stems
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
+def main(movie_name):
+
+    search_url = 'https://www.imdb.com/find'
+    payload = {'q': movie_name}
+    soup_search = BeautifulSoup(requests.get(search_url, params=payload).text, 'html.parser')
+    movie_found = soup_search.select("a[href*='/title/tt']")
+
+    if movie_found == []:
+        raise SystemExit(f'Filme \"{movie_name}\" não encontrado no Imdb. Verifique e tente novamente.')
+
+    # Assumes that the first result of the imdb search is the correct one
+    imdb_id_given_by_user = str.replace(movie_found[0].attrs['href'], "/title/", "")[:-1]
+    df = pd.read_csv('imdb_top_1000.csv')
+    #df.head()
+
+    ### The user chose a movie to compare. Is this movie in the Top 1000?
+    # If we didn't find the movie in the DataFrame -> get the movie data from imdb.com
+    if df[ df['imdb_id'] == imdb_id_given_by_user ].shape[0] == 0:
+        genres = []
+        url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
+        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
+
+        title = soup.find('h1').text
+        plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
+        director = soup.select("a[href*='/name/nm']")[0].text
+        year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
+        genres_soup = soup.select("a[href*='/search/title?genres=']")
+        for genre in genres_soup:
+            genres.append(genre.span.text)
+        genres = ", ".join(genres)
+        rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)
+
+        # Adding the new movie to the DataFrame
+        new_movie = pd.DataFrame([{
+            'imdb_id': imdb_id_given_by_user,
+            'title': title,
+            'rate': rate,
+            'year': year,
+            'director': director,
+            'genres': genres,
+            'plot': plot }])
+
+        df = pd.concat([df, new_movie], ignore_index=True)
+
+        #df.tail()
+
+
+    ### Concatenating 'genres' and 'director' with 'plot' to tokenize later
+    # Here we duplicate the director name so it has more weight when comparing two movies
+    df['combined_features'] = df['title'] + " " + df['director'] + " " + \
+        df['genres'].apply(lambda x: str.replace(x, '\'', '')) + \
+        " " + df['plot']
+    df['combined_features'].tail()
+
+    ### Function that tokenizes and stems a text
+    # We need to tokenize to perform the cosine distance analysis. We also stem the text
+    # to transform the words into their root form, so words like "run" and "running" are
+    # grouped into the same token.
+    # Create an English language SnowballStemmer object
+    stemmer_en = SnowballStemmer("english")
+
+    # testing the function:
+    # print( tokenize_and_stem("[Drama, Romance, War] At a U.S. Army base at 1945") )
+
+    ### Vectorizing the movie plot and genres with TfidfVectorizer
+    tfidf_vectorizer = TfidfVectorizer( max_features=50000, stop_words='english',
+                                        tokenizer=tokenize_and_stem, ngram_range=(1,2) )
+
+    tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)
+    #print(tfidf_matrix.shape)
+
+
+    ### Calculating the Cosine Similarity
+    cosine_sim = cosine_similarity(tfidf_matrix)
+    #print(cosine_sim[0:4,0:4], cosine_sim.shape)
+    movie_index = df[ df['imdb_id'] == imdb_id_given_by_user ].index
+
+    # Find the cosine similarity score against all the movies in the df, adding the index to the tuple
+    similar_movies = list(enumerate(cosine_sim[movie_index,:][0]))
+
+    # Sort the result in descending order
+    sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)
+    #sorted_similar_movies[:5]
+
+    ### Finding the data of the 15 movies most similar to the user's choice
+    top_15_indexes = [ m[0] for m in sorted_similar_movies[:15] ]
+    top_15_scores = [ m[1] for m in sorted_similar_movies[:15] ]
+    top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
+    top_15_sim['similarity'] = top_15_scores
+
+    return top_15_sim
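With the script now folded into main(movie_name), importing imdb_recommender no longer triggers the scrape, so app.py can call it per request. A usage sketch (assumes imdb_top_1000.csv sits next to the module, the NLTK punkt data is downloaded, network access to imdb.com is available, and the column names are as used in the diff):

import imdb_recommender

# Ask for the movies most similar to the user's input; the IMDb search runs
# over the network, so the call can take a few seconds.
top_15 = imdb_recommender.main('O poderoso chefão')
print(top_15[['title', 'year', 'similarity']])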