ronirigoni committed on
Commit
5716fc9
1 Parent(s): 998ecb0

app.py versao hello world

Browse files
Files changed (2) hide show
  1. app.py +11 -0
  2. imdb_recommender.py +83 -78
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ #import imdb_recommender
3
+
4
def greet(name):
    """Return the hello-world greeting for *name* shown by the Gradio demo."""
    return f"Hello {name}!!"
6

# Expose the greeter as a text-in/text-out Gradio demo and start the server.
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()

# Intended final wiring (recommender backend) — kept disabled in this hello-world version.
#iface = gr.Interface(fn=imdb_recommender.main, inputs="text", outputs="text")
#iface.launch()
imdb_recommender.py CHANGED
@@ -8,63 +8,8 @@ from sklearn.metrics.pairwise import cosine_similarity
8
  import requests
9
  from bs4 import BeautifulSoup
10
 
11
- movie_name_given_by_user = 'O poderoso chefão'
12
-
13
- search_url = 'https://www.imdb.com/find'
14
- payload = {'q': movie_name_given_by_user}
15
- soup_search = BeautifulSoup(requests.get(search_url, params=payload).text, 'html.parser')
16
- movie_found = soup_search.select("a[href*='/title/tt']")
17
-
18
- if movie_found == []:
19
- raise SystemExit(f'Filme \"{movie_name_given_by_user}\" não encontrado no Imdb. Verifique e tente novamente.')
20
-
21
- # Assumes that the first result of the imdb search is the correct one
22
- imdb_id_given_by_user = str.replace(movie_found[0].attrs['href'], "/title/", "")[:-1]
23
- df = pd.read_csv('imdb_top_1000.csv')
24
- df.head()
25
- ### The user choose a movie to compare. This movie is in the Top 1000?
26
- # If we didn't find the movie in the DataFrame -> Get movie data from imdb.com
27
- if df[ df['imdb_id'] == imdb_id_given_by_user ].shape[0] == 0:
28
- genres = []
29
- url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
30
- soup = BeautifulSoup(requests.get(url).text, 'html.parser')
31
-
32
- title = soup.find('h1').text
33
- plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
34
- director = soup.select("a[href*='/name/nm']")[0].text
35
- year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
36
- genres_soup = soup.select("a[href*='/search/title?genres=']")
37
- for genre in genres_soup:
38
- genres.append(genre.span.text)
39
- genres = ", ".join(genres)
40
- rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)
41
-
42
- # Adding the new movie to the DataFrame
43
- new_movie = pd.DataFrame([{
44
- 'imdb_id': imdb_id_given_by_user,
45
- 'title': title,
46
- 'rate': rate,
47
- 'year': year,
48
- 'director': director,
49
- 'genres': genres,
50
- 'plot': plot }])
51
-
52
- df = pd.concat([df, new_movie], ignore_index=True)
53
-
54
- df.tail()
55
- ### Concatenating 'genres' and 'director' with 'plot' to tokenize later
56
- # Here we dupplicate the director name so it has more weight comparing two movies
57
- df['combined_features'] = df['title'] + " " + df['director'] + " " + \
58
- df['genres'].apply(lambda x: str.replace(x, '\'', '')) + \
59
- " " + df['plot']
60
- df['combined_features'].tail()
61
- ### Function that tokenize and stem a text
62
- We need to tokenize to perform the cosine distance analisys. We also stem the text to transform the words into their root form, so words like "run" and "running" are grouped in the same token.
63
- # Create an English language SnowballStemmer object
64
- stemmer_en = SnowballStemmer("english")
65
-
66
  def tokenize_and_stem(text):
67
-
68
  # Tokenize by sentence, then by word
69
  tokens = [ word for sent in sent_tokenize(text) for word in word_tokenize(sent) ]
70
 
@@ -76,29 +21,89 @@ def tokenize_and_stem(text):
76
 
77
  return stems
78
 
79
- # testing the function:
80
- # print( tokenize_and_stem("[Drama, Romance, War] At a U.S. Army base at 1945") )
81
- ### Vectorizing the movie plot and genres with TfidfVectorizer
82
- tfidf_vectorizer = TfidfVectorizer( max_features=50000, stop_words='english',
83
- tokenizer=tokenize_and_stem, ngram_range=(1,2) )
 
 
 
 
84
 
85
- tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)
 
 
 
86
 
87
- print(tfidf_matrix.shape)
88
- ### Calculating the Cosine Similarity
89
- cosine_sim = cosine_similarity(tfidf_matrix)
90
- print(cosine_sim[0:4,0:4], cosine_sim.shape)
91
- movie_index = df[ df['imdb_id'] == imdb_id_given_by_user ].index
 
92
 
93
- # Find the cosine similarity tax with all the movies from the df, addint the index in the tupple
94
- similar_movies = list(enumerate(cosine_sim[movie_index,:][0]))
 
 
 
 
 
 
 
95
 
96
- # Sort the result, in descending order
97
- sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)
98
- sorted_similar_movies[:5]
99
- ### Finding the data from the 10 movies more simmilar to the user's choice
100
- top_15_indexes = [ m[0] for m in sorted_similar_movies[:15] ]
101
- top_15_scores = [ m[1] for m in sorted_similar_movies[:15] ]
102
- top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
103
- top_15_sim['similarity'] = top_15_scores
104
- top_15_sim
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import requests
9
  from bs4 import BeautifulSoup
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def tokenize_and_stem(text):
12
+
13
  # Tokenize by sentence, then by word
14
  tokens = [ word for sent in sent_tokenize(text) for word in word_tokenize(sent) ]
15
 
 
21
 
22
  return stems
23
 
24
def main(movie_name):
    """Recommend the 15 IMDB movies most similar to *movie_name*.

    Searches imdb.com for the title, makes sure the movie is present in the
    local top-1000 DataFrame (scraping its IMDB page if it is not), builds a
    TF-IDF matrix over title/director/genres/plot, and returns a DataFrame of
    the 15 most cosine-similar movies with an added 'similarity' column.

    Raises SystemExit when the title is not found on IMDB.
    """
    # Look the title up on IMDB's search page; every title link matches
    # the '/title/tt...' href pattern.
    search_url = 'https://www.imdb.com/find'
    payload = {'q': movie_name}
    soup_search = BeautifulSoup(requests.get(search_url, params=payload).text, 'html.parser')
    movie_found = soup_search.select("a[href*='/title/tt']")

    if not movie_found:
        # NOTE(review): SystemExit terminates the whole process, which is harsh
        # inside a web app; kept because callers may rely on this exception type.
        raise SystemExit(f'Filme \"{movie_name}\" não encontrado no Imdb. Verifique e tente novamente.')

    # Assumes that the first result of the IMDB search is the correct one;
    # the href looks like '/title/tt0068646/', so strip prefix and trailing '/'.
    imdb_id_given_by_user = movie_found[0].attrs['href'].replace("/title/", "")[:-1]
    df = pd.read_csv('imdb_top_1000.csv')

    ### The user chose a movie to compare. Is this movie in the Top 1000?
    # If the movie is not in the DataFrame -> scrape its data from imdb.com.
    if df[df['imdb_id'] == imdb_id_given_by_user].shape[0] == 0:
        genres = []
        url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')

        title = soup.find('h1').text
        plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
        director = soup.select("a[href*='/name/nm']")[0].text
        year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
        genres_soup = soup.select("a[href*='/search/title?genres=']")
        for genre in genres_soup:
            genres.append(genre.span.text)
        genres = ", ".join(genres)
        rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)

        # Adding the new movie to the DataFrame
        new_movie = pd.DataFrame([{
            'imdb_id': imdb_id_given_by_user,
            'title': title,
            'rate': rate,
            'year': year,
            'director': director,
            'genres': genres,
            'plot': plot}])

        df = pd.concat([df, new_movie], ignore_index=True)

    ### Concatenating 'title', 'director' and 'genres' with 'plot' to tokenize later
    # NOTE(review): an earlier comment said the director name is duplicated for
    # extra weight, but the director appears only once in this expression.
    df['combined_features'] = df['title'] + " " + df['director'] + " " + \
        df['genres'].apply(lambda x: x.replace('\'', '')) + \
        " " + df['plot']

    # We need to tokenize to perform the cosine-distance analysis; stemming
    # maps words to their root form so "run" and "running" share one token.
    # (This sentence was a bare statement in the original — a SyntaxError.)
    # NOTE(review): module-level tokenize_and_stem presumably reads stemmer_en
    # as a global (it was created at module level before this refactor);
    # `global` restores that binding — TODO confirm against its elided body.
    global stemmer_en
    stemmer_en = SnowballStemmer("english")

    ### Vectorizing the movie plot and genres with TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=50000, stop_words='english',
                                       tokenizer=tokenize_and_stem, ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)

    ### Calculating the cosine similarity between every pair of movies
    cosine_sim = cosine_similarity(tfidf_matrix)
    movie_index = df[df['imdb_id'] == imdb_id_given_by_user].index

    # Similarity of the chosen movie against all movies, paired with each index.
    similar_movies = list(enumerate(cosine_sim[movie_index, :][0]))

    # Sort the result in descending order of similarity.
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)

    ### Collecting the data of the 15 movies most similar to the user's choice
    top_15_indexes = [m[0] for m in sorted_similar_movies[:15]]
    top_15_scores = [m[1] for m in sorted_similar_movies[:15]]
    top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
    top_15_sim['similarity'] = top_15_scores

    return top_15_sim