ronirigoni committed on
Commit
5716fc9
1 Parent(s): 998ecb0

app.py versao hello world

Browse files
Files changed (2) hide show
  1. app.py +11 -0
  2. imdb_recommender.py +83 -78
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ #import imdb_recommender
3
+
4
def greet(name):
    """Return the hello-world greeting for *name* shown by the Gradio demo."""
    return f"Hello {name}!!"
6

# Expose the greeter as a text-in/text-out Gradio demo and start the server.
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()

# Intended final wiring (recommender backend) — kept disabled in this hello-world version.
#iface = gr.Interface(fn=imdb_recommender.main, inputs="text", outputs="text")
#iface.launch()
imdb_recommender.py CHANGED
@@ -8,63 +8,8 @@ from sklearn.metrics.pairwise import cosine_similarity
8
  import requests
9
  from bs4 import BeautifulSoup
10
 
11
- movie_name_given_by_user = 'O poderoso chefão'
12
-
13
- search_url = 'https://www.imdb.com/find'
14
- payload = {'q': movie_name_given_by_user}
15
- soup_search = BeautifulSoup(requests.get(search_url, params=payload).text, 'html.parser')
16
- movie_found = soup_search.select("a[href*='/title/tt']")
17
-
18
- if movie_found == []:
19
- raise SystemExit(f'Filme \"{movie_name_given_by_user}\" não encontrado no Imdb. Verifique e tente novamente.')
20
-
21
- # Assumes that the first result of the imdb search is the correct one
22
- imdb_id_given_by_user = str.replace(movie_found[0].attrs['href'], "/title/", "")[:-1]
23
- df = pd.read_csv('imdb_top_1000.csv')
24
- df.head()
25
- ### The user choose a movie to compare. This movie is in the Top 1000?
26
- # If we didn't find the movie in the DataFrame -> Get movie data from imdb.com
27
- if df[ df['imdb_id'] == imdb_id_given_by_user ].shape[0] == 0:
28
- genres = []
29
- url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
30
- soup = BeautifulSoup(requests.get(url).text, 'html.parser')
31
-
32
- title = soup.find('h1').text
33
- plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
34
- director = soup.select("a[href*='/name/nm']")[0].text
35
- year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
36
- genres_soup = soup.select("a[href*='/search/title?genres=']")
37
- for genre in genres_soup:
38
- genres.append(genre.span.text)
39
- genres = ", ".join(genres)
40
- rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)
41
-
42
- # Adding the new movie to the DataFrame
43
- new_movie = pd.DataFrame([{
44
- 'imdb_id': imdb_id_given_by_user,
45
- 'title': title,
46
- 'rate': rate,
47
- 'year': year,
48
- 'director': director,
49
- 'genres': genres,
50
- 'plot': plot }])
51
-
52
- df = pd.concat([df, new_movie], ignore_index=True)
53
-
54
- df.tail()
55
- ### Concatenating 'genres' and 'director' with 'plot' to tokenize later
56
- # Here we dupplicate the director name so it has more weight comparing two movies
57
- df['combined_features'] = df['title'] + " " + df['director'] + " " + \
58
- df['genres'].apply(lambda x: str.replace(x, '\'', '')) + \
59
- " " + df['plot']
60
- df['combined_features'].tail()
61
- ### Function that tokenize and stem a text
62
- We need to tokenize to perform the cosine distance analisys. We also stem the text to transform the words into their root form, so words like "run" and "running" are grouped in the same token.
63
- # Create an English language SnowballStemmer object
64
- stemmer_en = SnowballStemmer("english")
65
-
66
  def tokenize_and_stem(text):
67
-
68
  # Tokenize by sentence, then by word
69
  tokens = [ word for sent in sent_tokenize(text) for word in word_tokenize(sent) ]
70
 
@@ -76,29 +21,89 @@ def tokenize_and_stem(text):
76
 
77
  return stems
78
 
79
- # testing the function:
80
- # print( tokenize_and_stem("[Drama, Romance, War] At a U.S. Army base at 1945") )
81
- ### Vectorizing the movie plot and genres with TfidfVectorizer
82
- tfidf_vectorizer = TfidfVectorizer( max_features=50000, stop_words='english',
83
- tokenizer=tokenize_and_stem, ngram_range=(1,2) )
 
 
 
 
84
 
85
- tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)
 
 
 
86
 
87
- print(tfidf_matrix.shape)
88
- ### Calculating the Cosine Similarity
89
- cosine_sim = cosine_similarity(tfidf_matrix)
90
- print(cosine_sim[0:4,0:4], cosine_sim.shape)
91
- movie_index = df[ df['imdb_id'] == imdb_id_given_by_user ].index
 
92
 
93
- # Find the cosine similarity tax with all the movies from the df, addint the index in the tupple
94
- similar_movies = list(enumerate(cosine_sim[movie_index,:][0]))
 
 
 
 
 
 
 
95
 
96
- # Sort the result, in descending order
97
- sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)
98
- sorted_similar_movies[:5]
99
- ### Finding the data from the 10 movies more simmilar to the user's choice
100
- top_15_indexes = [ m[0] for m in sorted_similar_movies[:15] ]
101
- top_15_scores = [ m[1] for m in sorted_similar_movies[:15] ]
102
- top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
103
- top_15_sim['similarity'] = top_15_scores
104
- top_15_sim
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import requests
9
  from bs4 import BeautifulSoup
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def tokenize_and_stem(text):
12
+
13
  # Tokenize by sentence, then by word
14
  tokens = [ word for sent in sent_tokenize(text) for word in word_tokenize(sent) ]
15
 
 
21
 
22
  return stems
23
 
24
def main(movie_name):
    """Recommend the 15 IMDB movies most similar to *movie_name*.

    Searches imdb.com for the title, makes sure the movie is present in the
    local top-1000 DataFrame (scraping its IMDB page if it is not), builds a
    TF-IDF matrix over title/director/genres/plot, and returns a DataFrame of
    the 15 most cosine-similar movies with an added 'similarity' column.

    Raises SystemExit when the title is not found on IMDB.
    """
    # Look the title up on IMDB's search page; every title link matches
    # the '/title/tt...' href pattern.
    search_url = 'https://www.imdb.com/find'
    payload = {'q': movie_name}
    soup_search = BeautifulSoup(requests.get(search_url, params=payload).text, 'html.parser')
    movie_found = soup_search.select("a[href*='/title/tt']")

    if not movie_found:
        # NOTE(review): SystemExit terminates the whole process, which is harsh
        # inside a web app; kept because callers may rely on this exception type.
        raise SystemExit(f'Filme \"{movie_name}\" não encontrado no Imdb. Verifique e tente novamente.')

    # Assumes that the first result of the IMDB search is the correct one;
    # the href looks like '/title/tt0068646/', so strip prefix and trailing '/'.
    imdb_id_given_by_user = movie_found[0].attrs['href'].replace("/title/", "")[:-1]
    df = pd.read_csv('imdb_top_1000.csv')

    ### The user chose a movie to compare. Is this movie in the Top 1000?
    # If the movie is not in the DataFrame -> scrape its data from imdb.com.
    if df[df['imdb_id'] == imdb_id_given_by_user].shape[0] == 0:
        genres = []
        url = 'https://www.imdb.com/title/' + imdb_id_given_by_user
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')

        title = soup.find('h1').text
        plot = soup.find('span', attrs={'data-testid': 'plot-xl'}).text
        director = soup.select("a[href*='/name/nm']")[0].text
        year = soup.select("a[href*='/releaseinfo?ref_=tt_ov_rdat']")[0].text
        genres_soup = soup.select("a[href*='/search/title?genres=']")
        for genre in genres_soup:
            genres.append(genre.span.text)
        genres = ", ".join(genres)
        rate = float(soup.find('div', {'data-testid': 'hero-rating-bar__aggregate-rating__score'}).span.text)

        # Adding the new movie to the DataFrame
        new_movie = pd.DataFrame([{
            'imdb_id': imdb_id_given_by_user,
            'title': title,
            'rate': rate,
            'year': year,
            'director': director,
            'genres': genres,
            'plot': plot}])

        df = pd.concat([df, new_movie], ignore_index=True)

    ### Concatenating 'title', 'director' and 'genres' with 'plot' to tokenize later
    # NOTE(review): an earlier comment said the director name is duplicated for
    # extra weight, but the director appears only once in this expression.
    df['combined_features'] = df['title'] + " " + df['director'] + " " + \
        df['genres'].apply(lambda x: x.replace('\'', '')) + \
        " " + df['plot']

    # We need to tokenize to perform the cosine-distance analysis; stemming
    # maps words to their root form so "run" and "running" share one token.
    # (This sentence was a bare statement in the original — a SyntaxError.)
    # NOTE(review): module-level tokenize_and_stem presumably reads stemmer_en
    # as a global (it was created at module level before this refactor);
    # `global` restores that binding — TODO confirm against its elided body.
    global stemmer_en
    stemmer_en = SnowballStemmer("english")

    ### Vectorizing the movie plot and genres with TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=50000, stop_words='english',
                                       tokenizer=tokenize_and_stem, ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'].values)

    ### Calculating the cosine similarity between every pair of movies
    cosine_sim = cosine_similarity(tfidf_matrix)
    movie_index = df[df['imdb_id'] == imdb_id_given_by_user].index

    # Similarity of the chosen movie against all movies, paired with each index.
    similar_movies = list(enumerate(cosine_sim[movie_index, :][0]))

    # Sort the result in descending order of similarity.
    sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)

    ### Collecting the data of the 15 movies most similar to the user's choice
    top_15_indexes = [m[0] for m in sorted_similar_movies[:15]]
    top_15_scores = [m[1] for m in sorted_similar_movies[:15]]
    top_15_sim = df.iloc[top_15_indexes].drop(['combined_features'], axis=1)
    top_15_sim['similarity'] = top_15_scores

    return top_15_sim