ronirigoni committed
Commit
2e720e3
1 Parent(s): e741969

First test of the recommender

Files changed (1)
  1. imdb_recommender.py +8 -2
imdb_recommender.py CHANGED
@@ -69,18 +69,22 @@ def main(movie_name):
 
 
     ### Concatenating 'genres' and 'director' with 'plot' to tokenize later
-    # Here we duplicate the director name so it has more weight when comparing two movies
     df['combined_features'] = df['title'] + " " + df['director'] + " " + \
                               df['genres'].apply(lambda x: str.replace(x, '\'', '')) + \
                               " " + df['plot']
     df['combined_features'].tail()
+
+
     ### Function that tokenizes and stems a text
-    We need to tokenize to perform the cosine distance analysis. We also stem the text to transform the words into their root form, so words like "run" and "running" are grouped in the same token.
+    # We need to tokenize to perform the cosine distance analysis. We also stem the text to transform the words into their root form, so words like "run" and "running" are grouped in the same token.
+
     # Create an English language SnowballStemmer object
     stemmer_en = SnowballStemmer("english")
 
     # testing the function:
     # print( tokenize_and_stem("[Drama, Romance, War] At a U.S. Army base at 1945") )
+
+
     ### Vectorizing the movie plot and genres with TfidfVectorizer
     tfidf_vectorizer = TfidfVectorizer( max_features=50000, stop_words='english',
                                         tokenizer=tokenize_and_stem, ngram_range=(1,2) )
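Note: tokenize_and_stem is passed to the vectorizer above but defined outside the lines this commit touches. Below is a minimal sketch of what such a function could look like, assuming NLTK tokenization plus the SnowballStemmer the script already creates; the lowercasing and the alphabetic-token filter are assumptions, not code from this commit.

import re

import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt', quiet=True)  # tokenizer data used by nltk.word_tokenize

stemmer_en = SnowballStemmer("english")

def tokenize_and_stem(text):
    # Split the text into tokens, drop anything that is not a plain word,
    # then stem each word so that "run" and "running" share one token.
    tokens = nltk.word_tokenize(text.lower())
    words = [t for t in tokens if re.match(r'^[a-z]+$', t)]
    return [stemmer_en.stem(w) for w in words]

# Same configuration as the vectorizer in the diff above:
tfidf_vectorizer = TfidfVectorizer(max_features=50000, stop_words='english',
                                   tokenizer=tokenize_and_stem, ngram_range=(1, 2))

With this sketch, the commented-out test call above would return stemmed tokens such as 'drama', 'romanc' and 'war'.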
@@ -100,6 +104,8 @@ def main(movie_name):
     # Sort the result, in descending order
     sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)
     sorted_similar_movies[:5]
+
+
     ### Finding the data for the 10 movies most similar to the user's choice
     top_15_indexes = [ m[0] for m in sorted_similar_movies[:15] ]
     top_15_scores = [ m[1] for m in sorted_similar_movies[:15] ]
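Note: similar_movies is consumed by the sorting step above but built outside this hunk. A plausible sketch, reusing df and tfidf_vectorizer from the script and assuming similar_movies pairs each row index with the cosine similarity between the chosen movie and every movie's TF-IDF vector; movie_index and the exact-title lookup are assumptions.

from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF matrix over the combined title/director/genres/plot text.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

# Row index of the user's movie (assumes 'title' holds an exact match).
movie_index = df.index[df['title'] == movie_name][0]

# Similarity of the chosen movie against every movie, paired with its
# row index, which is the shape the sorting step above expects.
scores = cosine_similarity(tfidf_matrix[movie_index], tfidf_matrix).flatten()
similar_movies = list(enumerate(scores))

Sorted in descending order, the query movie itself would come out first with a score of 1.0, so the top-15 slice effectively yields the query plus its 14 nearest neighbours.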
 