ronirigoni committed
Commit
2e720e3
1 Parent(s): e741969

First test of the recommender

Files changed (1)
  1. imdb_recommender.py +8 -2
imdb_recommender.py CHANGED
@@ -69,18 +69,22 @@ def main(movie_name):
 
 
     ### Concatenating 'genres' and 'director' with 'plot' to tokenize later
-    # Here we duplicate the director name so it has more weight when comparing two movies
     df['combined_features'] = df['title'] + " " + df['director'] + " " + \
                               df['genres'].apply(lambda x: str.replace(x, '\'', '')) + \
                               " " + df['plot']
     df['combined_features'].tail()
+
+
     ### Function that tokenizes and stems a text
-    We need to tokenize to perform the cosine distance analysis. We also stem the text to transform the words into their root form, so words like "run" and "running" are grouped in the same token.
+    # We need to tokenize to perform the cosine distance analysis. We also stem the text to transform the words into their root form, so words like "run" and "running" are grouped in the same token.
+
     # Create an English language SnowballStemmer object
     stemmer_en = SnowballStemmer("english")
 
     # testing the function:
     # print( tokenize_and_stem("[Drama, Romance, War] At a U.S. Army base at 1945") )
+
+
     ### Vectorizing the movie plot and genres with TfidfVectorizer
     tfidf_vectorizer = TfidfVectorizer( max_features=50000, stop_words='english',
                                         tokenizer=tokenize_and_stem, ngram_range=(1,2) )
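Note: tokenize_and_stem is passed to the vectorizer above but defined outside the lines this commit touches. Below is a minimal sketch of what such a function could look like, assuming NLTK tokenization plus the SnowballStemmer the script already creates; the lowercasing and the alphabetic-token filter are assumptions, not code from this commit.

import re

import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt', quiet=True)  # tokenizer data used by nltk.word_tokenize

stemmer_en = SnowballStemmer("english")

def tokenize_and_stem(text):
    # Split the text into tokens, drop anything that is not a plain word,
    # then stem each word so that "run" and "running" share one token.
    tokens = nltk.word_tokenize(text.lower())
    words = [t for t in tokens if re.match(r'^[a-z]+$', t)]
    return [stemmer_en.stem(w) for w in words]

# Same configuration as the vectorizer in the diff above:
tfidf_vectorizer = TfidfVectorizer(max_features=50000, stop_words='english',
                                   tokenizer=tokenize_and_stem, ngram_range=(1, 2))

With this sketch, the commented-out test call above would return stemmed tokens such as 'drama', 'romanc' and 'war'.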
@@ -100,6 +104,8 @@ def main(movie_name):
     # Sort the result, in descending order
     sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)
     sorted_similar_movies[:5]
+
+
     ### Finding the data for the 10 movies most similar to the user's choice
     top_15_indexes = [ m[0] for m in sorted_similar_movies[:15] ]
     top_15_scores = [ m[1] for m in sorted_similar_movies[:15] ]
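Note: similar_movies is consumed by the sorting step above but built outside this hunk. A plausible sketch, reusing df and tfidf_vectorizer from the script and assuming similar_movies pairs each row index with the cosine similarity between the chosen movie and every movie's TF-IDF vector; movie_index and the exact-title lookup are assumptions.

from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF matrix over the combined title/director/genres/plot text.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

# Row index of the user's movie (assumes 'title' holds an exact match).
movie_index = df.index[df['title'] == movie_name][0]

# Similarity of the chosen movie against every movie, paired with its
# row index, which is the shape the sorting step above expects.
scores = cosine_similarity(tfidf_matrix[movie_index], tfidf_matrix).flatten()
similar_movies = list(enumerate(scores))

Sorted in descending order, the query movie itself would come out first with a score of 1.0, so the top-15 slice effectively yields the query plus its 14 nearest neighbours.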
 