shoukaku committed on
Commit
b75eb47
1 Parent(s): b069bf8

refactor code and fix imports

Browse files
.gitignore ADDED
File without changes
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
README.md DELETED
@@ -1,12 +0,0 @@
1
- ---
2
- title: Movie Recommendation
3
- emoji: 💩
4
- colorFrom: gray
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 3.0.13
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
  df = pd.read_csv('movie_data/movie_data.csv')
8
 
9
  recommender = Model(df)
10
- recommender._load('movie_data/_similarity')
11
 
12
  movie = [[df['id'].iloc[i], df['title'].iloc[i], df['year'].iloc[i]] for i in range(len(df))]
13
  corpus = df['title'].values.tolist()
@@ -22,12 +22,12 @@ def search_movie(title):
22
  search_res = 10
23
  s_res = search_model.search(title, search_res)
24
  s_res = [i[0] for i in s_res]
25
- return(f'Search Results For "{title}"\n' + "\n".join([f"[{i[0]}] {i[1]} ({int(i[2])})" for i in s_res]))
26
 
27
  def get_recommendation(ids):
28
  id = [int(id) for id in ids.split()]
29
  rec = recommender.forward(id)
30
- return(f'Movies That You Might Like\n' + "\n".join([f"- {i[0]} ({int(i[1])})" for i in rec]))
31
 
32
  interface = gradio.Blocks()
33
 
 
7
  df = pd.read_csv('movie_data/movie_data.csv')
8
 
9
  recommender = Model(df)
10
+ recommender.load('movie_data/_similarity')
11
 
12
  movie = [[df['id'].iloc[i], df['title'].iloc[i], df['year'].iloc[i]] for i in range(len(df))]
13
  corpus = df['title'].values.tolist()
 
22
  search_res = 10
23
  s_res = search_model.search(title, search_res)
24
  s_res = [i[0] for i in s_res]
25
+ return(f'Search Results For "{title}"\n' + "\n".join([f"[{i[0]}] {i[1]} ({i[2]})" for i in s_res]))
26
 
27
def get_recommendation(ids):
    """Parse whitespace-separated movie ids and format the recommender's top picks.

    `ids` is a user-supplied string such as "12 345"; each token must be an
    integer movie id. Returns a display string, one "- title (year)" line per pick.
    """
    movie_ids = [int(token) for token in ids.split()]
    picks = recommender.forward(movie_ids)
    lines = "\n".join([f"- {p[0]} ({p[1]})" for p in picks])
    return f'Movies That You Might Like\n' + lines
31
 
32
  interface = gradio.Blocks()
33
 
models/data_preprocessing.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import pandas as pd
3
+ from recommendation_model import Model
4
+
5
+ """
6
+ The dataset is obtained from TMDB 5000 Movie Dataset
7
+ https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
8
+ """
9
+
10
+
11
def get_name(x):
    """Parse a stringified list of {'name': ...} dicts and join the first
    five names, lowercased, with ', '."""
    entries = ast.literal_eval(x)
    names = [entry['name'].lower() for entry in entries]
    return ', '.join(names[:5])
13
+
14
+
15
def get_director(x):
    """From a stringified crew list (dicts with 'name'/'job'), join the
    lowercased names of everyone whose job is 'director'."""
    crew = ast.literal_eval(x)
    directors = (member['name'].lower() for member in crew
                 if member['job'].lower() == 'director')
    return ', '.join(directors)
17
+
18
+
19
def get_year(x):
    """Return the first four characters of the stringified value,
    e.g. '2009-12-10' -> '2009'."""
    text = str(x)
    return text[:4]
21
+
22
+
23
def normalize_data(x):
    """Min-max scale a pandas Series into [0, 1].

    NOTE(review): a constant Series makes the denominator zero
    (NaN/inf result) — confirm inputs always vary.
    """
    lo = x.min()
    hi = x.max()
    return (x - lo) / (hi - lo)
25
+
26
+
27
# Build the cleaned movie dataset used by the recommender, then fit and
# persist the pairwise similarity matrix to '_similarity'.

# Merge the two TMDB dumps on the shared movie id.
raw1 = pd.read_csv('tmdb_5000_movies.csv')
raw2 = pd.read_csv('tmdb_5000_credits.csv')
raw2 = raw2.rename(columns={'movie_id': 'id'})
df = pd.merge(raw1, raw2, on='id')

# Drop columns the recommender never reads.
df = df.drop([
    'budget',
    'homepage',
    'overview',
    'tagline',
    'status',
    'production_companies',
    'production_countries',
    'revenue',
    'spoken_languages',
    'title_x',
    'title_y',
    'vote_count'
], axis=1)

# Flatten the JSON-encoded columns into plain comma-separated strings.
df['genres'] = df['genres'].map(get_name)
df['keywords'] = df['keywords'].map(get_name)
df['cast'] = df['cast'].map(get_name)
df['crew'] = df['crew'].map(get_director)
df['release_date'] = df['release_date'].map(get_year)

# Re-key ids to dense 0..n-1 row positions. Vectorized assignment replaces
# the original per-row `df.loc[i, 'id'] = i` loop (O(n) indexed writes).
df['id'] = range(len(df))

df = df.rename(columns={
    'original_language': 'language',
    'original_title': 'title',
    'release_date': 'year',
    'vote_average': 'rating',
    'crew': 'director'
})
df = df[[
    'id',
    'title',
    'genres',
    'keywords',
    'director',
    'cast',
    'year',
    'language',
    'runtime',
    'popularity',
    'rating'
]]

# Stringify the key columns and min-max normalize the numeric ones.
# astype(str) replaces the original apply(lambda x: str(x)).
df['id'] = df['id'].astype(str)
df['year'] = df['year'].astype(str)
df['runtime'] = normalize_data(df['runtime'])
df['popularity'] = normalize_data(df['popularity'])
df['rating'] = normalize_data(df['rating'])
# NOTE(review): the original also built an unused df_trim
# (title/genres/keywords/director/cast slice); removed here.

# Fit pairwise similarities and save them for the app to load.
model = Model(df)
model.fit(save=True)
models/recommendation_model.py CHANGED
@@ -1,13 +1,21 @@
1
  import pickle
 
 
 
 
 
2
 
3
  class Model:
4
  def __init__(self, corpus):
5
  self.data = corpus
6
  self.similarity = [[-1 for i in range(len(corpus))] for j in range(len(corpus))]
7
- def _load(self, path = '_similarity'):
 
8
  with open(path, 'rb') as fp:
9
  self.similarity = pickle.load(fp)
10
- def get_cosine(self, id1, id2):
 
 
11
  col = [
12
  'title',
13
  'genres',
@@ -20,8 +28,8 @@ class Model:
20
  data2 = df.iloc[id2]
21
  res = 0
22
  for i in col:
23
- vec1 = collections.Counter(re_words.findall(data1[i]))
24
- vec2 = collections.Counter(re_words.findall(data2[i]))
25
  intersect = set(vec1.keys()) & set(vec2.keys())
26
  a = sum([vec1[x] * vec2[x] for x in intersect])
27
  sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
@@ -32,8 +40,9 @@ class Model:
32
  else:
33
  res += float(a) / b
34
  return res / len(col)
35
- def fit(self, save = False):
36
- for id in tqdm(range(len(self.data)), desc = 'Progress'):
 
37
  for i in range(len(self.data)):
38
  if i == id:
39
  self.similarity[id][i] = 0
@@ -41,20 +50,21 @@ class Model:
41
  temp_sim = self.get_cosine(id, i)
42
  self.similarity[id][i] = temp_sim
43
  self.similarity[i][id] = temp_sim
44
- if(save):
45
  with open('_similarity', 'wb') as fp:
46
  pickle.dump(self.similarity, fp)
 
47
  def forward(self, ids):
48
  res = [1 for i in range(len(self.data))]
49
  for id in ids:
50
  res = [res[i] * self.similarity[id][i] for i in range(len(self.data))]
51
  res = [[i, res[i]] for i in range(len(self.data))]
52
- res.sort(key = lambda x: x[1], reverse = True)
53
  res = [
54
  [
55
- self.data.loc[movie[0], ('title')],
56
- self.data.loc[movie[0], ('year')],
57
  movie[1]
58
  ] for movie in res
59
  ]
60
- return res[:10]
 
1
  import pickle
2
+ import re
3
+ import collections
4
+ import math
5
+ from tqdm import tqdm
6
+
7
 
8
  class Model:
9
  def __init__(self, corpus):
10
  self.data = corpus
11
  self.similarity = [[-1 for i in range(len(corpus))] for j in range(len(corpus))]
12
+
13
def load(self, path='_similarity'):
    """Restore a previously pickled similarity matrix from `path`.

    NOTE(review): pickle.load executes arbitrary code when the file is
    untrusted — only load files produced by Model.fit.
    """
    with open(path, 'rb') as handle:
        self.similarity = pickle.load(handle)
16
+
17
+ @staticmethod
18
+ def get_cosine(self, df, id1, id2):
19
  col = [
20
  'title',
21
  'genres',
 
28
  data2 = df.iloc[id2]
29
  res = 0
30
  for i in col:
31
+ vec1 = collections.Counter(re_words.findall(data1[i]))
32
+ vec2 = collections.Counter(re_words.findall(data2[i]))
33
  intersect = set(vec1.keys()) & set(vec2.keys())
34
  a = sum([vec1[x] * vec2[x] for x in intersect])
35
  sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
 
40
  else:
41
  res += float(a) / b
42
  return res / len(col)
43
+
44
+ def fit(self, save=False):
45
+ for id in tqdm(range(len(self.data)), desc='Progress'):
46
  for i in range(len(self.data)):
47
  if i == id:
48
  self.similarity[id][i] = 0
 
50
  temp_sim = self.get_cosine(id, i)
51
  self.similarity[id][i] = temp_sim
52
  self.similarity[i][id] = temp_sim
53
+ if save:
54
  with open('_similarity', 'wb') as fp:
55
  pickle.dump(self.similarity, fp)
56
+
57
def forward(self, ids):
    """Recommend movies for a set of seed movie ids.

    Each movie's score is the product of its similarity to every seed id
    (seeds themselves score 0, pushing them to the bottom). Returns up to
    10 [title, year, score] rows, best first.
    """
    total = len(self.data)
    scores = [1] * total
    for seed in ids:
        row = self.similarity[seed]
        scores = [scores[i] * row[i] for i in range(total)]
    # Stable descending sort of indices by score — ties keep row order,
    # matching a stable sort of [index, score] pairs.
    order = sorted(range(total), key=lambda i: scores[i], reverse=True)
    return [
        [self.data.loc[i, 'title'], self.data.loc[i, 'year'], scores[i]]
        for i in order[:10]
    ]
models/search_model.py CHANGED
@@ -1,6 +1,7 @@
1
  import gensim
2
  import pandas as pd
3
 
 
4
  class MovieSearch:
5
  def __init__(self, movie, corpus, stopwords):
6
  self.movie = movie
@@ -10,12 +11,13 @@ class MovieSearch:
10
  p_corpus = [[w for w in doc if w not in stopwords] for doc in p_corpus]
11
  self.dictionary = gensim.corpora.Dictionary(p_corpus)
12
  self.bow_corpus = [self.dictionary.doc2bow(doc) for doc in p_corpus]
13
- self.model = gensim.models.LsiModel(self.bow_corpus, id2word = self.dictionary)
 
14
  def search(self, query, len_results):
15
  vec_bow = self.dictionary.doc2bow(query.lower().split())
16
  vec_model = self.model[vec_bow]
17
  index = gensim.similarities.MatrixSimilarity(self.model[self.bow_corpus])
18
  sims = index[vec_model]
19
  sims = [[self.movie[i], sims[i]] for i in range(len(sims))]
20
- sims.sort(key = lambda x: x[1], reverse = True)
21
- return(sims[:len_results])
 
1
  import gensim
2
  import pandas as pd
3
 
4
+
5
  class MovieSearch:
6
  def __init__(self, movie, corpus, stopwords):
7
  self.movie = movie
 
11
  p_corpus = [[w for w in doc if w not in stopwords] for doc in p_corpus]
12
  self.dictionary = gensim.corpora.Dictionary(p_corpus)
13
  self.bow_corpus = [self.dictionary.doc2bow(doc) for doc in p_corpus]
14
+ self.model = gensim.models.LsiModel(self.bow_corpus, id2word=self.dictionary)
15
+
16
def search(self, query, len_results):
    """Rank the corpus against a free-text `query` and return the top
    `len_results` entries as [movie, score] pairs, best first."""
    query_bow = self.dictionary.doc2bow(query.lower().split())
    query_vec = self.model[query_bow]
    # NOTE(review): the similarity index is rebuilt on every call;
    # consider caching it in __init__ if searches are frequent.
    sim_index = gensim.similarities.MatrixSimilarity(self.model[self.bow_corpus])
    raw_scores = sim_index[query_vec]
    ranked = [[self.movie[i], score] for i, score in enumerate(raw_scores)]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:len_results]
movie_data/_similarity DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a50ddfc9a9be319ac3f7725240cc3965e30bb29c568937860cf2f99c65e9726
3
- size 207661759
 
 
 
 
movie_data/movie_data.csv DELETED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,2 +1,4 @@
 
1
  pandas
2
- gensim
 
 
1
+ numpy
2
  pandas
3
+ gensim
4
+ tqdm