Spaces:
Runtime error
Runtime error
refactor code and fix imports
Browse files
- .gitignore +0 -0
- .idea/.gitignore +3 -0
- README.md +0 -12
- app.py +3 -3
- models/data_preprocessing.py +80 -0
- models/recommendation_model.py +21 -11
- models/search_model.py +5 -3
- movie_data/_similarity +0 -3
- movie_data/movie_data.csv +0 -0
- requirements.txt +3 -1
.gitignore
ADDED
File without changes
|
.idea/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
README.md
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
---
|
2 |
-
title: Movie Recommendation
|
3 |
-
emoji: 💩
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: yellow
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.0.13
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
|
|
7 |
df = pd.read_csv('movie_data/movie_data.csv')
|
8 |
|
9 |
recommender = Model(df)
|
10 |
-
recommender.
|
11 |
|
12 |
movie = [[df['id'].iloc[i], df['title'].iloc[i], df['year'].iloc[i]] for i in range(len(df))]
|
13 |
corpus = df['title'].values.tolist()
|
@@ -22,12 +22,12 @@ def search_movie(title):
|
|
22 |
search_res = 10
|
23 |
s_res = search_model.search(title, search_res)
|
24 |
s_res = [i[0] for i in s_res]
|
25 |
-
return(f'Search Results For "{title}"\n' + "\n".join([f"[{i[0]}] {i[1]} ({
|
26 |
|
27 |
def get_recommendation(ids):
|
28 |
id = [int(id) for id in ids.split()]
|
29 |
rec = recommender.forward(id)
|
30 |
-
return(f'Movies That You Might Like\n' + "\n".join([f"- {i[0]} ({
|
31 |
|
32 |
interface = gradio.Blocks()
|
33 |
|
|
|
7 |
df = pd.read_csv('movie_data/movie_data.csv')
|
8 |
|
9 |
recommender = Model(df)
|
10 |
+
recommender.load('movie_data/_similarity')
|
11 |
|
12 |
movie = [[df['id'].iloc[i], df['title'].iloc[i], df['year'].iloc[i]] for i in range(len(df))]
|
13 |
corpus = df['title'].values.tolist()
|
|
|
22 |
search_res = 10
|
23 |
s_res = search_model.search(title, search_res)
|
24 |
s_res = [i[0] for i in s_res]
|
25 |
+
return(f'Search Results For "{title}"\n' + "\n".join([f"[{i[0]}] {i[1]} ({i[2]})" for i in s_res]))
|
26 |
|
27 |
def get_recommendation(ids):
|
28 |
id = [int(id) for id in ids.split()]
|
29 |
rec = recommender.forward(id)
|
30 |
+
return(f'Movies That You Might Like\n' + "\n".join([f"- {i[0]} ({i[1]})" for i in rec]))
|
31 |
|
32 |
interface = gradio.Blocks()
|
33 |
|
models/data_preprocessing.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
|
2 |
+
import pandas as pd
|
3 |
+
from recommendation_model import Model
|
4 |
+
|
5 |
+
"""
|
6 |
+
The dataset is obtained from TMDB 5000 Movie Dataset
|
7 |
+
https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
|
8 |
+
"""
|
9 |
+
|
10 |
+
|
11 |
+
def get_name(x):
|
12 |
+
return ', '.join([i['name'].lower() for i in ast.literal_eval(x)][:5])
|
13 |
+
|
14 |
+
|
15 |
+
def get_director(x):
|
16 |
+
return ', '.join(i['name'].lower() for i in ast.literal_eval(x) if i['job'].lower() == 'director')
|
17 |
+
|
18 |
+
|
19 |
+
def get_year(x):
|
20 |
+
return str(x)[:4]
|
21 |
+
|
22 |
+
|
23 |
+
def normalize_data(x):
|
24 |
+
return (x - x.min()) / (x.max() - x.min())
|
25 |
+
|
26 |
+
|
27 |
+
raw1 = pd.read_csv('tmdb_5000_movies.csv')
|
28 |
+
raw2 = pd.read_csv('tmdb_5000_credits.csv')
|
29 |
+
raw2 = raw2.rename(columns={'movie_id': 'id'})
|
30 |
+
df = pd.merge(raw1, raw2, on='id')
|
31 |
+
df = df.drop([
|
32 |
+
'budget',
|
33 |
+
'homepage',
|
34 |
+
'overview',
|
35 |
+
'tagline',
|
36 |
+
'status',
|
37 |
+
'production_companies',
|
38 |
+
'production_countries',
|
39 |
+
'revenue',
|
40 |
+
'spoken_languages',
|
41 |
+
'title_x',
|
42 |
+
'title_y',
|
43 |
+
'vote_count'
|
44 |
+
], axis=1)
|
45 |
+
df['genres'] = df['genres'].map(get_name)
|
46 |
+
df['keywords'] = df['keywords'].map(get_name)
|
47 |
+
df['cast'] = df['cast'].map(get_name)
|
48 |
+
df['crew'] = df['crew'].map(get_director)
|
49 |
+
df['release_date'] = df['release_date'].map(get_year)
|
50 |
+
for i in range(len(df)):
|
51 |
+
df.loc[i, 'id'] = i
|
52 |
+
df = df.rename(columns={
|
53 |
+
'original_language': 'language',
|
54 |
+
'original_title': 'title',
|
55 |
+
'release_date': 'year',
|
56 |
+
'vote_average': 'rating',
|
57 |
+
'crew': 'director'
|
58 |
+
})
|
59 |
+
df = df[[
|
60 |
+
'id',
|
61 |
+
'title',
|
62 |
+
'genres',
|
63 |
+
'keywords',
|
64 |
+
'director',
|
65 |
+
'cast',
|
66 |
+
'year',
|
67 |
+
'language',
|
68 |
+
'runtime',
|
69 |
+
'popularity',
|
70 |
+
'rating'
|
71 |
+
]]
|
72 |
+
df['id'] = df['id'].apply(lambda x: str(x))
|
73 |
+
df['year'] = df['year'].apply(lambda x: str(x))
|
74 |
+
df['runtime'] = normalize_data(df['runtime'])
|
75 |
+
df['popularity'] = normalize_data(df['popularity'])
|
76 |
+
df['rating'] = normalize_data(df['rating'])
|
77 |
+
df_trim = df[['title', 'genres', 'keywords', 'director', 'cast']]
|
78 |
+
|
79 |
+
model = Model(df)
|
80 |
+
model.fit(save=True)
|
models/recommendation_model.py
CHANGED
@@ -1,13 +1,21 @@
|
|
1 |
import pickle
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
class Model:
|
4 |
def __init__(self, corpus):
|
5 |
self.data = corpus
|
6 |
self.similarity = [[-1 for i in range(len(corpus))] for j in range(len(corpus))]
|
7 |
-
|
|
|
8 |
with open(path, 'rb') as fp:
|
9 |
self.similarity = pickle.load(fp)
|
10 |
-
|
|
|
|
|
11 |
col = [
|
12 |
'title',
|
13 |
'genres',
|
@@ -20,8 +28,8 @@ class Model:
|
|
20 |
data2 = df.iloc[id2]
|
21 |
res = 0
|
22 |
for i in col:
|
23 |
-
vec1 =
|
24 |
-
vec2 =
|
25 |
intersect = set(vec1.keys()) & set(vec2.keys())
|
26 |
a = sum([vec1[x] * vec2[x] for x in intersect])
|
27 |
sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
|
@@ -32,8 +40,9 @@ class Model:
|
|
32 |
else:
|
33 |
res += float(a) / b
|
34 |
return res / len(col)
|
35 |
-
|
36 |
-
|
|
|
37 |
for i in range(len(self.data)):
|
38 |
if i == id:
|
39 |
self.similarity[id][i] = 0
|
@@ -41,20 +50,21 @@ class Model:
|
|
41 |
temp_sim = self.get_cosine(id, i)
|
42 |
self.similarity[id][i] = temp_sim
|
43 |
self.similarity[i][id] = temp_sim
|
44 |
-
if
|
45 |
with open('_similarity', 'wb') as fp:
|
46 |
pickle.dump(self.similarity, fp)
|
|
|
47 |
def forward(self, ids):
|
48 |
res = [1 for i in range(len(self.data))]
|
49 |
for id in ids:
|
50 |
res = [res[i] * self.similarity[id][i] for i in range(len(self.data))]
|
51 |
res = [[i, res[i]] for i in range(len(self.data))]
|
52 |
-
res.sort(key
|
53 |
res = [
|
54 |
[
|
55 |
-
self.data.loc[movie[0],
|
56 |
-
self.data.loc[movie[0],
|
57 |
movie[1]
|
58 |
] for movie in res
|
59 |
]
|
60 |
-
return res[:10]
|
|
|
1 |
import pickle
|
2 |
+
import re
|
3 |
+
import collections
|
4 |
+
import math
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
|
8 |
class Model:
|
9 |
def __init__(self, corpus):
|
10 |
self.data = corpus
|
11 |
self.similarity = [[-1 for i in range(len(corpus))] for j in range(len(corpus))]
|
12 |
+
|
13 |
+
def load(self, path='_similarity'):
|
14 |
with open(path, 'rb') as fp:
|
15 |
self.similarity = pickle.load(fp)
|
16 |
+
|
17 |
+
@staticmethod
|
18 |
+
def get_cosine(self, df, id1, id2):
|
19 |
col = [
|
20 |
'title',
|
21 |
'genres',
|
|
|
28 |
data2 = df.iloc[id2]
|
29 |
res = 0
|
30 |
for i in col:
|
31 |
+
vec1 = collections.Counter(re_words.findall(data1[i]))
|
32 |
+
vec2 = collections.Counter(re_words.findall(data2[i]))
|
33 |
intersect = set(vec1.keys()) & set(vec2.keys())
|
34 |
a = sum([vec1[x] * vec2[x] for x in intersect])
|
35 |
sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
|
|
|
40 |
else:
|
41 |
res += float(a) / b
|
42 |
return res / len(col)
|
43 |
+
|
44 |
+
def fit(self, save=False):
|
45 |
+
for id in tqdm(range(len(self.data)), desc='Progress'):
|
46 |
for i in range(len(self.data)):
|
47 |
if i == id:
|
48 |
self.similarity[id][i] = 0
|
|
|
50 |
temp_sim = self.get_cosine(id, i)
|
51 |
self.similarity[id][i] = temp_sim
|
52 |
self.similarity[i][id] = temp_sim
|
53 |
+
if save:
|
54 |
with open('_similarity', 'wb') as fp:
|
55 |
pickle.dump(self.similarity, fp)
|
56 |
+
|
57 |
def forward(self, ids):
|
58 |
res = [1 for i in range(len(self.data))]
|
59 |
for id in ids:
|
60 |
res = [res[i] * self.similarity[id][i] for i in range(len(self.data))]
|
61 |
res = [[i, res[i]] for i in range(len(self.data))]
|
62 |
+
res.sort(key=lambda x: x[1], reverse=True)
|
63 |
res = [
|
64 |
[
|
65 |
+
self.data.loc[movie[0], 'title'],
|
66 |
+
self.data.loc[movie[0], 'year'],
|
67 |
movie[1]
|
68 |
] for movie in res
|
69 |
]
|
70 |
+
return res[:10]
|
models/search_model.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import gensim
|
2 |
import pandas as pd
|
3 |
|
|
|
4 |
class MovieSearch:
|
5 |
def __init__(self, movie, corpus, stopwords):
|
6 |
self.movie = movie
|
@@ -10,12 +11,13 @@ class MovieSearch:
|
|
10 |
p_corpus = [[w for w in doc if w not in stopwords] for doc in p_corpus]
|
11 |
self.dictionary = gensim.corpora.Dictionary(p_corpus)
|
12 |
self.bow_corpus = [self.dictionary.doc2bow(doc) for doc in p_corpus]
|
13 |
-
self.model = gensim.models.LsiModel(self.bow_corpus, id2word
|
|
|
14 |
def search(self, query, len_results):
|
15 |
vec_bow = self.dictionary.doc2bow(query.lower().split())
|
16 |
vec_model = self.model[vec_bow]
|
17 |
index = gensim.similarities.MatrixSimilarity(self.model[self.bow_corpus])
|
18 |
sims = index[vec_model]
|
19 |
sims = [[self.movie[i], sims[i]] for i in range(len(sims))]
|
20 |
-
sims.sort(key
|
21 |
-
return
|
|
|
1 |
import gensim
|
2 |
import pandas as pd
|
3 |
|
4 |
+
|
5 |
class MovieSearch:
|
6 |
def __init__(self, movie, corpus, stopwords):
|
7 |
self.movie = movie
|
|
|
11 |
p_corpus = [[w for w in doc if w not in stopwords] for doc in p_corpus]
|
12 |
self.dictionary = gensim.corpora.Dictionary(p_corpus)
|
13 |
self.bow_corpus = [self.dictionary.doc2bow(doc) for doc in p_corpus]
|
14 |
+
self.model = gensim.models.LsiModel(self.bow_corpus, id2word=self.dictionary)
|
15 |
+
|
16 |
def search(self, query, len_results):
|
17 |
vec_bow = self.dictionary.doc2bow(query.lower().split())
|
18 |
vec_model = self.model[vec_bow]
|
19 |
index = gensim.similarities.MatrixSimilarity(self.model[self.bow_corpus])
|
20 |
sims = index[vec_model]
|
21 |
sims = [[self.movie[i], sims[i]] for i in range(len(sims))]
|
22 |
+
sims.sort(key=lambda x: x[1], reverse=True)
|
23 |
+
return sims[:len_results]
|
movie_data/_similarity
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:1a50ddfc9a9be319ac3f7725240cc3965e30bb29c568937860cf2f99c65e9726
|
3 |
-
size 207661759
|
|
|
|
|
|
|
|
movie_data/movie_data.csv
DELETED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
CHANGED
@@ -1,2 +1,4 @@
|
|
|
|
1 |
pandas
|
2 |
-
gensim
|
|
|
|
1 |
+
numpy
|
2 |
pandas
|
3 |
+
gensim
|
4 |
+
tqdm
|