# movie_recommendation/models/recommendation_model.py
# shoukaku's picture
# Add all libraries that are needed to requirements.txt, because I forgot how to do it automatically.
# f7e46ad
import pickle
import re
import collections
import math
import pandas as pd
from tqdm import tqdm
class Model:
    """Content-based recommender backed by a pairwise similarity matrix.

    ``self.data`` is the corpus passed to the constructor; the code indexes it
    positionally and reads the columns 'title', 'genres', 'keywords',
    'director', 'cast' and 'year' (presumably a pandas DataFrame -- it is
    accessed through ``.iloc`` and column lookup).

    ``self.similarity`` is a square list-of-lists where entry ``[i][j]`` holds
    the similarity between rows ``i`` and ``j``; ``-1`` marks a pair that has
    not been computed yet.
    """

    # Text columns compared by get_cosine; each contributes equally to the mean.
    _TEXT_COLUMNS = ('title', 'genres', 'keywords', 'director', 'cast')
    # Compiled once instead of on every get_cosine call.
    _WORD_RE = re.compile(r'\w+')

    def __init__(self, corpus):
        """Store the corpus and allocate an all ``-1`` similarity matrix."""
        self.data = corpus
        size = len(corpus)
        self.similarity = [[-1] * size for _ in range(size)]

    def load(self, path='_similarity'):
        """Replace the similarity matrix with the object pickled at ``path``.

        Raises whatever ``open`` / ``pickle.load`` raise (e.g. a missing file).
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only load similarity files that this application wrote itself.
        with open(path, 'rb') as fp:
            self.similarity = pickle.load(fp)

    @staticmethod
    def _word_counts(value):
        """Return a Counter of the ``\\w+`` tokens in one cell value.

        Missing cells (None / NaN / NA) count as empty text so they do not
        crash the regex; other non-string values are converted with ``str``.
        """
        if not isinstance(value, str):
            if pd.api.types.is_scalar(value) and pd.isna(value):
                value = ''
            else:
                value = str(value)
        return collections.Counter(Model._WORD_RE.findall(value))

    @staticmethod
    def get_cosine(self, df, id1, id2):
        """Return the mean per-column cosine similarity of two rows of ``df``.

        This is a static method whose first parameter happens to be named
        ``self``; it is unused and kept only so existing call sites keep
        working. Pass any placeholder, e.g.
        ``Model.get_cosine(None, df, 0, 1)``.

        Args:
            self: Ignored.
            df: Table providing ``.iloc`` row access and the text columns.
            id1: Positional index of the first row.
            id2: Positional index of the second row.

        Returns:
            The average, over the five text columns, of the cosine similarity
            between the word-count vectors of the two cells. A column where
            either cell has no words contributes 0.
        """
        first = df.iloc[id1]
        second = df.iloc[id2]
        total = 0
        for column in Model._TEXT_COLUMNS:
            vec1 = Model._word_counts(first[column])
            vec2 = Model._word_counts(second[column])
            dot = sum(count * vec2[word]
                      for word, count in vec1.items() if word in vec2)
            sum1 = sum(count ** 2 for count in vec1.values())
            sum2 = sum(count ** 2 for count in vec2.values())
            norm = math.sqrt(sum1) * math.sqrt(sum2)
            # A zero norm means at least one cell had no tokens; skip it to
            # avoid dividing by zero (the column then contributes 0).
            if norm != 0:
                total += float(dot) / norm
        return total / len(Model._TEXT_COLUMNS)

    def fit(self, save=False):
        """Fill in every not-yet-computed entry of the similarity matrix.

        The diagonal is set to 0. Each computed pair is mirrored into the
        symmetric cell so it is only computed once.

        Args:
            save: When true, pickle the finished matrix to the file
                '_similarity' in the current working directory.
        """
        size = len(self.data)
        for row in tqdm(range(size), desc='Progress'):
            for col in range(size):
                if col == row:
                    self.similarity[row][col] = 0
                elif self.similarity[row][col] == -1:
                    # get_cosine is a staticmethod that still declares a
                    # leading `self` parameter, so the instance and the data
                    # table have to be passed explicitly.
                    score = self.get_cosine(self, self.data, row, col)
                    self.similarity[row][col] = score
                    self.similarity[col][row] = score
        if save:
            with open('_similarity', 'wb') as fp:
                pickle.dump(self.similarity, fp)

    def forward(self, ids):
        """Return up to ten rows ranked by combined similarity to ``ids``.

        The score of a row is the product of its similarity to every id in
        ``ids``. Because the diagonal set by ``fit`` is 0, the requested rows
        themselves score 0.

        Args:
            ids: Iterable of positional row indices into the corpus.

        Returns:
            A list of at most ten ``[title, year, score]`` lists, highest
            score first; ties keep corpus order.
        """
        scores = [1] * len(self.data)
        for movie_id in ids:
            row = self.similarity[movie_id]
            scores = [score * row[pos] for pos, score in enumerate(scores)]
        # Rank first, then look up metadata only for the rows actually returned.
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1],
                        reverse=True)[:10]
        # The similarity matrix is positional, so use positional lookups here
        # too; label-based .loc breaks when the index is not 0..n-1.
        titles = self.data['title']
        years = self.data['year']
        return [[titles.iloc[pos], years.iloc[pos], score]
                for pos, score in ranked]