# Page-header residue captured together with the source: "Spaces: Runtime error".
import collections
import math
import pickle
import re

import pandas as pd
from tqdm import tqdm
class Model:
    """Content-based recommender backed by a pairwise similarity matrix.

    ``corpus`` is used as a pandas DataFrame: rows are addressed by
    position and the columns ``title``, ``genres``, ``keywords``,
    ``director``, ``cast`` and ``year`` are read.  ``similarity[i][j]``
    holds the score between rows ``i`` and ``j``; ``-1`` marks a pair that
    has not been computed yet.
    """

    # Text columns compared by get_cosine; each one carries equal weight.
    _TEXT_COLUMNS = ('title', 'genres', 'keywords', 'director', 'cast')
    # Compiled once instead of on every get_cosine call.
    _WORD_RE = re.compile(r'\w+')

    def __init__(self, corpus):
        """Store ``corpus`` and allocate an n x n matrix filled with -1."""
        self.data = corpus
        size = len(corpus)
        self.similarity = [[-1] * size for _ in range(size)]

    def load(self, path='_similarity'):
        """Replace the similarity matrix with the one pickled at ``path``."""
        # SECURITY: unpickling executes arbitrary code embedded in the file;
        # only load similarity files that this application wrote itself.
        with open(path, 'rb') as fp:
            self.similarity = pickle.load(fp)

    @classmethod
    def _term_counts(cls, value):
        """Return word frequencies of one cell; missing cells count as empty."""
        if not isinstance(value, str):
            # None / NaN / NA cells would make re.findall raise TypeError.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return collections.Counter()
            value = str(value)
        return collections.Counter(cls._WORD_RE.findall(value))

    def get_cosine(self, df, id1, id2):
        """Return the mean per-column cosine similarity of two rows.

        Args:
            df: DataFrame containing the text columns listed in
                ``_TEXT_COLUMNS``.
            id1: Positional index of the first row.
            id2: Positional index of the second row.

        Returns:
            The average, over the text columns, of the cosine similarity
            between the word-count vectors of the two cells.  A column in
            which either cell has no words contributes 0.
        """
        first = df.iloc[id1]
        second = df.iloc[id2]
        total = 0
        for column in self._TEXT_COLUMNS:
            vec1 = self._term_counts(first[column])
            vec2 = self._term_counts(second[column])
            dot = sum(
                count * vec2[term]
                for term, count in vec1.items()
                if term in vec2
            )
            norm = (
                math.sqrt(sum(count ** 2 for count in vec1.values()))
                * math.sqrt(sum(count ** 2 for count in vec2.values()))
            )
            # A zero norm means at least one side is empty: similarity is 0.
            if norm != 0:
                total += float(dot) / norm
        return total / len(self._TEXT_COLUMNS)

    def fit(self, save=False):
        """Fill in every similarity entry that is still marked -1.

        Args:
            save: When true, pickle the finished matrix to ``_similarity``
                in the current working directory.
        """
        size = len(self.data)
        for row in tqdm(range(size), desc='Progress'):
            for col in range(size):
                if col == row:
                    # Self-similarity is forced to 0 so that a movie never
                    # ends up recommending itself in forward().
                    self.similarity[row][col] = 0
                elif self.similarity[row][col] == -1:
                    score = self.get_cosine(self.data, row, col)
                    # The measure is symmetric, so fill both halves at once.
                    self.similarity[row][col] = score
                    self.similarity[col][row] = score
        if save:
            with open('_similarity', 'wb') as fp:
                pickle.dump(self.similarity, fp)

    def forward(self, ids, top_k=10):
        """Rank rows by the product of their similarities to ``ids``.

        Args:
            ids: Iterable of positional row indices to base the ranking on.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` ``[title, year, score]`` lists, best score first.
            Rows with equal scores keep their original relative order.
        """
        size = len(self.data)
        scores = [1] * size
        for picked in ids:
            sim_row = self.similarity[picked]
            scores = [score * sim_row[pos] for pos, score in enumerate(scores)]
        # Slice before the lookups so only the returned rows touch the frame.
        ranked = sorted(
            enumerate(scores), key=lambda pair: pair[1], reverse=True
        )[:top_k]
        # The matrix is indexed by position, so the frame must be as well.
        titles = self.data['title']
        years = self.data['year']
        return [
            [titles.iloc[pos], years.iloc[pos], score]
            for pos, score in ranked
        ]