Spaces:
Runtime error
Runtime error
File size: 2,326 Bytes
6c28f88 b75eb47 f7e46ad b75eb47 6c28f88 b75eb47 6c28f88 b75eb47 6c28f88 b75eb47 6c28f88 b75eb47 6c28f88 b75eb47 6c28f88 b75eb47 f0663e6 b75eb47 6c28f88 b75eb47 6c28f88 b75eb47 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import pickle
import re
import collections
import math
import pandas as pd
from tqdm import tqdm
class Model:
    """Content-based recommender driven by a pairwise cosine-similarity matrix.

    ``corpus`` is a pandas DataFrame holding the text columns listed in
    ``_TEXT_COLUMNS`` plus a ``year`` column (read by :meth:`forward`).
    ``similarity[a][b]`` stores the similarity between the rows at
    *positions* ``a`` and ``b``; ``-1`` marks a pair that has not been
    computed yet.
    """

    # Columns compared by get_cosine; each one contributes equally to the score.
    _TEXT_COLUMNS = ('title', 'genres', 'keywords', 'director', 'cast')
    # Compiled once instead of on every pair comparison.
    _WORD_RE = re.compile(r'\w+')

    def __init__(self, corpus):
        """Store ``corpus`` and allocate an all ``-1`` (uncomputed) matrix."""
        self.data = corpus
        size = len(corpus)
        self.similarity = [[-1] * size for _ in range(size)]

    def load(self, path='_similarity'):
        """Replace ``self.similarity`` with the object pickled at ``path``."""
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserialising. Only load files this application wrote itself.
        with open(path, 'rb') as fp:
            self.similarity = pickle.load(fp)

    @staticmethod
    def _token_counts(value):
        """Return a Counter of the ``\\w+`` tokens in one cell.

        Missing cells (``None`` / ``NaN``) count as empty text so that they
        score 0 instead of raising ``TypeError`` inside ``re``.
        """
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return collections.Counter()
        return collections.Counter(Model._WORD_RE.findall(str(value)))

    @staticmethod
    def get_cosine(self, df, id1, id2):
        """Return the mean per-column cosine similarity of two rows of ``df``.

        Args:
            self: Unused. The method is static but has always taken this
                placeholder as its first positional argument; it is kept so
                existing four-argument calls continue to work.
            df: DataFrame containing every column in ``_TEXT_COLUMNS``.
            id1: Position (``iloc``) of the first row.
            id2: Position (``iloc``) of the second row.

        Returns:
            A float: the per-column cosine similarities of the token-count
            vectors, averaged over the columns. A column where either side
            has no tokens contributes 0.
        """
        first = df.iloc[id1]
        second = df.iloc[id2]
        total = 0.0
        for column in Model._TEXT_COLUMNS:
            vec1 = Model._token_counts(first[column])
            vec2 = Model._token_counts(second[column])
            dot = sum(vec1[tok] * vec2[tok] for tok in vec1.keys() & vec2.keys())
            norm1 = math.sqrt(sum(count ** 2 for count in vec1.values()))
            norm2 = math.sqrt(sum(count ** 2 for count in vec2.values()))
            denominator = norm1 * norm2
            # An empty vector has zero norm; skip it rather than divide by 0.
            if denominator:
                total += dot / denominator
        return total / len(Model._TEXT_COLUMNS)

    def fit(self, save=False, path='_similarity'):
        """Fill in every uncomputed entry of the similarity matrix.

        Args:
            save: When true, pickle the finished matrix to ``path``.
            path: Destination file used when ``save`` is true. Defaults to
                the same file name that :meth:`load` reads by default.
        """
        size = len(self.data)
        for row in tqdm(range(size), desc='Progress'):
            for col in range(size):
                if col == row:
                    # Self-similarity is stored as 0, which gives a query
                    # movie a product score of 0 in forward().
                    self.similarity[row][col] = 0
                elif self.similarity[row][col] == -1:
                    # get_cosine is static: the placeholder and the frame
                    # must be passed explicitly.
                    score = self.get_cosine(self, self.data, row, col)
                    # The measure is symmetric, so mirror it and let the
                    # -1 check skip the transposed pair later.
                    self.similarity[row][col] = score
                    self.similarity[col][row] = score
        if save:
            with open(path, 'wb') as fp:
                pickle.dump(self.similarity, fp)

    def forward(self, ids, top_k=10):
        """Rank rows by the product of their similarity to every row in ``ids``.

        Args:
            ids: Iterable of row positions used as the query.
            top_k: Maximum number of results to return (default 10).

        Returns:
            A list of ``[title, year, score]`` lists, highest score first.
        """
        scores = [1] * len(self.data)
        for chosen in ids:
            scores = [
                score * sim
                for score, sim in zip(scores, self.similarity[chosen])
            ]
        # sorted() is stable, so ties keep their original row order.
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
        # The matrix is indexed by position, so the frame must be read by
        # position too; label lookups break on a non-default index.
        titles = self.data['title']
        years = self.data['year']
        return [
            [titles.iloc[pos], years.iloc[pos], score]
            for pos, score in ranked[:top_k]
        ]
|