import collections
import math
import pickle
import re

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; fall back to a pass-through
    # iterator so the model still works when tqdm is not installed.
    def tqdm(iterable, **_kwargs):
        return iterable


# Compiled once at import time instead of on every get_cosine call.
_WORD_RE = re.compile(r'\w+')

# Text columns that are compared field by field.
_TEXT_COLUMNS = ('title', 'genres', 'keywords', 'director', 'cast')


def _word_counts(value):
    """Return a Counter of the ``\\w+`` tokens found in *value*.

    Missing values (``None`` or a float NaN) are treated as empty text so a
    single blank field does not abort the whole computation.
    """
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return collections.Counter()
    return collections.Counter(_WORD_RE.findall(value))


class Model:
    """Content-based recommender built on pairwise cosine similarity.

    ``data`` is the corpus passed to the constructor; it is accessed with
    ``len()``, ``.iloc`` and column lookup, i.e. like a pandas DataFrame
    with the columns 'title', 'genres', 'keywords', 'director', 'cast' and
    'year'. ``similarity`` is a square list-of-lists indexed by row
    position, where ``-1`` marks a pair that has not been computed yet.
    """

    def __init__(self, corpus):
        """Store *corpus* and allocate an all ``-1`` similarity matrix."""
        self.data = corpus
        size = len(corpus)
        self.similarity = [[-1] * size for _ in range(size)]

    def load(self, path='_similarity'):
        """Replace ``self.similarity`` with the matrix pickled at *path*."""
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files that this program (or another trusted source) produced.
        with open(path, 'rb') as fp:
            self.similarity = pickle.load(fp)

    @staticmethod
    def get_cosine(self, df, id1, id2):
        """Return the mean per-column cosine similarity of two rows of *df*.

        For every text column the rows are tokenised into word counts and
        compared with cosine similarity; a column where either row has no
        tokens contributes 0. The result is the average over all columns.

        Args:
            self: Unused placeholder. This is a static method, so callers
                must pass a value explicitly; it is kept only so existing
                four-argument call sites keep working.
            df: Table supporting ``.iloc[position]`` row access.
            id1: Row position of the first item.
            id2: Row position of the second item.
        """
        row1 = df.iloc[id1]
        row2 = df.iloc[id2]

        total = 0
        for column in _TEXT_COLUMNS:
            vec1 = _word_counts(row1[column])
            vec2 = _word_counts(row2[column])

            shared = vec1.keys() & vec2.keys()
            dot = sum(vec1[word] * vec2[word] for word in shared)
            norm1 = math.sqrt(sum(count ** 2 for count in vec1.values()))
            norm2 = math.sqrt(sum(count ** 2 for count in vec2.values()))
            denominator = norm1 * norm2

            # An empty vector has zero norm; count that column as 0 rather
            # than dividing by zero.
            if denominator != 0:
                total += float(dot) / denominator

        return total / len(_TEXT_COLUMNS)

    def fit(self, save=False, path='_similarity'):
        """Fill in every similarity entry that is still ``-1``.

        The diagonal is set to 0 so an item never recommends itself. Each
        computed value is mirrored across the diagonal because the measure
        is symmetric.

        Args:
            save: When true, pickle the finished matrix to *path*.
            path: Destination file used when *save* is true.
        """
        size = len(self.data)
        for row in tqdm(range(size), desc='Progress'):
            for col in range(size):
                if col == row:
                    self.similarity[row][col] = 0
                elif self.similarity[row][col] == -1:
                    # get_cosine is static: the placeholder first argument
                    # and the corpus have to be passed explicitly.
                    score = self.get_cosine(self, self.data, row, col)
                    self.similarity[row][col] = score
                    self.similarity[col][row] = score

        if save:
            with open(path, 'wb') as fp:
                pickle.dump(self.similarity, fp)

    def forward(self, ids):
        """Return up to ten ``[title, year, score]`` recommendations.

        The score of a candidate is the product of its similarity to every
        row position in *ids*, so the queried items themselves score 0
        (their diagonal entry). Results are sorted by descending score.
        """
        size = len(self.data)

        scores = [1] * size
        for item in ids:
            scores = [
                score * sim
                for score, sim in zip(scores, self.similarity[item])
            ]

        ranked = sorted(
            enumerate(scores), key=lambda pair: pair[1], reverse=True
        )

        # The matrix is indexed by row position, so look rows up by
        # position too; label-based lookup picks the wrong rows when the
        # index is not 0..n-1.
        titles = self.data['title']
        years = self.data['year']
        return [
            [titles.iloc[position], years.iloc[position], score]
            for position, score in ranked[:10]
        ]