File size: 2,326 Bytes
6c28f88
b75eb47
 
 
f7e46ad
b75eb47
 
6c28f88
 
 
 
 
b75eb47
 
6c28f88
 
b75eb47
 
 
6c28f88
 
 
 
 
 
 
 
 
 
 
 
b75eb47
 
6c28f88
 
 
 
 
 
 
 
 
 
b75eb47
 
 
6c28f88
 
 
 
 
 
 
b75eb47
6c28f88
 
b75eb47
f0663e6
 
 
 
 
b75eb47
6c28f88
 
b75eb47
 
6c28f88
 
 
b75eb47
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pickle
import re
import collections
import math
import pandas as pd
from tqdm import tqdm


class Model:
    """Content-based recommender backed by a pairwise similarity matrix.

    ``similarity[i][j]`` holds the score between the rows at positions ``i``
    and ``j`` of the corpus; ``-1`` marks a pair that has not been computed.
    """

    # Text columns compared by get_cosine; each one contributes equally.
    _TEXT_COLUMNS = ('title', 'genres', 'keywords', 'director', 'cast')
    # Compiled once instead of on every get_cosine call.
    _WORD_RE = re.compile(r'\w+')

    def __init__(self, corpus):
        """Store *corpus* and allocate an n x n matrix filled with ``-1``.

        Args:
            corpus: Table of items supporting ``len()``; the other methods
                index it with ``.iloc`` and column names, i.e. a pandas
                DataFrame.
        """
        self.data = corpus
        size = len(corpus)
        # A fresh inner list per row so that rows never alias each other.
        self.similarity = [[-1] * size for _ in range(size)]

    def load(self, path='_similarity'):
        """Replace ``self.similarity`` with the object pickled at *path*.

        NOTE: unpickling can execute arbitrary code; only load files that
        were produced by a trusted ``fit(save=True)`` run.
        """
        with open(path, 'rb') as fp:
            self.similarity = pickle.load(fp)

    @staticmethod
    def _word_counts(value):
        """Return a Counter of the word tokens in *value*.

        Missing cells (``None`` or a float NaN) are treated as empty text
        instead of raising ``TypeError`` inside the regex.
        """
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return collections.Counter()
        return collections.Counter(Model._WORD_RE.findall(value))

    @staticmethod
    def get_cosine(self, df, id1, id2):
        """Return the mean per-column cosine similarity of two rows of *df*.

        This is a static method whose first parameter happens to be named
        ``self``; it is unused and kept only so existing four-argument
        callers keep working.

        Args:
            self: Ignored.
            df: DataFrame containing the columns title, genres, keywords,
                director and cast.
            id1: Position (``iloc``) of the first row.
            id2: Position (``iloc``) of the second row.

        Returns:
            The average over the five columns of the cosine similarity of
            the rows' word-count vectors. A column in which either row has
            no words contributes 0.
        """
        row1 = df.iloc[id1]
        row2 = df.iloc[id2]
        total = 0
        for column in Model._TEXT_COLUMNS:
            vec1 = Model._word_counts(row1[column])
            vec2 = Model._word_counts(row2[column])
            dot = sum(vec1[word] * vec2[word] for word in vec1.keys() & vec2.keys())
            norm = (math.sqrt(sum(count ** 2 for count in vec1.values()))
                    * math.sqrt(sum(count ** 2 for count in vec2.values())))
            # A zero norm means one side is empty; skip to avoid dividing by 0.
            if norm != 0:
                total += float(dot) / norm
        return total / len(Model._TEXT_COLUMNS)

    def fit(self, save=False, path='_similarity'):
        """Fill in every similarity entry that is still ``-1``.

        The diagonal is set to 0, and each computed score is mirrored to the
        symmetric cell so every pair is evaluated once.

        Args:
            save: When true, pickle the finished matrix to *path*.
            path: Destination file used when *save* is true.
        """
        size = len(self.data)
        for row in tqdm(range(size), desc='Progress'):
            for col in range(size):
                if col == row:
                    self.similarity[row][col] = 0
                elif self.similarity[row][col] == -1:
                    # get_cosine is static and takes (self, df, id1, id2),
                    # so the instance and the corpus are passed explicitly.
                    score = self.get_cosine(self, self.data, row, col)
                    self.similarity[row][col] = score
                    self.similarity[col][row] = score
        if save:
            with open(path, 'wb') as fp:
                pickle.dump(self.similarity, fp)

    def forward(self, ids):
        """Return up to ten ``[title, year, score]`` lists, best score first.

        Args:
            ids: Iterable of row positions of the reference items.

        Returns:
            A list of at most 10 ``[title, year, score]`` lists, where score
            is the product of the item's similarity to every id in *ids*.
            Since ``fit`` stores 0 on the diagonal, the reference items
            themselves score 0 whenever *ids* is non-empty.
        """
        size = len(self.data)
        scores = [1] * size
        for ref in ids:
            ref_row = self.similarity[ref]
            scores = [score * ref_row[pos] for pos, score in enumerate(scores)]
        # sorted() is stable, so ties keep their original positional order.
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
        # The similarity matrix is positional, so look rows up by position
        # (iloc), not by index label; the two differ on a non-default index.
        titles = self.data['title']
        years = self.data['year']
        return [
            [titles.iloc[pos], years.iloc[pos], score]
            for pos, score in ranked[:10]
        ]