File size: 2,567 Bytes
41b0868
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd
import torch
import faiss
import numpy as np
from numpy import dot
from numpy.linalg import norm


def table_maker(
    df: pd.DataFrame,
    country: list = None,
    min_year: int = 1999,
    max_year: int = None,
    tagger=None,
    rating: bool = True,
):
    """Return the rows of *df* that pass the rating, country, tag and year filters.

    The input frame is never modified; the result is an independent copy of
    the matching rows.

    Args:
        df: Source table. The columns read here are ``rating``, ``url``,
            ``county``, ``tags`` and ``year``.
        country: Values to keep in the ``county`` column. ``None`` or an empty
            collection disables the country filter.
        min_year: Inclusive lower bound on ``year``.
        max_year: Inclusive upper bound on ``year``; ``None`` means no upper
            bound.
        tagger: Value compared against every ``tags`` cell with ``>=``.
            ``None`` or an empty collection disables the tag filter.
        rating: When true, keep only rows whose ``rating`` is not null.

    Returns:
        A new DataFrame containing the rows that satisfy every active filter.

    Note:
        Every *disabled* filter falls back to "``url`` is not null", so rows
        without a URL are dropped unless all three of the rating, country and
        tag filters are active.
    """
    # Rating filter (falls back to the url-not-null mask when disabled).
    if rating:
        rating_mask = df["rating"].notna()
    else:
        rating_mask = df["url"].notna()

    # Country filter. ``len`` is used instead of ``== []`` so that None,
    # tuples and other empty collections also mean "no filter".
    # NOTE(review): the column is spelled "county" here — confirm this matches
    # the dataset and is not a typo for "country".
    if country is None or len(country) == 0:
        country_mask = df["url"].notna()
    else:
        country_mask = df["county"].isin(country)

    # Tag filter.
    # NOTE(review): presumably each ``tags`` cell is a set, making ``>=`` a
    # superset test ("row has all requested tags") — confirm against the data.
    if tagger is None or len(tagger) == 0:
        tag_mask = df["url"].notna()
    else:
        tag_mask = df["tags"].ge(tagger)

    # Year filter: lower bound always applies, upper bound only when given.
    year_mask = df["year"] >= min_year
    if max_year is not None:
        year_mask &= df["year"] <= max_year

    selected = rating_mask & country_mask & tag_mask & year_mask

    # Copy only the surviving rows so callers can freely modify the result.
    return df.loc[selected].copy()


class RecSys:
    """Rank the rows of a DataFrame by cosine similarity to an encoded query.

    The query ``input_`` is encoded once at construction time with
    ``model.encode``; calling the instance scores every row against that
    embedding.
    """

    def __init__(self, df: pd.DataFrame, input_, model):
        """Store the table and encode the query.

        Args:
            df: Table to rank. ``__call__`` reads its ``vec`` and ``vec2``
                columns and writes a ``score`` column onto it.
            input_: Query passed unchanged to ``model.encode``.
            model: Any object exposing ``encode(input_)``.
                NOTE(review): the result is assumed to be a 1-D vector with
                the same length as the ``vec``/``vec2`` entries — confirm.
        """
        self.df = df
        self.input_ = input_
        self.model = model
        # Encoding is inference only, so gradient tracking is switched off.
        with torch.no_grad():
            self.emb = model.encode(self.input_)

    def __call__(self):
        """Score every row and return the table sorted by descending score.

        The score is ``0.8 * cos(vec, emb) + 0.2 * cos(vec2, emb)``.

        Side effect: the ``score`` column is added to (or overwritten on) the
        DataFrame that was passed to the constructor.

        Returns:
            A new DataFrame: ``self.df`` ordered by ``score``, highest first.
        """
        query = self.emb
        # The query norm is the same for every row; compute it once.
        query_norm = norm(query)

        def cosine(vector):
            return dot(vector, query) / (norm(vector) * query_norm)

        # Map straight over the columns; no need to copy the whole frame just
        # to hold two temporary similarity columns.
        primary = self.df["vec"].map(cosine)
        secondary = self.df["vec2"].map(cosine)
        self.df["score"] = primary * 0.8 + secondary * 0.2

        return self.df.sort_values("score", ascending=False)


class FAISS_inference:
    """Nearest-neighbour lookup over the ``vec`` column using a flat faiss index."""

    def __init__(self, df, emb, k=5):
        """Build the index from ``df["vec"]``.

        Args:
            df: Table whose ``vec`` column holds one embedding per row.
            emb: Query embedding; it is reshaped to a single row ``(1, -1)``.
            k: Number of neighbours requested from the index on each call.

        Raises:
            ValueError: If ``df`` has no rows or the embeddings do not all
                have the same length (raised by ``np.stack``).
        """
        self.df = df
        self.emb = emb.reshape(1, -1)
        self.k = k

        # (n_rows, d) matrix; d is taken from the data instead of being
        # hard-coded, so embeddings of any width work.
        self.vex = self._stack_vectors(df["vec"].to_numpy())
        self.d = self.vex.shape[1]

        # NOTE(review): IndexFlat is believed to default to the L2 metric, in
        # which case "score" is a distance (smaller = closer). IndexFlatIP /
        # IndexFlatL2 were the alternatives previously tried here — confirm
        # which metric is intended.
        self.index = faiss.IndexFlat(self.d)
        self.index.add(self.vex)

    @staticmethod
    def _stack_vectors(vectors):
        """Stack an iterable of equal-length embeddings into an (n, d) array."""
        return np.stack([np.asarray(vector).ravel() for vector in vectors])

    def __call__(self):
        """Search the index with the stored query embedding.

        Returns:
            A copy of the matching rows of ``self.df`` in the order returned
            by the index, with the value reported by the index stored in a
            ``score`` column. ``self.df`` itself is not modified.
        """
        distances, positions = self.index.search(self.emb, self.k)
        distances, positions = distances[0], positions[0]

        # faiss pads the result with label -1 when fewer than k neighbours
        # exist; iloc[-1] would silently return the last row, so drop them.
        found = positions >= 0

        # Copy before assigning so the write never targets a view of self.df.
        faiss_table = self.df.iloc[positions[found]].copy()
        faiss_table.loc[:, "score"] = distances[found]
        return faiss_table