Spaces:

IvT-DS
/

find_my_show

Sleeping

File size: 3,099 Bytes

import pandas as pd
import torch
import faiss
import numpy as np
from numpy import dot
from numpy.linalg import norm


def table_maker(
    df: pd.DataFrame,
    country: list = None,
    min_year: int = 1999,
    max_year: int = None,
    tagger=None,
    rating: bool = True,
):
    """Return the rows of ``df`` that pass every requested filter.

    Whenever the rating, country or tag filter is switched off, it is
    replaced by "``url`` is not missing", so rows without a URL are dropped
    in that case.

    Args:
        df: Source table. ``year`` is always read; ``rating``, ``url``,
            ``county`` and ``tags`` are read depending on the active filters.
        country: Values to keep in the ``county`` column. ``None`` or any
            empty collection disables the filter.
        min_year: Smallest ``year`` to keep (inclusive).
        max_year: Largest ``year`` to keep (inclusive). ``None`` means no
            upper bound.
        tagger: Tags a row must contain. Each ``tags`` cell is compared to
            this value with ``>=`` (presumably the cells hold sets, making
            this a superset test -- confirm against the data). A list or
            tuple is converted to a set first. ``None`` or an empty
            collection disables the filter.
        rating: When true, keep only rows whose ``rating`` is not missing.

    Returns:
        A new DataFrame holding the matching rows; ``df`` is not modified.
    """

    def _is_unset(value):
        # len() rather than truthiness so that array-like arguments work too.
        return value is None or len(value) == 0

    # Rating filter.
    mask = df["rating"].notna() if rating else df["url"].notna()

    # Country filter.
    # NOTE(review): the column is spelled "county" -- confirm this matches
    # the dataset schema and is not a typo for "country".
    if _is_unset(country):
        mask = mask & df["url"].notna()
    else:
        mask = mask & df["county"].isin(country)

    # Tag filter.
    if _is_unset(tagger):
        mask = mask & df["url"].notna()
    else:
        # Series.ge treats lists/tuples as element-wise operands and demands
        # equal length, so turn them into a single set to compare against.
        required = set(tagger) if isinstance(tagger, (list, tuple)) else tagger
        mask = mask & df["tags"].ge(required)

    # Year range filter; the upper bound is optional.
    mask = mask & (df["year"] >= min_year)
    if max_year is not None:
        mask = mask & (df["year"] <= max_year)

    # Filter first, then copy only the surviving rows so the caller gets an
    # independent frame that can be assigned to freely.
    return df.loc[mask].copy()


class RecSys:
    """Rank the rows of a DataFrame by cosine similarity to an encoded query.

    The query is encoded once at construction time; calling the instance
    scores every row and returns the table ordered best-first.
    """

    def __init__(self, df: pd.DataFrame, input_, model):
        """Keep the inputs and compute the query embedding.

        Args:
            df: Table with ``vec`` and ``vec2`` columns; each cell is a
                vector that can be dotted with the query embedding.
            input_: Query passed unchanged to ``model.encode``.
            model: Any object exposing ``encode(input_)``.
        """
        self.df = df
        self.input_ = input_
        self.model = model
        # Encoding is inference only, so gradient tracking is switched off.
        with torch.no_grad():
            self.emb = model.encode(self.input_)

    def __call__(self):
        """Score every row and return the table sorted by descending score.

        The score is ``0.8 * cos(vec, emb) + 0.2 * cos(vec2, emb)``. It is
        written to ``self.df["score"]`` as a side effect.

        Returns:
            ``self.df`` sorted by ``score``, highest first.
        """

        def cosine(vector):
            return dot(vector, self.emb) / (norm(vector) * norm(self.emb))

        primary = self.df["vec"].map(cosine)
        secondary = self.df["vec2"].map(cosine)

        self.df["score"] = primary * 0.8 + secondary * 0.2

        return self.df.sort_values(by="score", ascending=False)


class FAISS_inference:
    """Nearest-neighbour lookup over the ``vec`` column of a DataFrame.

    All row vectors are loaded into a flat FAISS index at construction time;
    calling the instance searches that index for the query embedding.
    """

    def __init__(self, df, emb, k=5):
        """Build the index from ``df["vec"]`` and remember the query.

        Args:
            df: Table with a ``vec`` column holding one vector per row; all
                vectors must have the same length so they can be stacked.
            emb: Query embedding; it is reshaped to a single row.
            k: Number of neighbours to request from the index.

        Raises:
            ValueError: If ``df`` has no rows.
        """
        if len(df) == 0:
            raise ValueError("df must contain at least one row to build the index")

        self.df = df
        self.k = k

        # FAISS operates on C-contiguous float32 matrices, so coerce both the
        # query and the stacked row vectors up front.
        self.emb = np.ascontiguousarray(emb, dtype=np.float32).reshape(1, -1)
        self.vex = np.ascontiguousarray(
            np.vstack(df["vec"].to_numpy()), dtype=np.float32
        )
        self.d = self.vex.shape[1]

        # NOTE(review): IndexFlat is built with its default metric (L2 per
        # the FAISS docs), which would make "score" a distance where smaller
        # means closer -- confirm this is what consumers expect.
        self.index = faiss.IndexFlat(self.d)
        self.index.add(self.vex)

    def __call__(self):
        """Search the index and return the matching rows with their scores.

        Returns:
            A copy of the matched rows of ``self.df`` with an added ``score``
            column holding the value the index reported for each row. Rows
            are ordered by their position in ``self.df``, not by score.
        """
        distances, positions = self.index.search(self.emb, self.k)
        distances, positions = distances[0], positions[0]

        # FAISS pads the result with -1 when fewer than k neighbours exist;
        # passing -1 to iloc would silently select the last row.
        found = positions >= 0
        distances, positions = distances[found], positions[found]

        # np.unique sorts the positions, so fetch each score through the
        # index of its first occurrence to keep score and row aligned.
        unique_positions, first_hit = np.unique(positions, return_index=True)

        # Explicit copy so adding a column never touches a slice of self.df.
        faiss_table = self.df.iloc[unique_positions].copy()
        faiss_table["score"] = distances[first_hit]
        return faiss_table