Spaces:
Sleeping
Sleeping
import pandas as pd | |
import torch | |
import faiss | |
import numpy as np | |
from numpy import dot | |
from numpy.linalg import norm | |
def table_maker( | |
df: pd.DataFrame, | |
country: list = [], | |
min_year: int = 1999, | |
max_year: int = None, | |
tagger=set(), | |
rating: bool = True, | |
): | |
x = df.copy() | |
# фильтр по рейтингк | |
if rating: | |
rat_con = ~(x["rating"].isna()) | |
else: | |
rat_con = ~(x["url"].isna()) | |
# фильтр по стране | |
if country == []: | |
con_con = ~(x["url"].isna()) | |
else: | |
con_con = x["county"].isin(country) | |
# фильтр по тегам | |
if tagger == set(): | |
tagger_con = ~(x["url"].isna()) | |
else: | |
tagger_con = x["tags"].ge(tagger) | |
# Условие для фильтрации по минимальному году | |
year_cond = x["year"] >= min_year | |
# Добавляем условие для фильтрации по максимальному году, если оно задано | |
if max_year is not None: | |
year_cond &= x["year"] <= max_year | |
condi = rat_con & con_con & tagger_con & year_cond | |
return x.loc[condi] | |
class RecSys: | |
def __init__(self, df: pd.DataFrame, input_, model): | |
self.df = df | |
self.input_ = input_ | |
self.model = model | |
with torch.no_grad(): | |
self.emb = model.encode(self.input_) | |
def __call__(self): | |
def compute(a): | |
return dot(a, self.emb) / (norm(a) * norm(self.emb)) | |
res = self.df.copy() | |
res["compute"] = res["vec"].map(compute) | |
res["compute2"] = res["vec2"].map(compute) | |
self.df["score"] = res["compute"] * 0.8 + res["compute2"] * 0.2 | |
return self.df.sort_values("score", ascending=False) | |
class FAISS_inference: | |
def __init__(self, df, emb, k=5): | |
self.df = df | |
self.emb = emb.reshape(1, -1) | |
self.k = k | |
vec = df["vec"].to_numpy() | |
self.d = vec[0].shape[0] | |
# for i, e in enumerate(vec): | |
# if i == 0: | |
# vex = e.T | |
# else: | |
# temp = e.T | |
# vex = np.append(vex, temp) | |
vex = np.vstack(vec) # Используем vstack для объединения массивов | |
# self.vex = np.reshape(vex, (-1, 384)) | |
self.vex = vex | |
# self.index = faiss.IndexFlatIP(self.d) | |
# self.index = faiss.IndexFlatL2(self.d) | |
self.index = faiss.IndexFlat(self.d) | |
self.index.add(self.vex) | |
def __call__(self): | |
d, i = self.index.search(self.emb, self.k) | |
unique_indices = np.unique( | |
i[0] | |
) # Получаем уникальные индексы для исключения дубликатов | |
# faiss_table = self.df.iloc[i[0]] | |
# faiss_table.loc[:, "score"] = d[0] | |
faiss_table = self.df.iloc[unique_indices] | |
faiss_table["score"] = d[0][ | |
: len(unique_indices) | |
] # Присваиваем скоры, учитывая уникальность | |
return faiss_table | |