# test_2/func.py
import pandas as pd
import numpy as np
import torch
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence-level RuBERT encoder from DeepPavlov.
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
model = BertModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence", output_hidden_states=True)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def filter_by_ganre(df: pd.DataFrame, ganre_list: list):
    """Return index labels of rows whose 'ganres' list shares at least one genre with ganre_list."""
    filtered_df = df[df['ganres'].apply(lambda x: any(g in ganre_list for g in x))]
    filt_ind = filtered_df.index.to_list()
    return filt_ind
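
# Usage sketch (hypothetical data; assumes the 'ganres' column holds lists of
# genre strings and the DataFrame keeps its default RangeIndex, so the
# returned labels double as row positions into the embeddings matrix):
#
#     df = pd.DataFrame({'ganres': [['drama', 'comedy'], ['horror'], ['drama']]})
#     filter_by_ganre(df, ['drama'])  # -> [0, 2]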

# Earlier variant kept for reference: mean pooling weighted by the attention
# mask, so padding tokens do not dilute the sentence embedding.
# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output['last_hidden_state']
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
#     sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
#     return sum_embeddings / sum_mask

# def recommendation(filt_ind: list, embeddings: np.array, user_text: str, n=10):
#     token_user_text = tokenizer(user_text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
#     user_embeddings = torch.Tensor().to(device)
#     model.to(device)
#     model.eval()
#     with torch.no_grad():
#         batch = {k: v.to(device) for k, v in token_user_text.items()}
#         outputs = model(**batch)
#         user_embeddings = torch.cat([user_embeddings, mean_pooling(outputs, batch['attention_mask'])])
#     user_embeddings = user_embeddings.cpu().numpy()
#     cosine_similarities = cosine_similarity(embeddings[filt_ind], user_embeddings.reshape(1, -1))
#     df_res = pd.DataFrame(cosine_similarities.ravel(), columns=['cos_sim']).sort_values('cos_sim', ascending=False)
#     dict_topn = df_res.iloc[:n, :].cos_sim.to_dict()
#     return dict_topn

def recommendation(filt_ind: list, embeddings: np.ndarray, user_text: str, n=10):
    """Return the top-n cosine similarities between user_text and the rows of embeddings selected by filt_ind."""
    tokens = tokenizer(user_text, return_tensors="pt", padding=True, truncation=True)
    model.to(device)
    model.eval()
    with torch.no_grad():
        tokens = {key: value.to(model.device) for key, value in tokens.items()}
        outputs = model(**tokens)
        # Unweighted mean over the token dimension yields one sentence vector.
        user_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    cosine_similarities = cosine_similarity(embeddings[filt_ind], user_embedding.reshape(1, -1))
    df_res = pd.DataFrame(cosine_similarities.ravel(), columns=['cos_sim']).sort_values('cos_sim', ascending=False)
    # Keys are positions within filt_ind, not original DataFrame labels.
    dict_topn = df_res.iloc[:n, :].cos_sim.to_dict()
    return dict_topn
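
# End-to-end sketch (hypothetical names: `films_df` and `film_embeddings`
# stand in for a film catalogue and its pre-computed sentence embeddings,
# aligned row-for-row; neither is defined in this module):
#
#     filt_ind = filter_by_ganre(films_df, ['drama'])
#     top10 = recommendation(filt_ind, film_embeddings, 'A story about ...', n=10)
#     # Map positional keys back to DataFrame index labels:
#     top10_by_label = {filt_ind[pos]: sim for pos, sim in top10.items()}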