Spaces:
Sleeping
Sleeping
File size: 1,398 Bytes
0024afc 6b534e9 0024afc 6b534e9 0024afc 6b534e9 0024afc 6b534e9 0024afc 6b534e9 0024afc 6b534e9 0024afc 6b534e9 0024afc 6b534e9 0024afc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import pandas as pd
import numpy as np
import torch
import faiss
from transformers import AutoTokenizer, AutoModel
# Compact Russian sentence-encoder; its hidden size (312) must match the
# FAISS index dimension constructed below.
CHECKPOINT = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModel.from_pretrained(CHECKPOINT)
# Precomputed annotation embeddings, one row per book; presumably
# (n_books, 312) float32 — TODO confirm against the embedding script.
vectors_annotation = np.load('datasets/annotation_embeddings2.npy')
data_frame = pd.read_csv('datasets/final_dataset.csv')
# Reshape the raw catalogue into the display columns used by the app
# (Russian labels): book link, cover image URL, a joined
# "category, age, title, author" info string, and the annotation text.
data_frame = pd.DataFrame({
'Cсылка на книгу': data_frame['page_url'],
'Обложка': data_frame['image_url'],
'Инфо': data_frame[['category_name', 'age', 'title', 'author']].agg(', '.join, axis=1),
'Аннотация': data_frame['annotation']
})
MAX_LEN = 512  # tokenizer truncation limit (BERT max sequence length)
# Exact (brute-force) L2 nearest-neighbour index over the 312-dim embeddings.
faiss_index = faiss.IndexFlatL2(312)
faiss_index.add(vectors_annotation)
def recommend(query: str, top_k: int) -> tuple[pd.DataFrame, np.ndarray]:
    """Recommend the ``top_k`` books whose annotations are closest to ``query``.

    Embeds the query with the module-level rubert-tiny2 model (CLS-token
    hidden state, 312-dim) and runs an exact L2 search against the
    precomputed annotation index.

    Args:
        query: free-text search string (truncated to ``MAX_LEN`` tokens).
        top_k: number of recommendations to return; must be >= 1.

    Returns:
        A ``(books, distances)`` tuple:
          * ``books`` — DataFrame of the matched catalogue rows (cover, info,
            annotation; the first, link, column is dropped by the ``1:``
            slice), with a fresh 0..top_k-1 index.
          * ``distances`` — rounded L2 distances, shape ``(top_k,)``.
    """
    token_ids = tokenizer.encode(query, add_special_tokens=True,
                                 truncation=True, max_length=MAX_LEN)
    input_ids = torch.tensor(token_ids).unsqueeze(0)  # (1, seq_len) batch
    with torch.inference_mode():
        output = model(input_ids)
    # CLS-token embedding: output[0] (last_hidden_state) is (1, seq_len, 312),
    # so the slice below is already the (1, 312) matrix FAISS expects —
    # no squeeze/re-wrap needed.
    vector = output[0][:, 0, :].cpu().numpy()
    value_metrics, index = faiss_index.search(vector, k=top_k)
    value_metrics = np.round(value_metrics.reshape(top_k, ))
    recommend_books = data_frame.iloc[index.reshape(top_k, ), 1:].reset_index(drop=True)
    return recommend_books, value_metrics
|