File size: 1,398 Bytes
0024afc
 
 
 
 
 
 
 
6b534e9
0024afc
6b534e9
 
0024afc
 
6b534e9
 
 
 
 
 
 
 
 
0024afc
 
 
 
 
 
 
 
6b534e9
0024afc
6b534e9
0024afc
 
 
 
 
6b534e9
0024afc
 
 
6b534e9
 
0024afc
6b534e9
0024afc
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
import numpy as np
import torch
import faiss

from transformers import AutoTokenizer, AutoModel


# Pretrained Russian sentence encoder used to embed user queries.
CHECKPOINT = "cointegrated/rubert-tiny2"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModel.from_pretrained(CHECKPOINT)

# Precomputed embeddings of the book annotations; presumably produced by the
# same encoder (rubert-tiny2's hidden size is 312, matching the index below)
# -- TODO confirm against the embedding-generation script.
vectors_annotation = np.load('datasets/annotation_embeddings2.npy')

data_frame = pd.read_csv('datasets/final_dataset.csv')

# Re-shape the raw dataset into the display columns used by the recommender.
# NOTE(review): the key 'Cсылка на книгу' starts with a Latin 'C' followed by
# Cyrillic letters -- likely a mixed-script typo, but it is a runtime column
# name so it is left byte-identical here; fix it together with all consumers.
data_frame = pd.DataFrame({
    'Cсылка на книгу': data_frame['page_url'],
    'Обложка': data_frame['image_url'],
    'Инфо': data_frame[['category_name', 'age', 'title', 'author']].agg(', '.join, axis=1),
    'Аннотация': data_frame['annotation']
})

# Hard cap on query length in tokens (BERT positional-embedding limit).
MAX_LEN = 512

# Exact (brute-force) L2-distance index over the 312-dim annotation vectors.
faiss_index = faiss.IndexFlatL2(312)

faiss_index.add(vectors_annotation)


def recommend(query: str, top_k: int) -> tuple[pd.DataFrame, np.ndarray]:
    """Return the ``top_k`` books whose annotations are closest to ``query``.

    The query is embedded with the [CLS]-token hidden state of the
    rubert-tiny2 encoder and matched against the FAISS L2 index of
    precomputed annotation embeddings.

    Args:
        query: Free-text search query (Russian).
        top_k: Number of recommendations to return.

    Returns:
        A tuple ``(books, distances)`` where ``books`` is a DataFrame with
        the cover/info/annotation columns of the matched books (ordered from
        closest match) and ``distances`` are the corresponding L2 distances,
        rounded to the nearest integer.
    """
    # Fix: the original annotation declared ``-> pd.DataFrame`` although the
    # function has always returned a (DataFrame, ndarray) tuple.
    token_ids = tokenizer.encode(
        query, add_special_tokens=True, truncation=True, max_length=MAX_LEN
    )
    input_ids = torch.tensor(token_ids).unsqueeze(0)  # shape (1, seq_len)

    with torch.inference_mode():
        output = model(input_ids)
        # [CLS] embedding of the last hidden state, kept as a (1, 312) batch
        # for FAISS (the original squeezed then re-wrapped -- equivalent for
        # the batch-of-1 input built above).
        vector = output[0][:, 0, :].cpu().numpy()

    distances, indices = faiss_index.search(vector, k=top_k)

    distances = np.round(distances.reshape(top_k))
    # Column 0 (the book URL) is deliberately dropped from the result,
    # matching the original ``iloc[..., 1:]`` behaviour.
    recommend_books = data_frame.iloc[indices.reshape(top_k), 1:].reset_index(drop=True)

    return recommend_books, distances