File size: 1,398 Bytes
0024afc
 
 
 
 
 
 
 
6b534e9
0024afc
6b534e9
 
0024afc
 
6b534e9
 
 
 
 
 
 
 
 
0024afc
 
 
 
 
 
 
 
6b534e9
0024afc
6b534e9
0024afc
 
 
 
 
6b534e9
0024afc
 
 
6b534e9
 
0024afc
6b534e9
0024afc
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
import numpy as np
import torch
import faiss

from transformers import AutoTokenizer, AutoModel


# Pretrained Russian sentence encoder used to embed user queries.
CHECKPOINT = "cointegrated/rubert-tiny2"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModel.from_pretrained(CHECKPOINT)

# Precomputed embeddings of the book annotations; presumably produced by the
# same encoder (rubert-tiny2's hidden size is 312, matching the index below)
# -- TODO confirm against the embedding-generation script.
vectors_annotation = np.load('datasets/annotation_embeddings2.npy')

data_frame = pd.read_csv('datasets/final_dataset.csv')

# Re-shape the raw dataset into the display columns used by the recommender.
# NOTE(review): the key 'Cсылка на книгу' starts with a Latin 'C' followed by
# Cyrillic letters -- likely a mixed-script typo, but it is a runtime column
# name so it is left byte-identical here; fix it together with all consumers.
data_frame = pd.DataFrame({
    'Cсылка на книгу': data_frame['page_url'],
    'Обложка': data_frame['image_url'],
    'Инфо': data_frame[['category_name', 'age', 'title', 'author']].agg(', '.join, axis=1),
    'Аннотация': data_frame['annotation']
})

# Hard cap on query length in tokens (BERT positional-embedding limit).
MAX_LEN = 512

# Exact (brute-force) L2-distance index over the 312-dim annotation vectors.
faiss_index = faiss.IndexFlatL2(312)

faiss_index.add(vectors_annotation)


def recommend(query: str, top_k: int) -> tuple[pd.DataFrame, np.ndarray]:
    """Return the ``top_k`` books whose annotations are closest to ``query``.

    The query is embedded with the [CLS]-token hidden state of the
    rubert-tiny2 encoder and matched against the FAISS L2 index of
    precomputed annotation embeddings.

    Args:
        query: Free-text search query (Russian).
        top_k: Number of recommendations to return.

    Returns:
        A tuple ``(books, distances)`` where ``books`` is a DataFrame with
        the cover/info/annotation columns of the matched books (ordered from
        closest match) and ``distances`` are the corresponding L2 distances,
        rounded to the nearest integer.
    """
    # Fix: the original annotation declared ``-> pd.DataFrame`` although the
    # function has always returned a (DataFrame, ndarray) tuple.
    token_ids = tokenizer.encode(
        query, add_special_tokens=True, truncation=True, max_length=MAX_LEN
    )
    input_ids = torch.tensor(token_ids).unsqueeze(0)  # shape (1, seq_len)

    with torch.inference_mode():
        output = model(input_ids)
        # [CLS] embedding of the last hidden state, kept as a (1, 312) batch
        # for FAISS (the original squeezed then re-wrapped -- equivalent for
        # the batch-of-1 input built above).
        vector = output[0][:, 0, :].cpu().numpy()

    distances, indices = faiss_index.search(vector, k=top_k)

    distances = np.round(distances.reshape(top_k))
    # Column 0 (the book URL) is deliberately dropped from the result,
    # matching the original ``iloc[..., 1:]`` behaviour.
    recommend_books = data_frame.iloc[indices.reshape(top_k), 1:].reset_index(drop=True)

    return recommend_books, distances