File size: 2,024 Bytes
701d7dd
 
 
 
 
 
5bbc7f7
 
701d7dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import pickle
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Tokenizer and encoder are created once at import time and shared by the
# functions below; both are resolved from the same checkpoint name and use
# "model/" as their cache directory.
_MODEL_NAME = "distilbert-base-uncased"
_CACHE_DIR = "model/"

tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME, cache_dir=_CACHE_DIR)
bert_model = AutoModel.from_pretrained(_MODEL_NAME, cache_dir=_CACHE_DIR)


def mean_pool(token_embeds: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings along dim 1, ignoring masked-out positions.

    Args:
        token_embeds: Per-token embeddings; expected layout is
            (batch, seq_len, hidden) since the reduction runs over dim 1.
        attention_mask: Mask with one fewer trailing dimension than
            ``token_embeds`` (expected (batch, seq_len)); positions holding 0
            contribute nothing to the average.

    Returns:
        The masked mean of ``token_embeds`` over dim 1. A row whose mask is
        all zeros yields zeros rather than NaN.
    """
    # Add a trailing axis and broadcast the mask to the embeddings' shape so
    # it can be applied element-wise.
    mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    summed = torch.sum(token_embeds * mask, dim=1)
    # Clamp the token count so a fully masked row divides by a tiny positive
    # number instead of zero.
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return summed / counts


def encode(
    input_texts: list[str], tokenizer: AutoTokenizer, model: AutoModel, device: str = "cpu"
) -> torch.Tensor:
    """Embed texts as the attention-masked mean of the model's last hidden state.

    Args:
        input_texts: Texts to embed. (``get_question`` in this file passes a
            single string rather than a list.)
        tokenizer: Tokenizer called on ``input_texts``; inputs are truncated
            and padded to 512 tokens.
        model: Encoder whose output exposes ``last_hidden_state``. It is put
            in eval mode and moved to ``device``.
        device: Torch device string on which the forward pass runs.

    Returns:
        The pooled embeddings produced by ``mean_pool``, located on
        ``device`` and detached from autograd.
    """
    model.eval()
    # The inputs are moved to `device` below; the model has to live on the
    # same device or the forward pass fails for anything other than its
    # current device.
    model.to(device)

    tokenized_texts = tokenizer(input_texts, max_length=512,
                                padding='max_length', truncation=True, return_tensors="pt")
    input_ids = tokenized_texts["input_ids"].to(device)
    # Transferred once and reused for both the forward pass and the pooling.
    attention_mask = tokenized_texts["attention_mask"].to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        token_embeds = model(input_ids=input_ids,
                             attention_mask=attention_mask).last_hidden_state
    return mean_pool(token_embeds, attention_mask)


def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code, so these files must come from
# a trusted source.
sentences = _load_pickle('data/sentences.pkl')
corpus = _load_pickle('data/corpus.pkl')

# One frame holding the sentence data with the corpus attached as a column.
df = pd.DataFrame.from_dict(sentences)
df['corpus'] = corpus


def get_question(context: str, question: str):
    """Embed a context/question pair and return the result as a NumPy array.

    The two strings are joined with the literal separator ``[Cont_token]``
    and encoded on the CPU with the module-level tokenizer and model.
    """
    prompt = f"{context} [Cont_token] {question}"
    embedding = encode(prompt, tokenizer, bert_model, "cpu")
    # Bring the tensor to host memory and drop autograd tracking before
    # converting it to NumPy.
    return embedding.cpu().detach().numpy()


def cosine_sim(question, embed):
    """Return the [0][0] entry of the cosine-similarity matrix of the inputs.

    Both arguments are passed straight to ``cosine_similarity``; the entry
    returned compares the first row of ``question`` with the first row of
    ``embed``.
    """
    similarity_matrix = cosine_similarity(question, embed)
    return similarity_matrix[0][0]


def get_corpus(context: str, question: str, top_k: int = 10) -> list:
    """Return the corpus entries most similar to a context/question pair.

    Args:
        context: Context text, combined with ``question`` by ``get_question``.
        question: Question text.
        top_k: Number of best-scoring entries to return (default 10).

    Returns:
        The ``corpus`` values of the ``top_k`` rows of the module-level
        ``df`` with the highest cosine similarity, best match first.

    Note:
        As a side effect, the scores are stored in the ``cosine_similarity``
        column of the module-level ``df`` and overwritten on every call.
    """
    question_embed = get_question(context, question)
    # Score the embedding column directly; a row-wise DataFrame.apply would
    # build a Series for every row just to read a single field from it.
    df['cosine_similarity'] = df['embeds'].apply(
        lambda embed: cosine_sim(question_embed, embed)
    )
    ranked = df.sort_values(by=['cosine_similarity'], ascending=False)
    return ranked.head(top_k)['corpus'].tolist()