# NLP_HW1 / bi_encoder.py

import pandas as pd
import pickle
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Shared pre-trained DistilBERT tokenizer and encoder used to embed queries.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
bert_model = AutoModel.from_pretrained("distilbert-base-uncased")

def mean_pool(token_embeds: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over the sequence, ignoring padding via the attention mask."""
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(in_mask.sum(1), min=1e-9)
    return pool

def encode(input_texts: list[str], tokenizer: AutoTokenizer, model: AutoModel,
           device: str = "cpu") -> torch.Tensor:
    """Tokenize the input texts and return mean-pooled sentence embeddings."""
    model.eval()
    tokenized_texts = tokenizer(input_texts, max_length=512,
                                padding='max_length', truncation=True, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        token_embeds = model(tokenized_texts["input_ids"].to(device),
                             tokenized_texts["attention_mask"].to(device)).last_hidden_state
    pooled_embeds = mean_pool(token_embeds, tokenized_texts["attention_mask"].to(device))
    return pooled_embeds
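
# Quick sanity check (illustrative only, not part of the retrieval pipeline):
# example_vecs = encode(["a short test sentence"], tokenizer, bert_model, "cpu")
# example_vecs.shape  # -> torch.Size([1, 768]) for distilbert-base-uncased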

# Load the cached sentence data (expected to include a precomputed 'embeds' column)
# and the raw corpus passages, then join them into one DataFrame.
with open('data/sentences.pkl', 'rb') as f:
    sentences = pickle.load(f)
with open('data/corpus.pkl', 'rb') as f:
    corpus = pickle.load(f)

df = pd.DataFrame.from_dict(sentences)
df['corpus'] = corpus

def get_question(context: str, question: str):
    """Embed the combined [context + question] pair as a single query vector."""
    cont_quest = f"{context} [Cont_token] {question}"
    pooled_embeds = encode([cont_quest], tokenizer, bert_model, "cpu")
    return pooled_embeds.cpu().detach().numpy()

def cosine_sim(question, embed):
    """Cosine similarity between the query vector and one stored passage embedding."""
    return cosine_similarity(question, embed)[0][0]

def get_corpus(context: str, question: str):
    """Return the 10 corpus passages most similar to the query by cosine similarity."""
    question_embed = get_question(context, question)
    df['cosine_similarity'] = df.apply(lambda x: cosine_sim(question_embed, x['embeds']), axis=1)
    corp = df.sort_values(by=['cosine_similarity'], ascending=False).head(10)['corpus'].tolist()
    return corp
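
# Usage sketch (illustrative only): the context and question below are made up,
# and the pickled data files above must exist for this to run.
if __name__ == "__main__":
    example_context = "The Eiffel Tower was completed in 1889 and stands in Paris."
    example_question = "When was the Eiffel Tower completed?"
    print(get_corpus(example_context, example_question))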