import torch
from transformers import AutoModel, AutoTokenizer


class QAEmbedder:
    def __init__(self, model_name="sentence-transformers/paraphrase-MiniLM-L6-v2"):
        """
        Defines a QA embedding model. That is, given a set of questions,
        this class returns the corresponding embedding vectors.

        Args:
          model_name (`str`): Name of, or path to, the pretrained model
            and tokenizer to load.
        """
        self.model = None
        self.tokenizer = None
        self.model_name = model_name
        self.set_model(model_name)
    def get_model(self, model_name):
        """
        Loads a tokenizer and model using the Hugging Face
        'AutoTokenizer' and 'AutoModel' classes.

        Args:
          model_name (`str`): Name of, or path to, the pretrained model
            and tokenizer to load.
        """
        model = AutoModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return model, tokenizer
    def set_model(self, model_name):
        """
        Sets the tokenizer and model using the 'self.get_model'
        method.

        Args:
          model_name (`str`): Name of, or path to, the pretrained model
            and tokenizer to load.
        """
        self.model, self.tokenizer = self.get_model(model_name)
    def _mean_pooling(self, model_output, attention_mask):
        """
        Internal method that takes the model output and the attention
        mask and returns the mean-pooled sentence embeddings.

        Args:
          model_output (`torch.Tensor`): output from the QA model
          attention_mask (`torch.Tensor`): attention mask defined in the QA tokenizer

        Returns:
          The averaged tensor.
        """
        # First element of the model output holds all token embeddings
        token_embeddings = model_output[0]
        # Expand the mask so it broadcasts over the embedding dimension
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        # Sum the embeddings of real (non-padding) tokens and divide by their
        # count; the clamp guards against division by zero for empty inputs
        pool_emb = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )
        return pool_emb
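
    # A small worked example of the pooling above (hypothetical values):
    # for one sentence with two real tokens and one padding token,
    #   token_embeddings = [[[1., 1.], [3., 3.], [9., 9.]]]  # shape (1, 3, 2)
    #   attention_mask   = [[1, 1, 0]]                       # shape (1, 3)
    # the padded position is masked out, giving ([1,1] + [3,3]) / 2 = [[2., 2.]].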

    def get_embeddings(self, questions, batch=32):
        """
        Gets the corresponding embeddings for a set of input 'questions'.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          batch (`int`): Number of questions to embed at a time

        Returns:
          The embedding vectors.
        """
        question_embeddings = []
        for i in range(0, len(questions), batch):
            # Tokenize sentences
            encoded_input = self.tokenizer(
                questions[i : i + batch],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            # Compute token embeddings
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            # Perform mean pooling
            batch_embeddings = self._mean_pooling(
                model_output, encoded_input["attention_mask"]
            )
            question_embeddings.append(batch_embeddings)
        # Stack the per-batch results into a single tensor
        question_embeddings = torch.cat(question_embeddings, dim=0)
        return question_embeddings
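

# A minimal usage sketch for QAEmbedder (the question below is made up;
# assumes the model weights are available locally or on the Hugging Face Hub):
#
#   embedder = QAEmbedder()
#   emb = embedder.get_embeddings(["How do I reset my password?"])
#   emb.shape  # torch.Size([1, 384]); MiniLM-L6 produces 384-dim vectors

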
class QASearcher:
    def __init__(self, model_name="sentence-transformers/paraphrase-MiniLM-L6-v2"):
        """
        Defines a QA search model. That is, given a new question, it
        searches for the most similar questions in a 'context' set and
        returns both the best-matching question and its associated answer.

        Args:
          model_name (`str`): Name of, or path to, the pretrained model
            and tokenizer to load.
        """
        self.answers = None
        self.questions = None
        self.question_embeddings = None
        self.embedder = QAEmbedder(model_name=model_name)

    def set_context_qa(self, questions, answers):
        """
        Sets the QA context to be used during search.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          answers (`list` of `str`): Best answer for each question in 'questions'
        """
        self.answers = answers
        self.questions = questions
        self.question_embeddings = self.get_q_embeddings(questions)

    def get_q_embeddings(self, questions):
        """
        Gets the embeddings for the questions in the 'context'.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded

        Returns:
          The L2-normalized embedding vectors, transposed so that the
          cosine similarity can be computed with a single matrix product.
        """
        question_embeddings = self.embedder.get_embeddings(questions)
        # L2-normalize so that a dot product equals the cosine similarity
        question_embeddings = torch.nn.functional.normalize(
            question_embeddings, p=2, dim=1
        )
        # Transpose to (embedding_dim, n_questions) for the matmul
        # in 'cosine_similarity'
        return question_embeddings.transpose(0, 1)

    def cosine_similarity(self, questions, batch=32):
        """
        Gets the cosine similarity between the new questions and the 'context' questions.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          batch (`int`): Number of questions to embed at a time

        Returns:
          The cosine similarity matrix between the new and the 'context' questions.
        """
        question_embeddings = self.embedder.get_embeddings(questions, batch=batch)
        question_embeddings = torch.nn.functional.normalize(
            question_embeddings, p=2, dim=1
        )
        # Both sides are unit vectors, so this matmul yields cosine similarities
        cosine_sim = torch.mm(question_embeddings, self.question_embeddings)
        return cosine_sim

    def get_answers(self, questions, batch=32):
        """
        Gets the best answers in the stored 'context' for the given new 'questions'.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          batch (`int`): Number of questions to embed at a time

        Returns:
          A `list` of `dict`s containing the original question ('orig_q'), the most
          similar question in the context ('best_q') and the associated answer ('best_a').
        """
        similarity = self.cosine_similarity(questions, batch=batch)

        response = []
        for i in range(similarity.shape[0]):
            # Index of the most similar 'context' question
            best_ix = similarity[i].argmax()
            best_q = self.questions[best_ix]
            best_a = self.answers[best_ix]
            response.append(
                {
                    "orig_q": questions[i],
                    "best_q": best_q,
                    "best_a": best_a,
                }
            )
        return response
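

# A minimal end-to-end sketch (the context data below is hypothetical; it
# assumes the model weights are available locally or on the Hugging Face Hub):
if __name__ == "__main__":
    searcher = QASearcher()
    searcher.set_context_qa(
        questions=[
            "How do I reset my password?",
            "How do I delete my account?",
        ],
        answers=[
            "Use the 'Forgot password' link on the login page.",
            "Open Settings and choose 'Delete account'.",
        ],
    )
    print(searcher.get_answers(["I forgot my password"]))
    # Expected output shape:
    # [{'orig_q': 'I forgot my password',
    #   'best_q': 'How do I reset my password?',
    #   'best_a': "Use the 'Forgot password' link on the login page."}]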