import torch from transformers import AutoModel, AutoTokenizer class QAEmbedder: def __init__(self, model_name="paraphrase-MiniLM-L6-v2"): """ Defines a QA embedding model. This is, given a set of questions, this class returns the corresponding embedding vectors. Args: model_name (`str`): Directory containing the necessary tokenizer and model files. """ self.model = None self.tokenizer = None self.model_name = model_name self.set_model(model_name) def get_model(self, model_name): """ Loads a general tokenizer and model using pytorch 'AutoTokenizer' and 'AutoModel' Args: model_name (`str`): Directory containing the necessary tokenizer and model files. """ model = AutoModel.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) return model, tokenizer def set_model(self, model_name): """ Sets a general tokenizer and model using the 'self.get_model' method. Args: model_name (`str`): Directory containing the necessary tokenizer and model files. """ self.model, self.tokenizer = self.get_model(self.model_name) def _mean_pooling(self, model_output, attention_mask): """ Internal method that takes a model output and an attention mask and outputs a mean pooling layer. Args: model_output (`torch.Tensor`): output from the QA model attention_mask (`torch.Tensor`): attention mask defined in the QA tokenizer Returns: The averaged tensor. """ token_embeddings = model_output[0] input_mask_expanded = ( attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() ) pool_emb = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( input_mask_expanded.sum(1), min=1e-9 ) return pool_emb def get_embeddings(self, questions, batch=32): """ Gets the corresponding embeddings for a set of input 'questions'. Args: questions (`list` of `str`): List of strings defining the questions to be embedded batch (`int`): Performs the embedding job 'batch' questions at a time Returns: The embedding vectors. """ question_embeddings = [] for i in range(0, len(questions), batch): # Tokenize sentences encoded_input = self.tokenizer( questions[i : i + batch], padding=True, truncation=True, return_tensors="pt", ) # Compute token embeddings with torch.no_grad(): model_output = self.model(**encoded_input) # Perform mean pooling batch_embeddings = self._mean_pooling( model_output, encoded_input["attention_mask"] ) question_embeddings.append(batch_embeddings) question_embeddings = torch.cat(question_embeddings, dim=0) return question_embeddings class QASearcher: def __init__(self, model_name="paraphrase-MiniLM-L6-v2"): """ Defines a QA Search model. This is, given a new question it searches the most similar questions in a set 'context' and returns both the best question and associated answer. Args: model_name (`str`): Directory containing the necessary tokenizer and model files. """ self.answers = None self.questions = None self.question_embeddings = None self.embedder = QAEmbedder(model_name=model_name) def set_context_qa(self, questions, answers): """ Sets the QA context to be used during search. Args: questions (`list` of `str`): List of strings defining the questions to be embedded answers (`list` of `str`): Best answer for each question in 'questions' """ self.answers = answers self.questions = questions self.question_embeddings = self.get_q_embeddings(questions) def get_q_embeddings(self, questions): """ Gets the embeddings for the questions in 'context'. Args: questions (`list` of `str`): List of strings defining the questions to be embedded Returns: The embedding vectors. """ question_embeddings = self.embedder.get_embeddings(questions) question_embeddings = torch.nn.functional.normalize( question_embeddings, p=2, dim=1 ) return question_embeddings.transpose(0, 1) def cosine_similarity(self, questions, batch=32): """ Gets the cosine similarity between the new questions and the 'context' questions. Args: questions (`list` of `str`): List of strings defining the questions to be embedded batch (`int`): Performs the embedding job 'batch' questions at a time Returns: The cosine similarity """ question_embeddings = self.embedder.get_embeddings(questions, batch=batch) question_embeddings = torch.nn.functional.normalize( question_embeddings, p=2, dim=1 ) cosine_sim = torch.mm(question_embeddings, self.question_embeddings) return cosine_sim def get_answers(self, questions, batch=32): """ Gets the best answers in the stored 'context' for the given new 'questions'. Args: questions (`list` of `str`): List of strings defining the questions to be embedded batch (`int`): Performs the embedding job 'batch' questions at a time Returns: A `list` of `dict`'s containing the original question ('orig_q'), the most similar question in the context ('best_q') and the associated answer ('best_a'). """ similarity = self.cosine_similarity(questions, batch=batch) response = [] for i in range(similarity.shape[0]): best_ix = similarity[i].argmax() best_q = self.questions[best_ix] best_a = self.answers[best_ix] response.append( { "orig_q": questions[i], "best_q": best_q, "best_a": best_a, } ) return response