import torch
from transformers import AutoModel, AutoTokenizer


class QAEmbedder:
    def __init__(self, model_name="paraphrase-MiniLM-L6-v2"):
        """
        Defines a QA embedding model. That is, given a set of questions,
        this class returns the corresponding embedding vectors.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.
        """
        self.model = None
        self.tokenizer = None
        self.model_name = model_name
        self.set_model(model_name)

    def get_model(self, model_name):
        """
        Loads a tokenizer and model using the Hugging Face transformers
        'AutoTokenizer' and 'AutoModel' classes.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.

        Returns:
          A `(model, tokenizer)` tuple.
        """
        model = AutoModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return model, tokenizer

    def set_model(self, model_name):
        """
        Sets a general tokenizer and model using the 'self.get_model'
        method.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.
        """
        self.model_name = model_name
        self.model, self.tokenizer = self.get_model(model_name)

    def _mean_pooling(self, model_output, attention_mask):
        """
        Internal method that mean-pools the token embeddings from a model
        output over the non-padding positions given by the attention mask.

        Args:
          model_output (`torch.Tensor`): output from the QA model
          attention_mask (`torch.Tensor`): attention mask defined in the QA tokenizer

        Returns:
          The mean-pooled embedding tensor of shape (batch_size, hidden_size).
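
        Example (illustrative): with token embeddings [[1., 3.], [5., 7.]]
        and attention mask [1, 1], the result is [(1+5)/2, (3+7)/2] = [3., 5.].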
        """
        # Token-level embeddings: the first element of the model output
        # (the last hidden state), shape (batch_size, seq_len, hidden_size).
        token_embeddings = model_output[0]

        # Expand the attention mask to the embedding dimension so that
        # padded positions contribute zero to the sum.
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )

        # Sum over the sequence dimension and divide by the number of real
        # tokens; the clamp guards against division by zero.
        pool_emb = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

        return pool_emb

    def get_embeddings(self, questions, batch=32):
        """
        Gets the corresponding embeddings for a set of input 'questions'.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          batch (`int`): Number of questions to embed at a time

        Returns:
          The embedding vectors.
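
        Example (illustrative sketch; assumes the default
        'paraphrase-MiniLM-L6-v2' checkpoint is available):

          >>> embedder = QAEmbedder("paraphrase-MiniLM-L6-v2")
          >>> emb = embedder.get_embeddings(["How do I reset my password?"])
          >>> emb.shape  # (1, hidden_size), e.g. torch.Size([1, 384])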
        """
        question_embeddings = []
        for i in range(0, len(questions), batch):
            # Tokenize sentences
            encoded_input = self.tokenizer(
                questions[i : i + batch],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )

            # Compute token embeddings
            with torch.no_grad():
                model_output = self.model(**encoded_input)

            # Perform mean pooling
            batch_embeddings = self._mean_pooling(
                model_output, encoded_input["attention_mask"]
            )
            question_embeddings.append(batch_embeddings)

        question_embeddings = torch.cat(question_embeddings, dim=0)
        return question_embeddings


class QASearcher:
    def __init__(self, model_name="paraphrase-MiniLM-L6-v2"):
        """
        Defines a QA search model. That is, given a new question, it searches
        for the most similar questions in a stored 'context' and returns both
        the best-matching question and its associated answer.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.
        """
        self.answers = None
        self.questions = None
        self.question_embeddings = None
        self.embedder = QAEmbedder(model_name=model_name)

    def set_context_qa(self, questions, answers):
        """
        Sets the QA context to be used during search.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          answers (`list` of `str`): Best answer for each question in 'questions'
        """
        self.answers = answers
        self.questions = questions
        self.question_embeddings = self.get_q_embeddings(questions)

    def get_q_embeddings(self, questions):
        """
        Gets the embeddings for the questions in 'context'.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded

        Returns:
          The L2-normalized embedding vectors, transposed so that the cosine
          similarity against new questions reduces to a single matrix product.
        """
        question_embeddings = self.embedder.get_embeddings(questions)
        question_embeddings = torch.nn.functional.normalize(
            question_embeddings, p=2, dim=1
        )
        return question_embeddings.transpose(0, 1)

    def cosine_similarity(self, questions, batch=32):
        """
        Gets the cosine similarity between the new questions and the 'context' questions.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          batch (`int`): Number of questions to embed at a time

        Returns:
          A tensor of shape (len(questions), n_context) holding the cosine
          similarity between each new question and each 'context' question.
        """
        question_embeddings = self.embedder.get_embeddings(questions, batch=batch)
        question_embeddings = torch.nn.functional.normalize(
            question_embeddings, p=2, dim=1
        )

        cosine_sim = torch.mm(question_embeddings, self.question_embeddings)

        return cosine_sim

    def get_answers(self, questions, batch=32):
        """
        Gets the best answers in the stored 'context' for the given new 'questions'.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          batch (`int`): Number of questions to embed at a time

        Returns:
          A `list` of `dict`s containing the original question ('orig_q'), the
          most similar question in the context ('best_q'), and the associated
          answer ('best_a').
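
        Example (illustrative sketch; assumes 'set_context_qa' has already
        been called with a context of questions and answers):

          >>> searcher.get_answers(["How can I change my password?"])
          [{'orig_q': 'How can I change my password?',
            'best_q': ...,
            'best_a': ...}]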
        """
        similarity = self.cosine_similarity(questions, batch=batch)

        response = []
        for i in range(similarity.shape[0]):
            best_ix = similarity[i].argmax().item()
            best_q = self.questions[best_ix]
            best_a = self.answers[best_ix]

            response.append(
                {
                    "orig_q": questions[i],
                    "best_q": best_q,
                    "best_a": best_a,
                }
            )

        return response
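

if __name__ == "__main__":
    # Minimal usage sketch wiring the two classes together end to end.
    # The model name and the toy QA pairs below are illustrative
    # assumptions, not fixtures shipped with this module.
    searcher = QASearcher(model_name="paraphrase-MiniLM-L6-v2")
    searcher.set_context_qa(
        questions=[
            "How do I reset my password?",
            "What are your opening hours?",
        ],
        answers=[
            "Click 'Forgot password' on the login page.",
            "We are open 9am to 5pm, Monday to Friday.",
        ],
    )
    print(searcher.get_answers(["How can I change my password?"]))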