import torch
from transformers import AutoModel, AutoTokenizer


class QAEmbedder:
    def __init__(self, model_name="paraphrase-MiniLM-L6-v2"):
        """
        Defines a QA embedding model. That is, given a set of questions,
        this class returns the corresponding embedding vectors.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.
        """
        self.model = None
        self.tokenizer = None
        self.model_name = model_name
        self.set_model(model_name)

    def get_model(self, model_name):
        """
        Loads a tokenizer and model using the Hugging Face transformers
        'AutoTokenizer' and 'AutoModel' classes.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.

        Returns:
          A `(model, tokenizer)` tuple.
        """
        model = AutoModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return model, tokenizer

    def set_model(self, model_name):
        """
        Sets a general tokenizer and model using the 'self.get_model'
        method.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.
        """
        self.model_name = model_name
        self.model, self.tokenizer = self.get_model(model_name)

    def _mean_pooling(self, model_output, attention_mask):
        """
        Internal method that mean-pools the token embeddings from a model
        output over the non-padding positions given by the attention mask.

        Args:
          model_output (`torch.Tensor`): output from the QA model
          attention_mask (`torch.Tensor`): attention mask defined in the QA tokenizer

        Returns:
          The mean-pooled embedding tensor of shape (batch_size, hidden_size).
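
        Example (illustrative): with token embeddings [[1., 3.], [5., 7.]]
        and attention mask [1, 1], the result is [(1+5)/2, (3+7)/2] = [3., 5.].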
        """
        # Token-level embeddings: the first element of the model output
        # (the last hidden state), shape (batch_size, seq_len, hidden_size).
        token_embeddings = model_output[0]

        # Expand the attention mask to the embedding dimension so that
        # padded positions contribute zero to the sum.
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )

        # Sum over the sequence dimension and divide by the number of real
        # tokens; the clamp guards against division by zero.
        pool_emb = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

        return pool_emb

    def get_embeddings(self, questions, batch=32):
        """
        Gets the corresponding embeddings for a set of input 'questions'.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          batch (`int`): Number of questions to embed at a time

        Returns:
          The embedding vectors.
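
        Example (illustrative sketch; assumes the default
        'paraphrase-MiniLM-L6-v2' checkpoint is available):

          >>> embedder = QAEmbedder("paraphrase-MiniLM-L6-v2")
          >>> emb = embedder.get_embeddings(["How do I reset my password?"])
          >>> emb.shape  # (1, hidden_size), e.g. torch.Size([1, 384])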
        """
        question_embeddings = []
        for i in range(0, len(questions), batch):
            # Tokenize sentences
            encoded_input = self.tokenizer(
                questions[i : i + batch],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )

            # Compute token embeddings
            with torch.no_grad():
                model_output = self.model(**encoded_input)

            # Perform mean pooling
            batch_embeddings = self._mean_pooling(
                model_output, encoded_input["attention_mask"]
            )
            question_embeddings.append(batch_embeddings)

        question_embeddings = torch.cat(question_embeddings, dim=0)
        return question_embeddings


class QASearcher:
    def __init__(self, model_name="paraphrase-MiniLM-L6-v2"):
        """
        Defines a QA search model. That is, given a new question, it searches
        for the most similar questions in a stored 'context' and returns both
        the best-matching question and its associated answer.

        Args:
          model_name (`str`): Directory containing the necessary tokenizer
            and model files.
        """
        self.answers = None
        self.questions = None
        self.question_embeddings = None
        self.embedder = QAEmbedder(model_name=model_name)

    def set_context_qa(self, questions, answers):
        """
        Sets the QA context to be used during search.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          answers (`list` of `str`): Best answer for each question in 'questions'
        """
        self.answers = answers
        self.questions = questions
        self.question_embeddings = self.get_q_embeddings(questions)

    def get_q_embeddings(self, questions):
        """
        Gets the embeddings for the questions in 'context'.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded

        Returns:
          The L2-normalized embedding vectors, transposed so that the cosine
          similarity against new questions reduces to a single matrix product.
        """
        question_embeddings = self.embedder.get_embeddings(questions)
        question_embeddings = torch.nn.functional.normalize(
            question_embeddings, p=2, dim=1
        )
        return question_embeddings.transpose(0, 1)

    def cosine_similarity(self, questions, batch=32):
        """
        Gets the cosine similarity between the new questions and the 'context' questions.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          batch (`int`): Number of questions to embed at a time

        Returns:
          A tensor of shape (len(questions), n_context) holding the cosine
          similarity between each new question and each 'context' question.
        """
        question_embeddings = self.embedder.get_embeddings(questions, batch=batch)
        question_embeddings = torch.nn.functional.normalize(
            question_embeddings, p=2, dim=1
        )

        cosine_sim = torch.mm(question_embeddings, self.question_embeddings)

        return cosine_sim

    def get_answers(self, questions, batch=32):
        """
        Gets the best answers in the stored 'context' for the given new 'questions'.

        Args:
          questions (`list` of `str`): List of strings defining the questions to be embedded
          batch (`int`): Number of questions to embed at a time

        Returns:
          A `list` of `dict`s containing the original question ('orig_q'), the
          most similar question in the context ('best_q'), and the associated
          answer ('best_a').
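
        Example (illustrative sketch; assumes 'set_context_qa' has already
        been called with a context of questions and answers):

          >>> searcher.get_answers(["How can I change my password?"])
          [{'orig_q': 'How can I change my password?',
            'best_q': ...,
            'best_a': ...}]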
        """
        similarity = self.cosine_similarity(questions, batch=batch)

        response = []
        for i in range(similarity.shape[0]):
            best_ix = similarity[i].argmax().item()
            best_q = self.questions[best_ix]
            best_a = self.answers[best_ix]

            response.append(
                {
                    "orig_q": questions[i],
                    "best_q": best_q,
                    "best_a": best_a,
                }
            )

        return response
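

if __name__ == "__main__":
    # Minimal usage sketch wiring the two classes together end to end.
    # The model name and the toy QA pairs below are illustrative
    # assumptions, not fixtures shipped with this module.
    searcher = QASearcher(model_name="paraphrase-MiniLM-L6-v2")
    searcher.set_context_qa(
        questions=[
            "How do I reset my password?",
            "What are your opening hours?",
        ],
        answers=[
            "Click 'Forgot password' on the login page.",
            "We are open 9am to 5pm, Monday to Friday.",
        ],
    )
    print(searcher.get_answers(["How can I change my password?"]))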