import h5py
import numpy as np
from functools import partial
from utils.gen_utils import map_nlist, vround
import regex as re
from spacyface.simple_spacy_token import SimpleSpacyToken

ZERO_BUFFER = 12 # Number of digits used to zero-pad each index in the HDF5 keys
main_key = r"{:0" + str(ZERO_BUFFER) + r"}"
suppl_attn_key = r"{:0" + str(ZERO_BUFFER) + r"}_attn"

def zip_len_check(*iters):
    """Zip iterables with a check that they are all the same length"""
    if len(iters) < 2:
        raise ValueError(f"Expected at least 2 iterables to combine. Got {len(iters)} iterables")
    n = len(iters[0])
    for i in iters:
        n_ = len(i)
        if n_ != n:
            raise ValueError(f"Expected all iterations to have len {n} but found {n_}")

    return zip(*iters)
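
# For example, zip_len_check([1, 2], ["a", "b"]) iterates like plain zip, yielding
# (1, "a") then (2, "b"), whereas zip_len_check([1, 2], ["a"]) raises a ValueError
# instead of silently truncating to the shorter iterable.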

class SentenceH5Data:
    def __init__(self, grp):
        self.grp = grp

    @property
    def n_layers(self):
        return self.embeddings.shape[0] - 1 # The first entry along axis 0 is the input embedding, not a hidden layer
        
    @property
    def sentence(self):
        return self.grp.attrs['sentence']

    @property
    def embeddings(self):
        return self.grp['embeddings'][:]

    @property
    def zero_special_embeddings(self):
        out = self.embeddings.copy()
        out[:, self.mask_is_special] = np.zeros(out[:, self.mask_is_special].shape)
        return out
    
    @property
    def contexts(self):
        return self.grp['contexts'][:]
    
    @property
    def zero_special_contexts(self):
        out = self.contexts.copy()
        out[:, self.mask_is_special] = np.zeros(out[:, self.mask_is_special].shape)
        return out

    @property
    def attentions(self):
        """Return all attentions, including [CLS] and [SEP]
        
        Note that if the hdf5 is created with CLS and SEP attentions, it will have CLS and SEP attentions"""
        return self.grp['attentions'][:] # Converts to numpy array

    @property
    def mask_is_special(self):
        return np.logical_or(self.deps == '', self.poss == '')

    @property
    def tokens(self):
        return self.grp.attrs['token']

    @property
    def poss(self):
        return self.grp.attrs['pos']

    @property
    def deps(self):
        return self.grp.attrs['dep']

    @property
    def is_ents(self):
        return self.grp.attrs['is_ent']
    
    @property
    def heads(self):
        """Not the attention heads, but rather the head word of the orig sentence"""
        return self.grp.attrs['head']
    
    @property
    def norms(self):
        return self.grp.attrs['norm']
    
    @property
    def tags(self):
        return self.grp.attrs['tag']
    
    @property
    def lemmas(self):
        return self.grp.attrs['lemma']

    def __len__(self):
        return len(self.tokens)

    def __repr__(self):
        max_len = 40
        s = self.sentence if len(self.sentence) <= max_len else self.sentence[:max_len - 3] + '...'
        return f"SentenceH5Data({s})"

class TokenH5Data(SentenceH5Data):
    """A wrapper around the HDF5 file storage information allowing easy access to information about each 
    processed sentence.

    Sometimes, and index of -1 is used to represent the entire object in memory
    """
    def __init__(self, grp, index):
        """Initialized from the refmap of the CorpusEmbedding class with the token's index within the sentence"""
        if isinstance(grp, SentenceH5Data):
            super().__init__(grp.grp)
        elif isinstance(grp, h5py.Group):
            super().__init__(grp)
        else:
            raise TypeError(f"Expected a SentenceH5Data or h5py.Group, got {type(grp)}")
        self.index = index

    @property
    def embedding(self):
        return self.embeddings[:, self.index, :]
    
    @property
    def context(self):
        return self.contexts[:, self.index, :]

    @property
    def attentions_out(self):
        """Access all attention OUT of this token"""
        output = self.attentions[:,:, self.index, :]
        return output

    @property
    def attentions_in(self):
        """Access all attention INTO this token"""
        new_attention = self.attentions.transpose((0,1,3,2))
        return new_attention[:,:, self.index, :]

    def _select_from_attention(self, layer, heads):
        """Average the attentions of the selected layer across the chosen heads"""
        if isinstance(heads, int):
            heads = [heads]

        # Select layer and heads, then average over the head axis -> (n_tokens, n_tokens)
        modified_attentions = self.attentions[layer, heads].mean(0)
        attentions_out = modified_attentions               # row i: attention out of token i
        attentions_in = modified_attentions.transpose()    # row i: attention into token i
        return attentions_out, attentions_in

    def _calc_offset_single(self, attention):
        """Get offset to location of max attention"""
        curr_idx = self.index
        max_atts = np.argmax(attention)
        return max_atts - curr_idx

    # Metadata properties.
    # These currently require manual curation of fields from SimpleSpacyToken; ideally this would be automated.
    
    @property
    def token(self):
        return self.tokens[self.index]

    @property
    def pos(self):
        return self.poss[self.index]

    @property
    def dep(self):
        return self.deps[self.index]
    
    @property
    def is_ent(self):
        return bool(self.is_ents[self.index])

    @property
    def norm(self):
        return self.norms[self.index]
    
    @property
    def head(self):
        return self.heads[self.index]
    
    @property
    def lemma(self):
        return self.lemmas[self.index]
    
    @property
    def tag(self):
        return self.tags[self.index]

    def to_json(self, layer, heads, top_k=5, ndigits=4):
        """
        Convert token information and attention to return to frontend
        
        Require layer, heads, and top_k to convert the attention into value to return to frontend.
        
        Output:
            {
                sentence: str
                index: number
                match: str
                is_match: bool
                is_next_word: bool
                matched_att: {
                    in: { att: number[]
                        , offset_to_max: number
                        , loc_of_max: float 
                        }
                    out: { att: number[]
                        , offset_to_max: number
                        , loc_of_max: float 
                        }
                },
                matched_att_plus_1: {
                    in: { att: number[]
                        , offset_to_max: number
                        }
                    out: { att: number[]
                        , offset_to_max: number
                        }
                }
                tokens: List[
                    { token: string
                    , pos: string
                    , dep: string
                    , is_ent: boolean
                    , inward: number[]
                    , outward: number[]
                    }
                ]
            }
        """
        keys = [
            "token",
            "pos",
            "dep",
            "is_ent",
            "inward",
            "outward",
        ]

        token_arr = []
        matched_attentions = {}
        N = len(self)

        # Iterate through the following
        tokens = self.tokens.tolist()
        poss = [p.lower() for p in self.poss.tolist()]
        deps = [d.lower() for d in self.deps.tolist()]
        ents = self.is_ents.tolist()
        attentions_out, attentions_in = self._select_from_attention(layer, heads)

        matched_att_plus_1 = None
        next_index = None

        for i, tok_info in enumerate(zip_len_check( 
            tokens
            , poss
            , deps
            , ents
            , attentions_out.tolist()
            , attentions_in.tolist())):

            # Round the attentions before packaging them up
            rounder = partial(round, ndigits=ndigits)
            att_out = map_nlist(rounder, tok_info[-2])
            att_in = map_nlist(rounder, tok_info[-1])

            def get_interesting_attentions():
                """Package the rounded in/out attentions of this token with the offset to their max"""
                return {
                    "in": {
                        "att": att_in,
                        "offset_to_max": self._calc_offset_single(att_in).item(),
                        # "loc_of_max": np.argmax(att_in), # Broken
                    },
                    "out": {
                        "att": att_out,
                        "offset_to_max": self._calc_offset_single(att_out).item(),
                        # "loc_of_max": np.argmax(att_out), # Broken
                    }
                }

            obj = {k: v for (k, v) in zip_len_check(keys, tok_info)}

            is_last_token = i == (N - 1)

            # Default flags; overridden below for the matched token and the token after it
            obj['is_match'] = False
            obj['is_next_word'] = False

            if i == self.index:
                obj['is_match'] = True
                matched_attentions = get_interesting_attentions()
            elif (i - 1) == self.index:
                obj['is_next_word'] = True
                matched_att_plus_1 = get_interesting_attentions()
                next_index = i

            # Edge case: the matched token is the last token of the sentence, so there is
            # no token after it. Treat the final token as its own "next word".
            if is_last_token and (matched_att_plus_1 is None):
                obj['is_next_word'] = True
                matched_att_plus_1 = get_interesting_attentions()
                next_index = i

            token_arr.append(obj)

        next_token = self.tokens[next_index]

        obj = {
            "sentence": self.sentence,
            "index": self.index,
            "match": self.token,
            "next_index": next_index,
            "match_plus_1": next_token,
            "matched_att": matched_attentions,
            "matched_att_plus_1": matched_att_plus_1,
            "tokens": token_arr,
        }

        return obj
    
    def __repr__(self):
        return f"{self.token}: [{self.pos}, {self.dep}, {self.is_ent}]"