File size: 16,426 Bytes
1a3c007
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
# cython: language_level=3

"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import numpy as np

cimport numpy as np
cimport cython


def make_x_y_offsets(alignment_types):
    # alignment types for which we will precompute costs

    # deletion/insertion is added later
    for x, y in alignment_types:
        assert (x > 0)
        assert (y > 0)

    x_offsets = np.array([x for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    y_offsets = np.array([y for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    return x_offsets, y_offsets


def make_dense_costs(np.ndarray[float, ndim=3] vecs0,  # itput
                     np.ndarray[float, ndim=3] vecs1,  # input
                     np.ndarray[float, ndim=2] norm0,  # input
                     np.ndarray[float, ndim=2] norm1,  # input
                     int offset0 = 0,  # index into vecs0/norms0
                     int offset1 = 0,  # index into vecs1/norms1
                     ):
    """
    Make a full N*M feature matrix. By default, makes 1-1 alignments, 
       can build others by specifying offset0, offset1 to index into
       vecs0, norms0 and vecs1, norms1 respectivly. 
    """
    assert vecs0.shape[0] > offset0
    assert vecs1.shape[0] > offset1
    assert norm0.shape[0] > offset0
    assert norm1.shape[0] > offset1

    cdef int size0 = np.shape(vecs0)[1]
    assert norm0.shape[1] == size0

    cdef int size1 = np.shape(vecs1)[1]
    assert norm1.shape[1] == size1

    cdef int vecsize = np.shape(vecs0)[2]
    assert vecs1.shape[2] == vecsize

    cdef int xi, yi
    cdef float sumx

    cdef np.ndarray[float, ndim=2] costs = np.empty((size0, size1), dtype=np.float32)

    for xi in range(size0):
        for yi in range(size1):
            sumx = 0.0
            for jj in range(vecsize):
                sumx += vecs0[offset0, xi, jj] * vecs1[offset1, yi, jj]

            costs[xi, yi] = 2.0 * (1.0 - sumx) / (1e-6 + norm0[offset0, xi] + norm1[offset1, yi])
            # normalize by alignment type  
            costs[xi, yi] = costs[xi, yi] * (offset0 + 1) * (offset1 + 1)

    return costs


def dense_dp(np.ndarray[float, ndim=2] alignment_cost, float pen):
    """
    Compute cost matrix (csum) and backpointers (bp) 
    from full 2-D 1-1 alignment costs matrix (alignment_cost) 
    """

    size0 = alignment_cost.shape[0]
    size1 = alignment_cost.shape[1]
    # csum and traceback matrix are both on nodes
    #   so they are +1 in each dimension compared to the jump costs matrix
    # For anything being used in accumulation, use float64
    cdef np.ndarray[double, ndim=2] csum = np.empty((size0 + 1, size1 + 1), dtype=np.float64)
    cdef np.ndarray[int, ndim=2] bp = np.empty((size0 + 1, size1 + 1), dtype=np.int32)

    # bp and csum are nodes, 
    #   while alignment_cost is the cost of going between the nodes
    # Size of nodes should be one larger than alignment costs
    b0, b1 = np.shape(bp)
    c0, c1 = np.shape(csum)
    j0, j1 = np.shape(alignment_cost)
    assert (b0 == c0 == j0 + 1)
    assert (b1 == c1 == j1 + 1)

    cdef int cmax = np.shape(csum)[1]
    cdef int rmax = np.shape(csum)[0]
    cdef int c, r
    cdef double cost0, cost1, cost2

    # initialize the all c-direction deletion path
    for c in range(cmax):
        csum[0, c] = c * pen
        bp[0, c] = 1

    # initialize the all r-direction deletion path
    for r in range(rmax):
        csum[r, 0] = r * pen
        bp[r, 0] = 2

    # Initial cost is 0.0
    csum[0, 0] = 0.0  # noop
    bp[0, 0] = 4  # should not matter

    # Calculate the rest recursively
    for c in range(1, cmax):
        for r in range(1, rmax):

            # alignment_cost indexes are off by 1 wrt
            #   csum/bp, since csum/bp are nodes
            cost0 = csum[r - 1, c - 1] + alignment_cost[r - 1, c - 1]
            cost1 = csum[r, c - 1] + pen
            cost2 = csum[r - 1, c] + pen

            csum[r, c] = cost0
            bp[r, c] = 0

            if cost1 < csum[r, c]:
                csum[r, c] = cost1
                bp[r, c] = 1
            if cost2 < csum[r, c]:
                csum[r, c] = cost2
                bp[r, c] = 2

    return csum, bp


def score_path(np.ndarray[int, ndim=1] xx,
               np.ndarray[int, ndim=1] yy,
               np.ndarray[float, ndim=1] norm1,
               np.ndarray[float, ndim=1] norm2,
               np.ndarray[float, ndim=2] vecs1,
               np.ndarray[float, ndim=2] vecs2,
               np.ndarray[float, ndim=1] out):
    cdef int xi, yi, ii, jj
    cdef float outx
    cdef int lenxy = xx.shape[0]
    cdef int vecsize = vecs1.shape[1]

    for ii in range(lenxy):
        xi = xx[ii]
        yi = yy[ii]
        outx = 0.0
        for jj in range(vecsize):
            outx += vecs1[xi, jj] * vecs2[yi, jj]
        out[ii] = 2.0 * (1.0 - outx) / (norm1[xi] + norm2[yi])


# Bounds checking and wraparound slow things down by about 2x
# Division by 0 checking has minimal speed impact
@cython.boundscheck(False)  # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.cdivision(True)  # use c-style division (no division-by-zero check)
def make_sparse_costs(np.ndarray[float, ndim=3] vecs0,  # intput: num aligns X num sents X dim
                      np.ndarray[float, ndim=3] vecs1,  # input
                      np.ndarray[float, ndim=2] norms0,  # intput: num aligns X num sents
                      np.ndarray[float, ndim=2] norms1,  # input
                      x_y_path,
                      alignment_types,
                      int width_over2):
    """
    Make features for DP, *for lines running across approximate path*, *for each alignment type*
    x_offsets, y_offsets should not include (0,1), (1,0)

    Basically, we take the feature matrix, rotate it 45 degress, 
       and compute a "wavy" matrix for the features.
    It's like the diagonal but it moves around to hopefully always include the true path.
    """

    cdef np.ndarray[int, ndim=2] x_y_path_ = np.array(x_y_path).astype(np.int32)

    assert (vecs0.shape[0] == norms0.shape[0])
    assert (vecs1.shape[0] == norms1.shape[0])

    assert (vecs0.shape[1] == norms0.shape[1])
    assert (vecs1.shape[1] == norms1.shape[1])

    # check how many overlaps vectors were passed in
    num_overlaps_in_vecs0 = vecs0.shape[0]
    num_overlaps_in_vecs1 = vecs1.shape[0]

    # check how many overlaps were requested
    # edge case: alignment_types could be empty
    #    In that case, we should just return insertions/deletions
    #    and max_x_overlap == max_y_overlap == 0
    max_x_overlap = max([0] + [x for x, y in alignment_types])  # add [0] in case alignment_types is empty
    max_y_overlap = max([0] + [y for x, y in alignment_types])  # add [0] in case alignment_types is empty

    # note: alignment types are specified 1-based, but vectors are stored 0-based
    if max_x_overlap > num_overlaps_in_vecs0:
        raise Exception('%d x overlaps requrested (via alignment_types), but vecs0 only has %d' % (
            max_x_overlap, num_overlaps_in_vecs0))
    if max_y_overlap > num_overlaps_in_vecs1:
        raise Exception('%d y overlaps requrested (via alignment_types), but vecs1 only has %d' % (
            max_y_overlap, num_overlaps_in_vecs1))

    # number of sentences in each document
    cdef int xsize = vecs0.shape[1]
    cdef int ysize = vecs1.shape[1]

    # vector diminsions should match
    assert (vecs0.shape[2] == vecs1.shape[2])

    cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
    x_offsets, y_offsets = make_x_y_offsets(alignment_types)

    # reserve outputs
    a_len = x_y_path_.shape[0]
    b_len = 2 * width_over2
    cdef np.ndarray[float, ndim=3] a_b_feats = np.empty((len(alignment_types), a_len, b_len), dtype=np.float32)
    cdef np.ndarray[int, ndim=1] b_offset = np.empty(a_len).astype(np.int32)

    cdef int x, y, aa, bb, xx, yy, a_idx, b_idx, bb2, x_offset, y_offset, ii_align, x_offset_idx, y_offset_idx
    cdef int vecsize = vecs0.shape[2]
    cdef int num_alignments = x_offsets.shape[0]

    cdef float sumx, feat
    cdef float inf = np.inf

    for ii in range(x_y_path_.shape[0]):
        x = x_y_path_[ii, 0]
        y = x_y_path_[ii, 1]

        # convert xy to ab cords
        aa = x + y
        bb = y

        a_idx = aa
        b_offset[aa] = bb - width_over2
        for b_idx, bb2 in enumerate(range(bb - width_over2, bb + width_over2)):
            # convert ab to xy cords
            xx = aa - bb2
            yy = bb2

            for ii_align in range(num_alignments):
                x_offset = x_offsets[ii_align]
                x_offset_idx = x_offset - 1  # overlaps start at 1, vectors stored 0-based
                y_offset = y_offsets[ii_align]
                y_offset_idx = y_offset - 1

                if 0 <= xx < xsize and 0 <= yy < ysize:
                    sumx = 0.0
                    for jj in range(vecsize):
                        sumx += vecs0[x_offset_idx, xx, jj] * vecs1[y_offset_idx, yy, jj]
                    feat = 2.0 * x_offset * y_offset * (1.0 - sumx) / (
                            1e-6 + norms0[x_offset_idx, xx] + norms1[y_offset_idx, yy])

                else:
                    feat = inf

                a_b_feats[ii_align, a_idx, b_idx] = feat

    return a_b_feats, b_offset


def sparse_dp(np.ndarray[float, ndim=3] a_b_costs,
              np.ndarray[int, ndim=1] b_offset_in,
              alignment_types,
              double del_penalty,
              int x_in_size,
              int y_in_size):
    """
    Do DP along a path, using features saved off along path.
    x_offsets, y_offsets should not include (0,1), (1,0)

    xsize, ysize refer to the costs a_b_csum, but in x/y space

    As in the simpler full-DP case, 
       we compute cumulative costs and backpointers on notes,
       and there are COSTS associated with moving between them.

    This means the size of the notes  +1,+1 larger (in x,y) than the COSTS.
    
    So the size of a_b_csum, a_b_xp, a_b_yp are all one larger in x and y compared to the costs

    In order to save memory (and time, vs a sparse matrix with hashes to look up values), let:
             a = x + y
             b = x - y

    b_offsets tells us how far from the left edge the features are computed for.
         basically it's like we are computing along the diagonal, 
         but we shift the diagonal around based on our belief
         about where the alignments are. 

    b_offsets is used for both costs AND csum, backpointers, so it needs to be 
        +2 longer (it is in the a-direction) than the costs (in the a direction)

    """
    cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
    x_offsets, y_offsets = make_x_y_offsets(alignment_types)

    # make x/y offsets, including (0,1), (1,), i.e. including deletion and insertion
    x_offsets = np.concatenate([x_offsets, np.array([0, 1], dtype=np.int32)])
    y_offsets = np.concatenate([y_offsets, np.array([1, 0], dtype=np.int32)])

    cdef int a_in_size = a_b_costs.shape[1]
    cdef int b_in_size = a_b_costs.shape[2]

    cdef int a_out_size = a_in_size + 2
    cdef int b_out_size = b_in_size

    cdef int x_out_size = x_in_size + 1
    cdef int y_out_size = y_in_size + 1

    # costs are the costs of going between nodes.
    # in x,y for the nodes, we basically add a buffer 
    #   at x=0 and y=0, and shift the cost by (x=+1,y=+1)
    # In a,b space, this means adding two points (for the buffer)
    #      at the beginning, and shifting by (a=+0,b=+1) since
    #      a=x+y and b=y
    # for the first two points, we can simply replicate the
    #    original b_offset, since it should be -width_over2
    # i.e. b_offset_in[0] == -width_over2
    extra_two_points = np.array([b_offset_in[0], b_offset_in[0]], dtype=np.int32)
    cdef np.ndarray[int, ndim=1] b_offset_out = np.concatenate([extra_two_points, b_offset_in + 1])

    # outputs
    # For anything being used in accumulation, use float64
    cdef np.ndarray[double, ndim=2] a_b_csum = np.zeros((a_in_size + 2, b_in_size),
                                                        dtype=np.float64) + np.inf  # error cumulative sum
    cdef np.ndarray[int, ndim=2] a_b_xp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2  # backpointer for x
    cdef np.ndarray[int, ndim=2] a_b_yp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2  # backpointer for y

    cdef int num_alignments = x_offsets.shape[0]
    cdef double inf = np.inf
    cdef int xx_out, yy_out, ii_align, x_offset, y_offset
    cdef int aa_in_cost, bb_in_cost, aa_out, bb_out, aa_out_prev, bb_out_prev, xx_in_cost, yy_in_cost, xx_out_prev, yy_out_prev

    cdef double alignment_cost, total_cost, prev_cost

    # increasing in a is the same as going along diagonals in x/y, so DP order works
    #  (and any ordering is fine in b - nothing depends on values adjacent on diagonal in x/y)
    for aa_out in range(a_in_size + 2):
        for bb_out in range(b_in_size):
            #xx_out, yy_out = ab2xy_w_offset(aa_out, bb_out, b_offset_out)
            yy_out = bb_out + b_offset_out[aa_out]
            xx_out = aa_out - yy_out

            # edge case: all deletions in y-direction
            if xx_out == 0 and 0 <= yy_out < y_out_size:
                a_b_csum[aa_out, bb_out] = del_penalty * yy_out
                a_b_xp[aa_out, bb_out] = 0
                a_b_yp[aa_out, bb_out] = 1

            # edge case: all deletions in x-direction
            elif yy_out == 0 and 0 <= xx_out < x_out_size:
                a_b_csum[aa_out, bb_out] = del_penalty * xx_out
                a_b_xp[aa_out, bb_out] = 1
                a_b_yp[aa_out, bb_out] = 0

            else:
                # initialize output to inf
                a_b_csum[aa_out, bb_out] = inf
                a_b_xp[aa_out, bb_out] = -42
                a_b_yp[aa_out, bb_out] = -42

                for ii_align in range(num_alignments):
                    x_offset = x_offsets[ii_align]
                    y_offset = y_offsets[ii_align]

                    # coords of location of alignment cost, in input x/y space
                    xx_in_cost = xx_out - 1  # features were front padded,
                    yy_in_cost = yy_out - 1  #   so offset is always 1

                    # the coords of location of previous cumsum cost, in input x/y space
                    xx_out_prev = xx_out - x_offset
                    yy_out_prev = yy_out - y_offset

                    if 0 <= xx_in_cost < x_in_size and 0 <= yy_in_cost < y_in_size and 0 <= xx_out_prev < x_out_size and 0 <= yy_out_prev < y_out_size:
                        # convert x,y to a,b
                        aa_in_cost = xx_in_cost + yy_in_cost
                        bb_in_cost = yy_in_cost - b_offset_in[aa_in_cost]

                        aa_out_prev = xx_out_prev + yy_out_prev
                        bb_out_prev = yy_out_prev - b_offset_out[aa_out_prev]

                        if 0 <= aa_in_cost < a_in_size and 0 <= bb_in_cost < b_in_size and 0 <= aa_out_prev < a_out_size and 0 <= bb_out_prev < b_out_size:
                            if x_offset == 0 or y_offset == 0:
                                alignment_cost = del_penalty
                            else:
                                alignment_cost = a_b_costs[ii_align, aa_in_cost, bb_in_cost]

                            prev_cost = a_b_csum[aa_out_prev, bb_out_prev]

                            total_cost = prev_cost + alignment_cost

                            if total_cost < a_b_csum[aa_out, bb_out]:
                                a_b_csum[aa_out, bb_out] = total_cost
                                a_b_xp[aa_out, bb_out] = x_offset
                                a_b_yp[aa_out, bb_out] = y_offset

    return a_b_csum, a_b_xp, a_b_yp, b_offset_out