# cython: language_level=3 """ Copyright 2019 Brian Thompson Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ import numpy as np cimport numpy as np cimport cython def make_x_y_offsets(alignment_types): # alignment types for which we will precompute costs # deletion/insertion is added later for x, y in alignment_types: assert (x > 0) assert (y > 0) x_offsets = np.array([x for x, y in alignment_types], dtype=np.int32) # MUST **NOT** INCLUDE (0,1), (1,0) y_offsets = np.array([y for x, y in alignment_types], dtype=np.int32) # MUST **NOT** INCLUDE (0,1), (1,0) return x_offsets, y_offsets def make_dense_costs(np.ndarray[float, ndim=3] vecs0, # itput np.ndarray[float, ndim=3] vecs1, # input np.ndarray[float, ndim=2] norm0, # input np.ndarray[float, ndim=2] norm1, # input int offset0 = 0, # index into vecs0/norms0 int offset1 = 0, # index into vecs1/norms1 ): """ Make a full N*M feature matrix. By default, makes 1-1 alignments, can build others by specifying offset0, offset1 to index into vecs0, norms0 and vecs1, norms1 respectivly. """ assert vecs0.shape[0] > offset0 assert vecs1.shape[0] > offset1 assert norm0.shape[0] > offset0 assert norm1.shape[0] > offset1 cdef int size0 = np.shape(vecs0)[1] assert norm0.shape[1] == size0 cdef int size1 = np.shape(vecs1)[1] assert norm1.shape[1] == size1 cdef int vecsize = np.shape(vecs0)[2] assert vecs1.shape[2] == vecsize cdef int xi, yi cdef float sumx cdef np.ndarray[float, ndim=2] costs = np.empty((size0, size1), dtype=np.float32) for xi in range(size0): for yi in range(size1): sumx = 0.0 for jj in range(vecsize): sumx += vecs0[offset0, xi, jj] * vecs1[offset1, yi, jj] costs[xi, yi] = 2.0 * (1.0 - sumx) / (1e-6 + norm0[offset0, xi] + norm1[offset1, yi]) # normalize by alignment type costs[xi, yi] = costs[xi, yi] * (offset0 + 1) * (offset1 + 1) return costs def dense_dp(np.ndarray[float, ndim=2] alignment_cost, float pen): """ Compute cost matrix (csum) and backpointers (bp) from full 2-D 1-1 alignment costs matrix (alignment_cost) """ size0 = alignment_cost.shape[0] size1 = alignment_cost.shape[1] # csum and traceback matrix are both on nodes # so they are +1 in each dimension compared to the jump costs matrix # For anything being used in accumulation, use float64 cdef np.ndarray[double, ndim=2] csum = np.empty((size0 + 1, size1 + 1), dtype=np.float64) cdef np.ndarray[int, ndim=2] bp = np.empty((size0 + 1, size1 + 1), dtype=np.int32) # bp and csum are nodes, # while alignment_cost is the cost of going between the nodes # Size of nodes should be one larger than alignment costs b0, b1 = np.shape(bp) c0, c1 = np.shape(csum) j0, j1 = np.shape(alignment_cost) assert (b0 == c0 == j0 + 1) assert (b1 == c1 == j1 + 1) cdef int cmax = np.shape(csum)[1] cdef int rmax = np.shape(csum)[0] cdef int c, r cdef double cost0, cost1, cost2 # initialize the all c-direction deletion path for c in range(cmax): csum[0, c] = c * pen bp[0, c] = 1 # initialize the all r-direction deletion path for r in range(rmax): csum[r, 0] = r * pen bp[r, 0] = 2 # Initial cost is 0.0 csum[0, 0] = 0.0 # noop bp[0, 0] = 4 # should not matter # Calculate the rest recursively for c in range(1, cmax): for r in range(1, rmax): # alignment_cost indexes are off by 1 wrt # csum/bp, since csum/bp are nodes cost0 = csum[r - 1, c - 1] + alignment_cost[r - 1, c - 1] cost1 = csum[r, c - 1] + pen cost2 = csum[r - 1, c] + pen csum[r, c] = cost0 bp[r, c] = 0 if cost1 < csum[r, c]: csum[r, c] = cost1 bp[r, c] = 1 if cost2 < csum[r, c]: csum[r, c] = cost2 bp[r, c] = 2 return csum, bp def score_path(np.ndarray[int, ndim=1] xx, np.ndarray[int, ndim=1] yy, np.ndarray[float, ndim=1] norm1, np.ndarray[float, ndim=1] norm2, np.ndarray[float, ndim=2] vecs1, np.ndarray[float, ndim=2] vecs2, np.ndarray[float, ndim=1] out): cdef int xi, yi, ii, jj cdef float outx cdef int lenxy = xx.shape[0] cdef int vecsize = vecs1.shape[1] for ii in range(lenxy): xi = xx[ii] yi = yy[ii] outx = 0.0 for jj in range(vecsize): outx += vecs1[xi, jj] * vecs2[yi, jj] out[ii] = 2.0 * (1.0 - outx) / (norm1[xi] + norm2[yi]) # Bounds checking and wraparound slow things down by about 2x # Division by 0 checking has minimal speed impact @cython.boundscheck(False) # turn off bounds-checking for entire function @cython.wraparound(False) # turn off negative index wrapping for entire function @cython.cdivision(True) # use c-style division (no division-by-zero check) def make_sparse_costs(np.ndarray[float, ndim=3] vecs0, # intput: num aligns X num sents X dim np.ndarray[float, ndim=3] vecs1, # input np.ndarray[float, ndim=2] norms0, # intput: num aligns X num sents np.ndarray[float, ndim=2] norms1, # input x_y_path, alignment_types, int width_over2): """ Make features for DP, *for lines running across approximate path*, *for each alignment type* x_offsets, y_offsets should not include (0,1), (1,0) Basically, we take the feature matrix, rotate it 45 degress, and compute a "wavy" matrix for the features. It's like the diagonal but it moves around to hopefully always include the true path. """ cdef np.ndarray[int, ndim=2] x_y_path_ = np.array(x_y_path).astype(np.int32) assert (vecs0.shape[0] == norms0.shape[0]) assert (vecs1.shape[0] == norms1.shape[0]) assert (vecs0.shape[1] == norms0.shape[1]) assert (vecs1.shape[1] == norms1.shape[1]) # check how many overlaps vectors were passed in num_overlaps_in_vecs0 = vecs0.shape[0] num_overlaps_in_vecs1 = vecs1.shape[0] # check how many overlaps were requested # edge case: alignment_types could be empty # In that case, we should just return insertions/deletions # and max_x_overlap == max_y_overlap == 0 max_x_overlap = max([0] + [x for x, y in alignment_types]) # add [0] in case alignment_types is empty max_y_overlap = max([0] + [y for x, y in alignment_types]) # add [0] in case alignment_types is empty # note: alignment types are specified 1-based, but vectors are stored 0-based if max_x_overlap > num_overlaps_in_vecs0: raise Exception('%d x overlaps requrested (via alignment_types), but vecs0 only has %d' % ( max_x_overlap, num_overlaps_in_vecs0)) if max_y_overlap > num_overlaps_in_vecs1: raise Exception('%d y overlaps requrested (via alignment_types), but vecs1 only has %d' % ( max_y_overlap, num_overlaps_in_vecs1)) # number of sentences in each document cdef int xsize = vecs0.shape[1] cdef int ysize = vecs1.shape[1] # vector diminsions should match assert (vecs0.shape[2] == vecs1.shape[2]) cdef np.ndarray[int, ndim=1] x_offsets, y_offsets x_offsets, y_offsets = make_x_y_offsets(alignment_types) # reserve outputs a_len = x_y_path_.shape[0] b_len = 2 * width_over2 cdef np.ndarray[float, ndim=3] a_b_feats = np.empty((len(alignment_types), a_len, b_len), dtype=np.float32) cdef np.ndarray[int, ndim=1] b_offset = np.empty(a_len).astype(np.int32) cdef int x, y, aa, bb, xx, yy, a_idx, b_idx, bb2, x_offset, y_offset, ii_align, x_offset_idx, y_offset_idx cdef int vecsize = vecs0.shape[2] cdef int num_alignments = x_offsets.shape[0] cdef float sumx, feat cdef float inf = np.inf for ii in range(x_y_path_.shape[0]): x = x_y_path_[ii, 0] y = x_y_path_[ii, 1] # convert xy to ab cords aa = x + y bb = y a_idx = aa b_offset[aa] = bb - width_over2 for b_idx, bb2 in enumerate(range(bb - width_over2, bb + width_over2)): # convert ab to xy cords xx = aa - bb2 yy = bb2 for ii_align in range(num_alignments): x_offset = x_offsets[ii_align] x_offset_idx = x_offset - 1 # overlaps start at 1, vectors stored 0-based y_offset = y_offsets[ii_align] y_offset_idx = y_offset - 1 if 0 <= xx < xsize and 0 <= yy < ysize: sumx = 0.0 for jj in range(vecsize): sumx += vecs0[x_offset_idx, xx, jj] * vecs1[y_offset_idx, yy, jj] feat = 2.0 * x_offset * y_offset * (1.0 - sumx) / ( 1e-6 + norms0[x_offset_idx, xx] + norms1[y_offset_idx, yy]) else: feat = inf a_b_feats[ii_align, a_idx, b_idx] = feat return a_b_feats, b_offset def sparse_dp(np.ndarray[float, ndim=3] a_b_costs, np.ndarray[int, ndim=1] b_offset_in, alignment_types, double del_penalty, int x_in_size, int y_in_size): """ Do DP along a path, using features saved off along path. x_offsets, y_offsets should not include (0,1), (1,0) xsize, ysize refer to the costs a_b_csum, but in x/y space As in the simpler full-DP case, we compute cumulative costs and backpointers on notes, and there are COSTS associated with moving between them. This means the size of the notes +1,+1 larger (in x,y) than the COSTS. So the size of a_b_csum, a_b_xp, a_b_yp are all one larger in x and y compared to the costs In order to save memory (and time, vs a sparse matrix with hashes to look up values), let: a = x + y b = x - y b_offsets tells us how far from the left edge the features are computed for. basically it's like we are computing along the diagonal, but we shift the diagonal around based on our belief about where the alignments are. b_offsets is used for both costs AND csum, backpointers, so it needs to be +2 longer (it is in the a-direction) than the costs (in the a direction) """ cdef np.ndarray[int, ndim=1] x_offsets, y_offsets x_offsets, y_offsets = make_x_y_offsets(alignment_types) # make x/y offsets, including (0,1), (1,), i.e. including deletion and insertion x_offsets = np.concatenate([x_offsets, np.array([0, 1], dtype=np.int32)]) y_offsets = np.concatenate([y_offsets, np.array([1, 0], dtype=np.int32)]) cdef int a_in_size = a_b_costs.shape[1] cdef int b_in_size = a_b_costs.shape[2] cdef int a_out_size = a_in_size + 2 cdef int b_out_size = b_in_size cdef int x_out_size = x_in_size + 1 cdef int y_out_size = y_in_size + 1 # costs are the costs of going between nodes. # in x,y for the nodes, we basically add a buffer # at x=0 and y=0, and shift the cost by (x=+1,y=+1) # In a,b space, this means adding two points (for the buffer) # at the beginning, and shifting by (a=+0,b=+1) since # a=x+y and b=y # for the first two points, we can simply replicate the # original b_offset, since it should be -width_over2 # i.e. b_offset_in[0] == -width_over2 extra_two_points = np.array([b_offset_in[0], b_offset_in[0]], dtype=np.int32) cdef np.ndarray[int, ndim=1] b_offset_out = np.concatenate([extra_two_points, b_offset_in + 1]) # outputs # For anything being used in accumulation, use float64 cdef np.ndarray[double, ndim=2] a_b_csum = np.zeros((a_in_size + 2, b_in_size), dtype=np.float64) + np.inf # error cumulative sum cdef np.ndarray[int, ndim=2] a_b_xp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for x cdef np.ndarray[int, ndim=2] a_b_yp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for y cdef int num_alignments = x_offsets.shape[0] cdef double inf = np.inf cdef int xx_out, yy_out, ii_align, x_offset, y_offset cdef int aa_in_cost, bb_in_cost, aa_out, bb_out, aa_out_prev, bb_out_prev, xx_in_cost, yy_in_cost, xx_out_prev, yy_out_prev cdef double alignment_cost, total_cost, prev_cost # increasing in a is the same as going along diagonals in x/y, so DP order works # (and any ordering is fine in b - nothing depends on values adjacent on diagonal in x/y) for aa_out in range(a_in_size + 2): for bb_out in range(b_in_size): #xx_out, yy_out = ab2xy_w_offset(aa_out, bb_out, b_offset_out) yy_out = bb_out + b_offset_out[aa_out] xx_out = aa_out - yy_out # edge case: all deletions in y-direction if xx_out == 0 and 0 <= yy_out < y_out_size: a_b_csum[aa_out, bb_out] = del_penalty * yy_out a_b_xp[aa_out, bb_out] = 0 a_b_yp[aa_out, bb_out] = 1 # edge case: all deletions in x-direction elif yy_out == 0 and 0 <= xx_out < x_out_size: a_b_csum[aa_out, bb_out] = del_penalty * xx_out a_b_xp[aa_out, bb_out] = 1 a_b_yp[aa_out, bb_out] = 0 else: # initialize output to inf a_b_csum[aa_out, bb_out] = inf a_b_xp[aa_out, bb_out] = -42 a_b_yp[aa_out, bb_out] = -42 for ii_align in range(num_alignments): x_offset = x_offsets[ii_align] y_offset = y_offsets[ii_align] # coords of location of alignment cost, in input x/y space xx_in_cost = xx_out - 1 # features were front padded, yy_in_cost = yy_out - 1 # so offset is always 1 # the coords of location of previous cumsum cost, in input x/y space xx_out_prev = xx_out - x_offset yy_out_prev = yy_out - y_offset if 0 <= xx_in_cost < x_in_size and 0 <= yy_in_cost < y_in_size and 0 <= xx_out_prev < x_out_size and 0 <= yy_out_prev < y_out_size: # convert x,y to a,b aa_in_cost = xx_in_cost + yy_in_cost bb_in_cost = yy_in_cost - b_offset_in[aa_in_cost] aa_out_prev = xx_out_prev + yy_out_prev bb_out_prev = yy_out_prev - b_offset_out[aa_out_prev] if 0 <= aa_in_cost < a_in_size and 0 <= bb_in_cost < b_in_size and 0 <= aa_out_prev < a_out_size and 0 <= bb_out_prev < b_out_size: if x_offset == 0 or y_offset == 0: alignment_cost = del_penalty else: alignment_cost = a_b_costs[ii_align, aa_in_cost, bb_in_cost] prev_cost = a_b_csum[aa_out_prev, bb_out_prev] total_cost = prev_cost + alignment_cost if total_cost < a_b_csum[aa_out, bb_out]: a_b_csum[aa_out, bb_out] = total_cost a_b_xp[aa_out, bb_out] = x_offset a_b_yp[aa_out, bb_out] = y_offset return a_b_csum, a_b_xp, a_b_yp, b_offset_out