Bertalign
This commit is contained in:
411 bin/vecalign/dp_core.pyx Normal file
@@ -0,0 +1,411 @@
# cython: language_level=3

"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import numpy as np

cimport numpy as np
cimport cython


def make_x_y_offsets(alignment_types):
    # alignment types for which we will precompute costs

    # deletion/insertion is added later
    for x, y in alignment_types:
        assert (x > 0)
        assert (y > 0)

    x_offsets = np.array([x for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    y_offsets = np.array([y for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    return x_offsets, y_offsets
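
# For example (illustrative values): alignment_types = [(1, 1), (1, 2), (2, 1)]
# yields x_offsets == np.array([1, 1, 2], dtype=np.int32) and
# y_offsets == np.array([1, 2, 1], dtype=np.int32).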


def make_dense_costs(np.ndarray[float, ndim=3] vecs0,  # input
                     np.ndarray[float, ndim=3] vecs1,  # input
                     np.ndarray[float, ndim=2] norm0,  # input
                     np.ndarray[float, ndim=2] norm1,  # input
                     int offset0 = 0,  # index into vecs0/norms0
                     int offset1 = 0,  # index into vecs1/norms1
                     ):
    """
    Make a full N*M feature matrix. By default, makes 1-1 alignments;
    other alignment types can be built by specifying offset0, offset1 to index into
    vecs0, norms0 and vecs1, norms1 respectively.
    """
    assert vecs0.shape[0] > offset0
    assert vecs1.shape[0] > offset1
    assert norm0.shape[0] > offset0
    assert norm1.shape[0] > offset1

    cdef int size0 = np.shape(vecs0)[1]
    assert norm0.shape[1] == size0

    cdef int size1 = np.shape(vecs1)[1]
    assert norm1.shape[1] == size1

    cdef int vecsize = np.shape(vecs0)[2]
    assert vecs1.shape[2] == vecsize

    cdef int xi, yi
    cdef float sumx

    cdef np.ndarray[float, ndim=2] costs = np.empty((size0, size1), dtype=np.float32)

    for xi in range(size0):
        for yi in range(size1):
            sumx = 0.0
            for jj in range(vecsize):
                sumx += vecs0[offset0, xi, jj] * vecs1[offset1, yi, jj]

            costs[xi, yi] = 2.0 * (1.0 - sumx) / (1e-6 + norm0[offset0, xi] + norm1[offset1, yi])
            # normalize by alignment type
            costs[xi, yi] = costs[xi, yi] * (offset0 + 1) * (offset1 + 1)

    return costs
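
# Minimal usage sketch (shapes and the penalty value are illustrative, not
# prescribed by this file): with overlap vectors of shape
# (num_overlaps, num_sents, dim) and norms of shape (num_overlaps, num_sents),
#   costs = make_dense_costs(vecs0, vecs1, norm0, norm1)  # (N, M) float32
#   csum, bp = dense_dp(costs, 0.2)
# gives the cumulative cost matrix and backpointers for a full 1-1 DP pass.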


def dense_dp(np.ndarray[float, ndim=2] alignment_cost, float pen):
    """
    Compute cost matrix (csum) and backpointers (bp)
    from full 2-D 1-1 alignment costs matrix (alignment_cost)
    """

    size0 = alignment_cost.shape[0]
    size1 = alignment_cost.shape[1]
    # csum and traceback matrix are both on nodes,
    # so they are +1 in each dimension compared to the jump costs matrix
    # For anything being used in accumulation, use float64
    cdef np.ndarray[double, ndim=2] csum = np.empty((size0 + 1, size1 + 1), dtype=np.float64)
    cdef np.ndarray[int, ndim=2] bp = np.empty((size0 + 1, size1 + 1), dtype=np.int32)

    # bp and csum are nodes,
    # while alignment_cost is the cost of going between the nodes.
    # Size of nodes should be one larger than alignment costs.
    b0, b1 = np.shape(bp)
    c0, c1 = np.shape(csum)
    j0, j1 = np.shape(alignment_cost)
    assert (b0 == c0 == j0 + 1)
    assert (b1 == c1 == j1 + 1)

    cdef int cmax = np.shape(csum)[1]
    cdef int rmax = np.shape(csum)[0]
    cdef int c, r
    cdef double cost0, cost1, cost2

    # initialize the all c-direction deletion path
    for c in range(cmax):
        csum[0, c] = c * pen
        bp[0, c] = 1

    # initialize the all r-direction deletion path
    for r in range(rmax):
        csum[r, 0] = r * pen
        bp[r, 0] = 2

    # Initial cost is 0.0
    csum[0, 0] = 0.0  # noop
    bp[0, 0] = 4  # should not matter

    # Calculate the rest recursively
    for c in range(1, cmax):
        for r in range(1, rmax):

            # alignment_cost indexes are off by 1 wrt
            # csum/bp, since csum/bp are nodes
            cost0 = csum[r - 1, c - 1] + alignment_cost[r - 1, c - 1]
            cost1 = csum[r, c - 1] + pen
            cost2 = csum[r - 1, c] + pen

            csum[r, c] = cost0
            bp[r, c] = 0

            if cost1 < csum[r, c]:
                csum[r, c] = cost1
                bp[r, c] = 1
            if cost2 < csum[r, c]:
                csum[r, c] = cost2
                bp[r, c] = 2

    return csum, bp
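
# Backpointer codes set above: 0 = diagonal step from (r-1, c-1), 1 = step in c
# only, 2 = step in r only.  A caller-side traceback (sketch, not defined in this
# file) would start at the last node and follow bp back to (0, 0), roughly:
#   r, c = csum.shape[0] - 1, csum.shape[1] - 1
#   while r > 0 or c > 0:
#       if bp[r, c] == 0: r, c = r - 1, c - 1
#       elif bp[r, c] == 1: c -= 1
#       else: r -= 1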


def score_path(np.ndarray[int, ndim=1] xx,
               np.ndarray[int, ndim=1] yy,
               np.ndarray[float, ndim=1] norm1,
               np.ndarray[float, ndim=1] norm2,
               np.ndarray[float, ndim=2] vecs1,
               np.ndarray[float, ndim=2] vecs2,
               np.ndarray[float, ndim=1] out):
    cdef int xi, yi, ii, jj
    cdef float outx
    cdef int lenxy = xx.shape[0]
    cdef int vecsize = vecs1.shape[1]

    for ii in range(lenxy):
        xi = xx[ii]
        yi = yy[ii]
        outx = 0.0
        for jj in range(vecsize):
            outx += vecs1[xi, jj] * vecs2[yi, jj]
        out[ii] = 2.0 * (1.0 - outx) / (norm1[xi] + norm2[yi])
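
# Caller-side sketch (illustrative): out must be preallocated, e.g.
#   out = np.empty(len(xx), dtype=np.float32)
#   score_path(xx, yy, norm1, norm2, vecs1, vecs2, out)
# after which out[ii] holds
# 2 * (1 - dot(vecs1[xx[ii]], vecs2[yy[ii]])) / (norm1[xx[ii]] + norm2[yy[ii]]).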


# Bounds checking and wraparound slow things down by about 2x
# Division by 0 checking has minimal speed impact
@cython.boundscheck(False)  # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.cdivision(True)  # use c-style division (no division-by-zero check)
def make_sparse_costs(np.ndarray[float, ndim=3] vecs0,  # input: num aligns X num sents X dim
                      np.ndarray[float, ndim=3] vecs1,  # input
                      np.ndarray[float, ndim=2] norms0,  # input: num aligns X num sents
                      np.ndarray[float, ndim=2] norms1,  # input
                      x_y_path,
                      alignment_types,
                      int width_over2):
    """
    Make features for DP, *for lines running across approximate path*, *for each alignment type*
    x_offsets, y_offsets should not include (0,1), (1,0)

    Basically, we take the feature matrix, rotate it 45 degrees,
    and compute a "wavy" matrix for the features.
    It's like the diagonal, but it moves around to hopefully always include the true path.
    """

    cdef np.ndarray[int, ndim=2] x_y_path_ = np.array(x_y_path).astype(np.int32)

    assert (vecs0.shape[0] == norms0.shape[0])
    assert (vecs1.shape[0] == norms1.shape[0])

    assert (vecs0.shape[1] == norms0.shape[1])
    assert (vecs1.shape[1] == norms1.shape[1])

    # check how many overlap vectors were passed in
    num_overlaps_in_vecs0 = vecs0.shape[0]
    num_overlaps_in_vecs1 = vecs1.shape[0]

    # check how many overlaps were requested
    # edge case: alignment_types could be empty
    # In that case, we should just return insertions/deletions
    # and max_x_overlap == max_y_overlap == 0
    max_x_overlap = max([0] + [x for x, y in alignment_types])  # add [0] in case alignment_types is empty
    max_y_overlap = max([0] + [y for x, y in alignment_types])  # add [0] in case alignment_types is empty

    # note: alignment types are specified 1-based, but vectors are stored 0-based
    if max_x_overlap > num_overlaps_in_vecs0:
        raise Exception('%d x overlaps requested (via alignment_types), but vecs0 only has %d' % (
            max_x_overlap, num_overlaps_in_vecs0))
    if max_y_overlap > num_overlaps_in_vecs1:
        raise Exception('%d y overlaps requested (via alignment_types), but vecs1 only has %d' % (
            max_y_overlap, num_overlaps_in_vecs1))

    # number of sentences in each document
    cdef int xsize = vecs0.shape[1]
    cdef int ysize = vecs1.shape[1]

    # vector dimensions should match
    assert (vecs0.shape[2] == vecs1.shape[2])

    cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
    x_offsets, y_offsets = make_x_y_offsets(alignment_types)

    # reserve outputs
    a_len = x_y_path_.shape[0]
    b_len = 2 * width_over2
    cdef np.ndarray[float, ndim=3] a_b_feats = np.empty((len(alignment_types), a_len, b_len), dtype=np.float32)
    cdef np.ndarray[int, ndim=1] b_offset = np.empty(a_len).astype(np.int32)

    cdef int x, y, aa, bb, xx, yy, a_idx, b_idx, bb2, x_offset, y_offset, ii_align, x_offset_idx, y_offset_idx
    cdef int vecsize = vecs0.shape[2]
    cdef int num_alignments = x_offsets.shape[0]

    cdef float sumx, feat
    cdef float inf = np.inf

    for ii in range(x_y_path_.shape[0]):
        x = x_y_path_[ii, 0]
        y = x_y_path_[ii, 1]

        # convert xy to ab coords
        aa = x + y
        bb = y

        a_idx = aa
        b_offset[aa] = bb - width_over2
        for b_idx, bb2 in enumerate(range(bb - width_over2, bb + width_over2)):
            # convert ab to xy coords
            xx = aa - bb2
            yy = bb2

            for ii_align in range(num_alignments):
                x_offset = x_offsets[ii_align]
                x_offset_idx = x_offset - 1  # overlaps start at 1, vectors stored 0-based
                y_offset = y_offsets[ii_align]
                y_offset_idx = y_offset - 1

                if 0 <= xx < xsize and 0 <= yy < ysize:
                    sumx = 0.0
                    for jj in range(vecsize):
                        sumx += vecs0[x_offset_idx, xx, jj] * vecs1[y_offset_idx, yy, jj]
                    feat = 2.0 * x_offset * y_offset * (1.0 - sumx) / (
                            1e-6 + norms0[x_offset_idx, xx] + norms1[y_offset_idx, yy])

                else:
                    feat = inf

                a_b_feats[ii_align, a_idx, b_idx] = feat

    return a_b_feats, b_offset
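
# Worked coordinate example (numbers are illustrative): for a path point
# (x, y) = (10, 12) with width_over2 = 3, we get aa = 22, b_offset[22] = 9, and
# the band covers bb2 = 9..14 (b_len = 6), i.e. the candidate pairs
# (xx, yy) = (13, 9), (12, 10), ..., (8, 14) along the anti-diagonal x + y = 22.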


def sparse_dp(np.ndarray[float, ndim=3] a_b_costs,
              np.ndarray[int, ndim=1] b_offset_in,
              alignment_types,
              double del_penalty,
              int x_in_size,
              int y_in_size):
    """
    Do DP along a path, using features saved off along that path.
    x_offsets, y_offsets should not include (0,1), (1,0)

    x_in_size, y_in_size refer to the costs (a_b_costs), but in x/y space

    As in the simpler full-DP case,
    we compute cumulative costs and backpointers on nodes,
    and there are COSTS associated with moving between them.

    This means the nodes are +1,+1 larger (in x,y) than the COSTS,
    so a_b_csum, a_b_xp, a_b_yp are all one larger in x and y than the costs.

    In order to save memory (and time, vs a sparse matrix with hashes to look up values), let:
        a = x + y
        b = y (stored relative to b_offset)

    b_offset tells us how far from the left edge the features were computed:
    basically we are computing along the diagonal,
    but we shift the diagonal around based on our belief
    about where the alignments are.

    b_offset is used for the costs AND for csum/backpointers, so it needs to be
    +2 longer in the a-direction than the costs.
    """
    cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
    x_offsets, y_offsets = make_x_y_offsets(alignment_types)

    # make x/y offsets, including (0,1), (1,0), i.e. including deletion and insertion
    x_offsets = np.concatenate([x_offsets, np.array([0, 1], dtype=np.int32)])
    y_offsets = np.concatenate([y_offsets, np.array([1, 0], dtype=np.int32)])

    cdef int a_in_size = a_b_costs.shape[1]
    cdef int b_in_size = a_b_costs.shape[2]

    cdef int a_out_size = a_in_size + 2
    cdef int b_out_size = b_in_size

    cdef int x_out_size = x_in_size + 1
    cdef int y_out_size = y_in_size + 1

    # costs are the costs of going between nodes.
    # in x,y for the nodes, we basically add a buffer
    # at x=0 and y=0, and shift the cost by (x=+1,y=+1)
    # In a,b space, this means adding two points (for the buffer)
    # at the beginning, and shifting by (a=+0,b=+1) since
    # a=x+y and b=y
    # for the first two points, we can simply replicate the
    # original b_offset, since it should be -width_over2
    # i.e. b_offset_in[0] == -width_over2
    extra_two_points = np.array([b_offset_in[0], b_offset_in[0]], dtype=np.int32)
    cdef np.ndarray[int, ndim=1] b_offset_out = np.concatenate([extra_two_points, b_offset_in + 1])

    # outputs
    # For anything being used in accumulation, use float64
    cdef np.ndarray[double, ndim=2] a_b_csum = np.zeros((a_in_size + 2, b_in_size),
                                                        dtype=np.float64) + np.inf  # error cumulative sum
    cdef np.ndarray[int, ndim=2] a_b_xp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2  # backpointer for x
    cdef np.ndarray[int, ndim=2] a_b_yp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2  # backpointer for y

    cdef int num_alignments = x_offsets.shape[0]
    cdef double inf = np.inf
    cdef int xx_out, yy_out, ii_align, x_offset, y_offset
    cdef int aa_in_cost, bb_in_cost, aa_out, bb_out, aa_out_prev, bb_out_prev, xx_in_cost, yy_in_cost, xx_out_prev, yy_out_prev

    cdef double alignment_cost, total_cost, prev_cost

    # increasing in a is the same as going along diagonals in x/y, so DP order works
    # (and any ordering is fine in b - nothing depends on values adjacent on diagonal in x/y)
    for aa_out in range(a_in_size + 2):
        for bb_out in range(b_in_size):
            # xx_out, yy_out = ab2xy_w_offset(aa_out, bb_out, b_offset_out)
            yy_out = bb_out + b_offset_out[aa_out]
            xx_out = aa_out - yy_out

            # edge case: all deletions in y-direction
            if xx_out == 0 and 0 <= yy_out < y_out_size:
                a_b_csum[aa_out, bb_out] = del_penalty * yy_out
                a_b_xp[aa_out, bb_out] = 0
                a_b_yp[aa_out, bb_out] = 1

            # edge case: all deletions in x-direction
            elif yy_out == 0 and 0 <= xx_out < x_out_size:
                a_b_csum[aa_out, bb_out] = del_penalty * xx_out
                a_b_xp[aa_out, bb_out] = 1
                a_b_yp[aa_out, bb_out] = 0

            else:
                # initialize output to inf
                a_b_csum[aa_out, bb_out] = inf
                a_b_xp[aa_out, bb_out] = -42
                a_b_yp[aa_out, bb_out] = -42

                for ii_align in range(num_alignments):
                    x_offset = x_offsets[ii_align]
                    y_offset = y_offsets[ii_align]

                    # coords of location of alignment cost, in input x/y space
                    xx_in_cost = xx_out - 1  # features were front padded,
                    yy_in_cost = yy_out - 1  # so offset is always 1

                    # the coords of location of previous cumsum cost, in input x/y space
                    xx_out_prev = xx_out - x_offset
                    yy_out_prev = yy_out - y_offset

                    if 0 <= xx_in_cost < x_in_size and 0 <= yy_in_cost < y_in_size and 0 <= xx_out_prev < x_out_size and 0 <= yy_out_prev < y_out_size:
                        # convert x,y to a,b
                        aa_in_cost = xx_in_cost + yy_in_cost
                        bb_in_cost = yy_in_cost - b_offset_in[aa_in_cost]

                        aa_out_prev = xx_out_prev + yy_out_prev
                        bb_out_prev = yy_out_prev - b_offset_out[aa_out_prev]

                        if 0 <= aa_in_cost < a_in_size and 0 <= bb_in_cost < b_in_size and 0 <= aa_out_prev < a_out_size and 0 <= bb_out_prev < b_out_size:
                            if x_offset == 0 or y_offset == 0:
                                alignment_cost = del_penalty
                            else:
                                alignment_cost = a_b_costs[ii_align, aa_in_cost, bb_in_cost]

                            prev_cost = a_b_csum[aa_out_prev, bb_out_prev]

                            total_cost = prev_cost + alignment_cost

                            if total_cost < a_b_csum[aa_out, bb_out]:
                                a_b_csum[aa_out, bb_out] = total_cost
                                a_b_xp[aa_out, bb_out] = x_offset
                                a_b_yp[aa_out, bb_out] = y_offset

    return a_b_csum, a_b_xp, a_b_yp, b_offset_out
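
# Caller-side traceback sketch (not defined in this file): starting from the
# final node (xx, yy) = (x_in_size, y_in_size), convert to band coordinates via
#   aa = xx + yy; bb = yy - b_offset_out[aa]
# read the step (a_b_xp[aa, bb], a_b_yp[aa, bb]), record it, move to
# (xx - x_offset, yy - y_offset), and repeat until (0, 0) is reached.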