Bertalign

bin/bert_align.py (new file, 400 lines)
@@ -0,0 +1,400 @@
import argparse
import logging
import os
import sys
import time

import numpy as np
import numba as nb
import faiss

# logger is used by doc2feats() below when an overlap line has no embedding
logger = logging.getLogger(__name__)


def _main():
    # user-defined parameters
    parser = argparse.ArgumentParser('Multilingual sentence alignment using BERT embeddings',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--job', type=str, required=True, help='Job file for alignment task.')
    parser.add_argument('--src_embed', type=str, required=True, nargs=2, help='Source overlap and embedding files.')
    parser.add_argument('--tgt_embed', type=str, required=True, nargs=2, help='Target overlap and embedding files.')
    parser.add_argument('--max_align', type=int, default=5, help='Maximum alignment types, n + m <= this value.')
    parser.add_argument('--win', type=int, default=5, help='Window size for the second-pass alignment.')
    parser.add_argument('--top_k', type=int, default=3, help='Top-k target neighbors of each source sentence.')
    parser.add_argument('--skip', type=float, default=-0.1, help='Similarity score for 0-1 and 1-0 alignment.')
    parser.add_argument('--margin', action='store_true', help='Margin-based cosine similarity.')
    args = parser.parse_args()

    # fixed parameters that determine the
    # window size for the first-pass alignment
    min_win_size = 10
    max_win_size = 600
    win_per_100 = 8

    # read in embeddings
    src_sent2line, src_line_embeddings = read_in_embeddings(args.src_embed[0], args.src_embed[1])
    tgt_sent2line, tgt_line_embeddings = read_in_embeddings(args.tgt_embed[0], args.tgt_embed[1])
    embedding_size = src_line_embeddings.shape[1]

    # read in alignment jobs
    job = read_job(args.job)
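    # Editorial note (inferred from read_job() and the split("\t") below; the
    # paths are only illustrative): each non-comment line of the job file holds
    # three tab-separated paths, e.g.
    #   data/zh/001.txt<TAB>data/en/001.txt<TAB>align/001.align
    # Lines beginning with "#" are ignored.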

    # start alignment
    for rec in job:
        src_file, tgt_file, align_file = rec.split("\t")
        print("Aligning {} to {}".format(src_file, tgt_file))

        # read in source and target sentences
        src_lines = open(src_file, 'rt', encoding="utf-8").readlines()
        tgt_lines = open(tgt_file, 'rt', encoding="utf-8").readlines()

        # convert source and target texts into embeddings
        # and calculate sentence length
        t_0 = time.time()
        src_vecs, src_lens = doc2feats(src_sent2line, src_line_embeddings, src_lines, args.max_align - 1)
        tgt_vecs, tgt_lens = doc2feats(tgt_sent2line, tgt_line_embeddings, tgt_lines, args.max_align - 1)
        char_ratio = np.sum(src_lens[0,]) / np.sum(tgt_lens[0,])
        print("Reading embeddings takes {}".format(time.time() - t_0))

        # using faiss, find in the target text
        # the k nearest neighbors of each source sentence
        #index = faiss.IndexFlatIP(embedding_size)  # use inner product to build index
        t_1 = time.time()
        #index.add(tgt_vecs[0,:])
        #xq = src_vecs[0,:]
        #D, I = index.search(xq, args.top_k)

        res = faiss.StandardGpuResources()
        index = faiss.IndexFlatIP(embedding_size)
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        gpu_index.add(tgt_vecs[0,:])
        xq = src_vecs[0,:]
        D, I = gpu_index.search(xq, args.top_k)
        print("Finding top-k neighbors takes {}".format(time.time() - t_1))

        # find 1-to-1 alignment
        t_2 = time.time()
        src_len = len(src_lines)
        tgt_len = len(tgt_lines)
        first_alignment_types = make_alignment_types(2)  # 0-1, 1-0 and 1-1
        first_w, first_search_path = find_first_search_path(src_len, tgt_len, min_win_size, max_win_size, win_per_100)
        first_pointers = first_pass_align(src_len, tgt_len, first_w, first_search_path, first_alignment_types, D, I, args.top_k)
        first_alignment = first_back_track(src_len, tgt_len, first_pointers, first_search_path, first_alignment_types)
        print("First pass alignment takes {}".format(time.time() - t_2))

        # find m-to-n alignment
        t_3 = time.time()
        second_w, second_search_path = find_second_search_path(first_alignment, args.win, src_len, tgt_len)
        second_alignment_types = make_alignment_types(args.max_align)
        second_pointers = second_pass_align(src_vecs, tgt_vecs, src_lens, tgt_lens, second_w, second_search_path, second_alignment_types, char_ratio, args.skip, margin=args.margin)
        second_alignment = second_back_track(src_len, tgt_len, second_pointers, second_search_path, second_alignment_types)
        print("Second pass alignment takes {}".format(time.time() - t_3))

        # save alignment
        out_f = open(align_file, 'w', encoding="utf-8")
        print_alignments(second_alignment, file=out_f)


def print_alignments(alignments, file=sys.stdout):
    for x, y in alignments:
        print('%s:%s' % (x, y), file=file)


def second_back_track(i, j, b, search_path, a_types):
    alignment = []
    while i != 0 and j != 0:
        j_offset = j - search_path[i][0]
        a = b[i][j_offset]
        s = a_types[a][0]
        t = a_types[a][1]
        src_range = [i - offset - 1 for offset in range(s)][::-1]
        tgt_range = [j - offset - 1 for offset in range(t)][::-1]
        alignment.append((src_range, tgt_range))

        i = i - s
        j = j - t

    return alignment[::-1]


@nb.jit(nopython=True, fastmath=True, cache=True)
def second_pass_align(src_vecs, tgt_vecs, src_lens, tgt_lens, w, search_path, align_types, char_ratio, skip, margin=False):
    src_len = src_vecs.shape[1]
    tgt_len = tgt_vecs.shape[1]

    # initialize cost and backpointer matrix
    cost = np.zeros((src_len + 1, w))
    #back = np.zeros((tgt_len + 1, w), dtype=nb.int64)
    back = np.zeros((src_len + 1, w), dtype=nb.int64)
    cost[0][0] = 0
    back[0][0] = -1

    for i in range(1, src_len + 1):
        i_start = search_path[i][0]
        i_end = search_path[i][1]
        for j in range(i_start, i_end + 1):
            if i + j == 0:
                continue
            best_score = -np.inf
            best_a = -1
            for a in range(align_types.shape[0]):
                a_1 = align_types[a][0]
                a_2 = align_types[a][1]
                prev_i = i - a_1
                prev_j = j - a_2

                if prev_i < 0 or prev_j < 0:  # no previous cell in DP table
                    continue
                prev_i_start = search_path[prev_i][0]
                prev_i_end = search_path[prev_i][1]
                if prev_j < prev_i_start or prev_j > prev_i_end:  # out of bound of cost matrix
                    continue
                prev_j_offset = prev_j - prev_i_start
                score = cost[prev_i][prev_j_offset]
                if score == -np.inf:
                    continue

                if a_1 == 0 or a_2 == 0:  # deletion or insertion
                    cur_score = skip
                else:
                    src_v = src_vecs[a_1 - 1, i - 1, :]
                    tgt_v = tgt_vecs[a_2 - 1, j - 1, :]
                    src_l = src_lens[a_1 - 1, i - 1]
                    tgt_l = tgt_lens[a_2 - 1, j - 1]
                    cur_score = get_score(src_v, tgt_v, a_1, a_2, i, j, src_vecs, tgt_vecs, src_len, tgt_len, margin=margin)
                    # scale the similarity by a length penalty based on the character-length ratio
                    tgt_l = tgt_l * char_ratio
                    min_len = min(src_l, tgt_l)
                    max_len = max(src_l, tgt_l)
                    len_p = np.log2(1 + min_len / max_len)
                    cur_score *= len_p

                score += cur_score
                if score > best_score:
                    best_score = score
                    best_a = a

            j_offset = j - i_start
            cost[i][j_offset] = best_score
            back[i][j_offset] = best_a

    return back


@nb.jit(nopython=True, fastmath=True, cache=True)
def get_score(src_v, tgt_v, a_1, a_2, i, j, src_vecs, tgt_vecs, src_len, tgt_len, margin=False):
    similarity = nb_dot(src_v, tgt_v)
    if margin:
        tgt_neighbor_ave_sim = get_neighbor_sim(src_v, a_2, j, tgt_len, tgt_vecs)
        src_neighbor_ave_sim = get_neighbor_sim(tgt_v, a_1, i, src_len, src_vecs)
        neighbor_ave_sim = (tgt_neighbor_ave_sim + src_neighbor_ave_sim) / 2
        similarity -= neighbor_ave_sim
    return similarity
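
# Editorial note: with --margin, the raw dot-product similarity from nb_dot()
# is reduced by the average similarity of the immediately adjacent sentences on
# both sides, i.e. score = dot(src, tgt) - mean(neighbor similarities), a
# simplified margin criterion in the spirit of margin-based scoring for bitext
# mining. The neighborhood is defined by get_neighbor_sim() below.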


@nb.jit(nopython=True, fastmath=True, cache=True)
def get_neighbor_sim(vec, a, j, len, db):
    left_idx = j - a
    right_idx = j + 1

    if right_idx > len:
        neighbor_right_sim = 0
    else:
        right_embed = db[0, right_idx - 1, :]
        neighbor_right_sim = nb_dot(vec, right_embed)

    if left_idx == 0:
        neighbor_left_sim = 0
    else:
        left_embed = db[0, left_idx - 1, :]
        neighbor_left_sim = nb_dot(vec, left_embed)

    #if right_idx > LEN or left_idx < 0:
    if right_idx > len or left_idx == 0:
        neighbor_ave_sim = neighbor_left_sim + neighbor_right_sim
    else:
        neighbor_ave_sim = (neighbor_left_sim + neighbor_right_sim) / 2

    return neighbor_ave_sim


@nb.jit(nopython=True, fastmath=True, cache=True)
def nb_dot(x, y):
    return np.dot(x, y)


def find_second_search_path(align, w, src_len, tgt_len):
    '''
    Convert the 1-1 alignment from the first pass into the search path for the second-pass alignment.
    The indexes along the X-axis and Y-axis must be consecutive.
    '''
    last_bead_src = align[-1][0]
    last_bead_tgt = align[-1][1]

    if last_bead_src != src_len:
        if last_bead_tgt == tgt_len:
            align.pop()
        align.append((src_len, tgt_len))
    else:
        if last_bead_tgt != tgt_len:
            align.pop()
            align.append((src_len, tgt_len))

    prev_src, prev_tgt = 0, 0
    path = []
    max_w = -np.inf
    for src, tgt in align:
        lower_bound = max(0, prev_tgt - w)
        upper_bound = min(tgt_len, tgt + w)
        path.extend([(lower_bound, upper_bound) for id in range(prev_src + 1, src + 1)])
        prev_src, prev_tgt = src, tgt
        width = upper_bound - lower_bound
        if width > max_w:
            max_w = width
    path = [path[0]] + path
    return max_w + 1, np.array(path)


def first_back_track(i, j, b, search_path, a_types):
    alignment = []
    while i != 0 and j != 0:
        j_offset = j - search_path[i][0]
        a = b[i][j_offset]
        s = a_types[a][0]
        t = a_types[a][1]
        if a == 2:  # keep only the 1-1 beads
            alignment.append((i, j))

        i = i - s
        j = j - t

    return alignment[::-1]


@nb.jit(nopython=True, fastmath=True, cache=True)
def first_pass_align(src_len, tgt_len, w, search_path, align_types, dist, index, top_k):

    # initialize cost and backpointer matrix
    cost = np.zeros((src_len + 1, 2 * w + 1))
    pointers = np.zeros((src_len + 1, 2 * w + 1), dtype=nb.int64)
    cost[0][0] = 0
    pointers[0][0] = -1

    for i in range(1, src_len + 1):
        i_start = search_path[i][0]
        i_end = search_path[i][1]
        for j in range(i_start, i_end + 1):
            if i + j == 0:
                continue
            best_score = -np.inf
            best_a = -1
            for a in range(align_types.shape[0]):
                a_1 = align_types[a][0]
                a_2 = align_types[a][1]
                prev_i = i - a_1
                prev_j = j - a_2
                if prev_i < 0 or prev_j < 0:  # no previous cell
                    continue
                prev_i_start = search_path[prev_i][0]
                prev_i_end = search_path[prev_i][1]
                if prev_j < prev_i_start or prev_j > prev_i_end:  # out of bound of cost matrix
                    continue
                prev_j_offset = prev_j - prev_i_start
                score = cost[prev_i][prev_j_offset]
                if score == -np.inf:
                    continue

                if a_1 > 0 and a_2 > 0:
                    for k in range(top_k):
                        if index[i - 1][k] == j - 1:
                            score += dist[i - 1][k]
                if score > best_score:
                    best_score = score
                    best_a = a

            j_offset = j - i_start
            cost[i][j_offset] = best_score
            pointers[i][j_offset] = best_a

    return pointers


@nb.jit(nopython=True, fastmath=True, cache=True)
def find_first_search_path(src_len, tgt_len, min_win_size, max_win_size, win_per_100):
    yx_ratio = tgt_len / src_len
    win_size_1 = int(yx_ratio * tgt_len * win_per_100 / 100)
    win_size_2 = int(abs(tgt_len - src_len) * 3 / 4)
    w_1 = min(max(min_win_size, max(win_size_1, win_size_2)), max_win_size)
    #w_2 = int(max(src_len, tgt_len) * 0.05)
    w_2 = int(max(src_len, tgt_len) * 0.06)
    w = max(w_1, w_2)
    search_path = np.zeros((src_len + 1, 2), dtype=nb.int64)
    for i in range(0, src_len + 1):
        center = int(yx_ratio * i)
        w_start = max(0, center - w)
        w_end = min(center + w, tgt_len)
        search_path[i] = [w_start, w_end]
    return w, search_path


def doc2feats(sent2line, line_embeddings, lines, num_overlaps):
    lines = [preprocess_line(line) for line in lines]
    vecsize = line_embeddings.shape[1]
    vecs0 = np.empty((num_overlaps, len(lines), vecsize), dtype=np.float32)
    vecs1 = np.empty((num_overlaps, len(lines)), dtype=np.int64)

    for ii, overlap in enumerate(range(1, num_overlaps + 1)):
        for jj, out_line in enumerate(layer(lines, overlap)):
            try:
                line_id = sent2line[out_line]
            except KeyError:
                logger.warning('Failed to find overlap=%d line "%s". Will use random vector.', overlap, out_line)
                line_id = None

            if line_id is not None:
                vec = line_embeddings[line_id]
            else:
                vec = np.random.random(vecsize) - 0.5
                vec = vec / np.linalg.norm(vec)

            vecs0[ii, jj, :] = vec
            vecs1[ii, jj] = len(out_line.encode("utf-8"))

    return vecs0, vecs1


def preprocess_line(line):
    line = line.strip()
    if len(line) == 0:
        line = 'BLANK_LINE'
    return line


def layer(lines, num_overlaps, comb=' '):
    """
    Make front-padded overlapping sentences.
    """
    if num_overlaps < 1:
        raise Exception('num_overlaps must be >= 1')
    out = ['PAD', ] * min(num_overlaps - 1, len(lines))
    for ii in range(len(lines) - num_overlaps + 1):
        out.append(comb.join(lines[ii:ii + num_overlaps]))
    return out


def read_in_embeddings(text_file, embed_file):
    sent2line = dict()
    with open(text_file, 'rt', encoding="utf-8") as fin:
        for ii, line in enumerate(fin):
            if line.strip() in sent2line:
                raise Exception('got multiple embeddings for the same line')
            sent2line[line.strip()] = ii

    line_embeddings = np.fromfile(embed_file, dtype=np.float32, count=-1)
    if line_embeddings.size == 0:
        raise Exception('Got empty embedding file')

    embedding_size = line_embeddings.size // len(sent2line)
    line_embeddings.resize(line_embeddings.shape[0] // embedding_size, embedding_size)
    return sent2line, line_embeddings


def make_alignment_types(max_alignment_size):
    # Return the list of all (n, m) such that n + m <= max_alignment_size
    alignment_types = []
    for x in range(1, max_alignment_size):
        for y in range(1, max_alignment_size):
            if x + y <= max_alignment_size:
                alignment_types.append([x, y])
    alignment_types = [[0, 1], [1, 0]] + alignment_types
    return np.array(alignment_types)


def read_job(file):
    job = []
    with open(file, 'r', encoding="utf-8") as f:
        for line in f:
            if not line.startswith("#"):
                job.append(line.strip())
    return job


if __name__ == '__main__':
    _main()
bin/gale_align.py (new file, 193 lines)
@@ -0,0 +1,193 @@
import argparse
import math
import os
import sys
import time

import numba as nb
import numpy as np


def _main():
    # user-defined parameters
    parser = argparse.ArgumentParser('Sentence alignment using the Gale-Church algorithm',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--job', type=str, required=True, help='Job file for alignment task.')
    args = parser.parse_args()

    # fixed parameters that determine the
    # window size for alignment
    min_win_size = 10
    max_win_size = 600
    win_per_100 = 8

    # alignment types
    align_types = np.array([
        [0, 1],
        [1, 0],
        [1, 1],
        [1, 2],
        [2, 1],
        [2, 2]
    ], dtype=np.int64)

    # prior probability of each total bead size (n + m)
    priors = np.array([0, 0.0099, 0.89, 0.089, 0.011])

    # mean and variance of the length model
    c = 1
    s2 = 6.8

    # gale church align
    job = read_job(args.job)
    for rec in job:
        src_file, tgt_file, align_file = rec.split("\t")
        print("Aligning {} to {}".format(src_file, tgt_file))
        src_lines = open(src_file, 'rt', encoding="utf-8").readlines()
        tgt_lines = open(tgt_file, 'rt', encoding="utf-8").readlines()
        src_len = calculate_txt_len(src_lines)  # cumulative UTF-8 byte lengths
        tgt_len = calculate_txt_len(tgt_lines)

        m = src_len.shape[0] - 1
        n = tgt_len.shape[0] - 1

        # find search path
        w, search_path = \
            find_search_path(m, n, min_win_size, max_win_size, win_per_100)

        cost, back = align(src_len, tgt_len, w, search_path, align_types, priors, c, s2)
        alignment = back_track(m, n, back, search_path, align_types)
        #print(alignment)

        # save alignment
        f = open(align_file, 'w', encoding="utf-8")
        print_alignments(alignment, file=f)


def print_alignments(alignments, file=sys.stdout):
    for x, y in alignments:
        print('%s:%s' % (x, y), file=file)


def back_track(i, j, b, search_path, a_types):
    #i = b.shape[0] - 1
    #j = b.shape[1] - 1
    alignment = []
    while i != 0 and j != 0:
        j_offset = j - search_path[i][0]
        a = b[i][j_offset]
        s = a_types[a][0]
        t = a_types[a][1]
        src_range = [i - offset - 1 for offset in range(s)][::-1]
        tgt_range = [j - offset - 1 for offset in range(t)][::-1]
        alignment.append((src_range, tgt_range))

        i = i - s
        j = j - t

    return alignment[::-1]


@nb.jit(nopython=True, fastmath=True, cache=True)
def align(src_len, tgt_len, w, search_path, align_types, priors, c, s2):

    # initialize cost and backpointer matrix
    m = src_len.shape[0] - 1
    cost = np.zeros((m + 1, 2 * w + 1))
    back = np.zeros((m + 1, 2 * w + 1), dtype=nb.int64)
    cost[0][0] = 0
    back[0][0] = -1

    for i in range(m + 1):
        i_start = search_path[i][0]
        i_end = search_path[i][1]

        for j in range(i_start, i_end + 1):
            if i + j == 0:
                continue

            best_score = np.inf
            best_a = -1
            for a in range(align_types.shape[0]):
                a_1 = align_types[a][0]
                a_2 = align_types[a][1]
                prev_i = i - a_1
                prev_j = j - a_2

                if prev_i < 0 or prev_j < 0:  # no previous cell
                    continue

                prev_i_start = search_path[prev_i][0]
                prev_i_end = search_path[prev_i][1]

                if prev_j < prev_i_start or prev_j > prev_i_end:  # out of bound of cost matrix
                    continue

                prev_j_offset = prev_j - prev_i_start

                score = cost[prev_i][prev_j_offset] - math.log(priors[a_1 + a_2]) + \
                    get_score(src_len[i] - src_len[i - a_1], tgt_len[j] - tgt_len[j - a_2], c, s2)

                if score < best_score:
                    best_score = score
                    best_a = a

            j_offset = j - i_start
            cost[i][j_offset] = best_score
            back[i][j_offset] = best_a

    return cost, back


@nb.jit(nopython=True, fastmath=True, cache=True)
def get_score(len_s, len_t, c, s2):

    mean = (len_s + len_t / c) / 2
    z = (len_t - len_s * c) / math.sqrt(mean * s2)

    pd = 2 * (1 - norm_cdf(abs(z)))
    if pd > 0:
        return -math.log(pd)

    return 25


@nb.jit(nopython=True, fastmath=True, cache=True)
def find_search_path(src_len, tgt_len, min_win_size, max_win_size, win_per_100):
    yx_ratio = tgt_len / src_len
    win_size_1 = int(yx_ratio * tgt_len * win_per_100 / 100)
    win_size_2 = int(abs(tgt_len - src_len) * 3 / 4)
    w_1 = min(max(min_win_size, max(win_size_1, win_size_2)), max_win_size)
    #w_2 = int(max(src_len, tgt_len) * 0.05)
    w_2 = int(max(src_len, tgt_len) * 0.06)
    w = max(w_1, w_2)
    search_path = np.zeros((src_len + 1, 2), dtype=nb.int64)
    for i in range(0, src_len + 1):
        center = int(yx_ratio * i)
        w_start = max(0, center - w)
        w_end = min(center + w, tgt_len)
        search_path[i] = [w_start, w_end]
    return w, search_path


@nb.jit(nopython=True, fastmath=True, cache=True)
def norm_cdf(z):
    # Abramowitz & Stegun 26.2.17 approximation of the standard normal CDF
    t = 1 / (1 + 0.2316419 * z)
    p_norm = 1 - 0.3989423 * math.exp(-z * z / 2) * \
        (0.319381530 * t - 0.356563782 * t ** 2 + 1.781477937 * t ** 3 -
         1.821255978 * t ** 4 + 1.330274429 * t ** 5)
    return p_norm


def calculate_txt_len(lines):
    txt_len = []
    txt_len.append(0)
    for i, line in enumerate(lines):
        txt_len.append(txt_len[i] + len(line.strip().encode("utf-8")))
    return np.array(txt_len)


def read_job(file):
    job = []
    with open(file, 'r', encoding="utf-8") as f:
        for line in f:
            if not line.startswith("#"):
                job.append(line.strip())
    return job


if __name__ == '__main__':
    t_0 = time.time()
    _main()
    print("It takes {}".format(time.time() - t_0))
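
For reference, the quantity minimized per alignment bead in align() above is the classic Gale-Church cost: a match-type prior plus a length-based term. With l_s and l_t the source and target segment lengths in bytes, c = 1 and s^2 = 6.8 as set in _main(), and Phi the standard normal CDF approximated by norm_cdf(), get_score() computes (capped at 25 when the tail probability underflows):

\[
\delta = \frac{l_t - c\, l_s}{\sqrt{\tfrac{1}{2}\,(l_s + l_t/c)\, s^2}},
\qquad
\text{cost} = -\log\bigl(2\,(1 - \Phi(|\delta|))\bigr) - \log \mathrm{prior}[n+m]
\]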
bin/hunalign/ce.dic (new file, 224381 lines; diff suppressed because it is too large)
bin/hunalign/ec.dic (new file, 224381 lines; diff suppressed because it is too large)
bin/hunalign/hunalign (new binary file; not shown)

bin/hunalign/null.dic (new file, 1 line)
@@ -0,0 +1 @@
NULL @ NULL

bin/hunalign/translate.txt (new empty file)
bin/vecalign/LICENSE (new file, 202 lines)
@@ -0,0 +1,202 @@
File diff omitted: standard, unmodified Apache License, Version 2.0 text.
bin/vecalign/README.md (new file, 158 lines)
@@ -0,0 +1,158 @@
# Vecalign

Vecalign is an accurate sentence alignment algorithm which is fast even for very long documents.
In conjunction with [LASER](https://github.com/facebookresearch/LASER), Vecalign
works in about 100 languages (i.e. 100^2 language pairs),
without the need for a machine translation system or lexicon.

Vecalign uses similarity of multilingual sentence embeddings to judge the similarity of sentences.

[image based on [this Facebook AI post](https://engineering.fb.com/ai-research/laser-multilingual-sentence-embeddings/)]

Vecalign uses an approximation to Dynamic Programming based on
[Fast Dynamic Time Warping](https://content.iospress.com/articles/intelligent-data-analysis/ida00303)
which is linear in time and space with respect to the number of sentences being aligned.

### License

Copyright 2019 Brian Thompson

Vecalign is released under the [Apache License, Version 2.0](LICENSE).
For convenience, the dev and test datasets from Bleualign are provided. Bleualign is Copyright 2010 Rico Sennrich and is released under the [GNU General Public License Version 2](bleualign_data/LICENSE).

### Build Vecalign

You will need Python 3.6+ with numpy and cython. You can build an environment using conda as follows:

```
# Use latest conda
conda update conda -y
# Create conda environment
conda create --force -y --name vecalign python=3.7
# Activate new environment
source `conda info --base`/etc/profile.d/conda.sh # See: https://github.com/conda/conda/issues/7980
conda activate vecalign
# Install required packages
conda install -y -c anaconda cython
conda install -y -c anaconda numpy
```

Note that Vecalign contains Cython code, but there is no need to build it manually: it is compiled automatically by [pyximport](https://github.com/cython/cython/tree/master/pyximport).

### Run Vecalign (using provided embeddings)
```
./vecalign.py --alignment_max_size 8 --src bleualign_data/dev.de --tgt bleualign_data/dev.fr \
    --src_embed bleualign_data/overlaps.de bleualign_data/overlaps.de.emb \
    --tgt_embed bleualign_data/overlaps.fr bleualign_data/overlaps.fr.emb
```

Alignments are written to stdout:
```
[0]:[0]:0.156006
[1]:[1]:0.160997
[2]:[2]:0.217155
[3]:[3]:0.361439
[4]:[4]:0.346332
[5]:[5]:0.211873
[6]:[6, 7, 8]:0.507506
[7]:[9]:0.252747
[8, 9]:[10, 11, 12]:0.139594
[10, 11]:[13]:0.273751
[12]:[14]:0.165397
[13]:[15, 16, 17]:0.436312
[14]:[18, 19, 20, 21]:0.734142
[]:[22]:0.000000
[]:[23]:0.000000
[]:[24]:0.000000
[]:[25]:0.000000
[15]:[26, 27, 28]:0.840094
...
```

The first two entries in each line are the source and target sentence indexes for that alignment, respectively.
The third entry is the sentence alignment cost computed by Vecalign.
Note that this cost includes normalization but does *not* include the penalty terms for containing more than one sentence.
Note that the alignment cost is set to zero for insertions/deletions.
Also note that the results may vary slightly due to randomness in the normalization.
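
A minimal sketch (not part of Vecalign itself) for consuming these `[src indexes]:[tgt indexes]:cost` lines from Python:

```
import ast

def parse_alignment(line):
    # e.g. "[8, 9]:[10, 11, 12]:0.139594" -> ([8, 9], [10, 11, 12], 0.139594)
    src, tgt, cost = line.strip().split(':')
    return ast.literal_eval(src), ast.literal_eval(tgt), float(cost)
```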

To score against a gold alignment, use the "-g" flag.
Flags "-s", "-t", and "-g" can accept multiple arguments. This is primarily useful for scoring, as the output alignments will all be concatenated together in stdout. For example, to align and score the Bleualign test set:
```
./vecalign.py --alignment_max_size 8 --src bleualign_data/test*.de --tgt bleualign_data/test*.fr \
    --gold bleualign_data/test*.defr \
    --src_embed bleualign_data/overlaps.de bleualign_data/overlaps.de.emb \
    --tgt_embed bleualign_data/overlaps.fr bleualign_data/overlaps.fr.emb > /dev/null
```
This should give results that approximately match the Vecalign paper:

```
---------------------------------
|           | Strict |   Lax   |
| Precision |  0.899 |  0.985  |
| Recall    |  0.904 |  0.987  |
| F1        |  0.902 |  0.986  |
---------------------------------
```

Note: Run `./vecalign.py -h` for full sentence alignment usage and options.
For stand-alone scoring against a gold reference, see [score.py](score.py).

### Embed your own documents

The Vecalign repository contains overlap and embedding files for the Bleualign dev/test files.
This section shows how those files were made, as an example for running on new data.

Vecalign requires not only embeddings of the sentences in each document,
but also embeddings of *concatenations* of consecutive sentences.
The embeddings of multiple, consecutive sentences are needed to consider 1-many, many-1, and many-many alignments.
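
As a rough illustration of what an overlaps file contains (the actual generator is `overlap.py`, which may also pad and deduplicate lines), every concatenation of up to N consecutive sentences appears as one line:

```
def overlaps(sents, max_n):
    # every concatenation of up to max_n consecutive sentences
    for n in range(1, max_n + 1):
        for i in range(len(sents) - n + 1):
            yield ' '.join(sents[i:i + n])

# overlaps(["A.", "B.", "C."], 2) yields "A.", "B.", "C.", "A. B.", "B. C."
```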

To create a file containing all the sentence combinations in the dev and test files from Bleualign:
```
./overlap.py -i bleualign_data/dev.fr bleualign_data/test*.fr -o bleualign_data/overlaps.fr -n 10
./overlap.py -i bleualign_data/dev.de bleualign_data/test*.de -o bleualign_data/overlaps.de -n 10
```

Note: Run `./overlap.py -h` to see the full set of embedding options.

`bleualign_data/overlaps.fr` and `bleualign_data/overlaps.de` are text files containing one or more sentences per line.

These files must then be embedded using a multilingual sentence embedder.

We recommend the [Language-Agnostic SEntence Representations (LASER)](https://github.com/facebookresearch/LASER)
toolkit from Facebook, as it has strong performance and comes with a pretrained model which works well in about 100 languages.
However, Vecalign should also work with other embedding methods. Embeddings should be provided as a binary file containing float32 values.
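
A minimal sketch of the expected layout for such a file (one float32 vector per overlap line, concatenated in line order; file names are just the examples above, and the sketch mirrors how the aligner scripts in this commit read embeddings):

```
import numpy as np

lines = open("bleualign_data/overlaps.fr", encoding="utf-8").read().splitlines()
emb = np.fromfile("bleualign_data/overlaps.fr.emb", dtype=np.float32)
dim = emb.size // len(lines)        # e.g. 1024 for LASER
emb = emb.reshape(len(lines), dim)  # row i is the embedding of overlap line i
```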

The following assumes LASER is installed and the LASER environment variable has been set.

To embed the Bleualign files using LASER:
```
$LASER/tasks/embed/embed.sh bleualign_data/overlaps.fr fr bleualign_data/overlaps.fr.emb
$LASER/tasks/embed/embed.sh bleualign_data/overlaps.de de bleualign_data/overlaps.de.emb
```

Note that LASER will not overwrite an embedding file if it exists, so you may first need to run `rm bleualign_data/overlaps.fr.emb bleualign_data/overlaps.de.emb`.

### Publications

If you use Vecalign, please cite our [paper](https://www.aclweb.org/anthology/D19-1136.pdf):

```
@inproceedings{thompson-koehn-2019-vecalign,
    title = "{V}ecalign: Improved Sentence Alignment in Linear Time and Space",
    author = "Thompson, Brian and Koehn, Philipp",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
    month = nov,
    year = "2019",
    address = "Hong Kong, China",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/D19-1136",
    doi = "10.18653/v1/D19-1136",
    pages = "1342--1348",
}
```
bin/vecalign/__init__.py (new empty file)

bin/vecalign/dp_core.pyx (new file, 411 lines)
@@ -0,0 +1,411 @@
# cython: language_level=3

"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import numpy as np

cimport numpy as np
cimport cython


def make_x_y_offsets(alignment_types):
    # alignment types for which we will precompute costs

    # deletion/insertion is added later
    for x, y in alignment_types:
        assert (x > 0)
        assert (y > 0)

    x_offsets = np.array([x for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    y_offsets = np.array([y for x, y in alignment_types], dtype=np.int32)  # MUST **NOT** INCLUDE (0,1), (1,0)
    return x_offsets, y_offsets


def make_dense_costs(np.ndarray[float, ndim=3] vecs0,  # input
                     np.ndarray[float, ndim=3] vecs1,  # input
                     np.ndarray[float, ndim=2] norm0,  # input
                     np.ndarray[float, ndim=2] norm1,  # input
                     int offset0 = 0,  # index into vecs0/norms0
                     int offset1 = 0,  # index into vecs1/norms1
                     ):
    """
    Make a full N*M feature matrix. By default, makes 1-1 alignments;
    other alignments can be built by specifying offset0, offset1 to index into
    vecs0, norms0 and vecs1, norms1 respectively.
    """
    assert vecs0.shape[0] > offset0
    assert vecs1.shape[0] > offset1
    assert norm0.shape[0] > offset0
    assert norm1.shape[0] > offset1

    cdef int size0 = np.shape(vecs0)[1]
    assert norm0.shape[1] == size0

    cdef int size1 = np.shape(vecs1)[1]
    assert norm1.shape[1] == size1

    cdef int vecsize = np.shape(vecs0)[2]
    assert vecs1.shape[2] == vecsize

    cdef int xi, yi
    cdef float sumx

    cdef np.ndarray[float, ndim=2] costs = np.empty((size0, size1), dtype=np.float32)

    for xi in range(size0):
        for yi in range(size1):
            sumx = 0.0
            for jj in range(vecsize):
                sumx += vecs0[offset0, xi, jj] * vecs1[offset1, yi, jj]

            costs[xi, yi] = 2.0 * (1.0 - sumx) / (1e-6 + norm0[offset0, xi] + norm1[offset1, yi])
            # normalize by alignment type
            costs[xi, yi] = costs[xi, yi] * (offset0 + 1) * (offset1 + 1)

    return costs


def dense_dp(np.ndarray[float, ndim=2] alignment_cost, float pen):
    """
    Compute the cumulative cost matrix (csum) and backpointers (bp)
    from a full 2-D 1-1 alignment cost matrix (alignment_cost).
    """

    size0 = alignment_cost.shape[0]
    size1 = alignment_cost.shape[1]
    # csum and the traceback matrix are both on nodes,
    # so they are +1 in each dimension compared to the jump costs matrix.
    # For anything being used in accumulation, use float64
    cdef np.ndarray[double, ndim=2] csum = np.empty((size0 + 1, size1 + 1), dtype=np.float64)
    cdef np.ndarray[int, ndim=2] bp = np.empty((size0 + 1, size1 + 1), dtype=np.int32)

    # bp and csum are nodes,
    # while alignment_cost is the cost of going between the nodes.
    # The size of the nodes should be one larger than the alignment costs.
    b0, b1 = np.shape(bp)
    c0, c1 = np.shape(csum)
    j0, j1 = np.shape(alignment_cost)
    assert (b0 == c0 == j0 + 1)
    assert (b1 == c1 == j1 + 1)

    cdef int cmax = np.shape(csum)[1]
    cdef int rmax = np.shape(csum)[0]
    cdef int c, r
    cdef double cost0, cost1, cost2

    # initialize the all c-direction deletion path
    for c in range(cmax):
        csum[0, c] = c * pen
        bp[0, c] = 1

    # initialize the all r-direction deletion path
    for r in range(rmax):
        csum[r, 0] = r * pen
        bp[r, 0] = 2

    # Initial cost is 0.0
    csum[0, 0] = 0.0  # noop
    bp[0, 0] = 4  # should not matter

    # Calculate the rest recursively
    for c in range(1, cmax):
        for r in range(1, rmax):

            # alignment_cost indexes are off by 1 wrt
            # csum/bp, since csum/bp are nodes
            cost0 = csum[r - 1, c - 1] + alignment_cost[r - 1, c - 1]
            cost1 = csum[r, c - 1] + pen
            cost2 = csum[r - 1, c] + pen

            csum[r, c] = cost0
            bp[r, c] = 0

            if cost1 < csum[r, c]:
                csum[r, c] = cost1
                bp[r, c] = 1
            if cost2 < csum[r, c]:
                csum[r, c] = cost2
                bp[r, c] = 2

    return csum, bp
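
# Editorial note: in the csum/bp tables built above, bp == 0 records a diagonal
# step (consume one position in each document, adding alignment_cost[r-1, c-1]),
# while bp == 1 and bp == 2 record steps in the c and r directions respectively
# (each adding `pen`), so a caller can recover the best path by walking bp back
# from the bottom-right node.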
|
||||||
|
|
||||||
|
def score_path(np.ndarray[int, ndim=1] xx,
|
||||||
|
np.ndarray[int, ndim=1] yy,
|
||||||
|
np.ndarray[float, ndim=1] norm1,
|
||||||
|
np.ndarray[float, ndim=1] norm2,
|
||||||
|
np.ndarray[float, ndim=2] vecs1,
|
||||||
|
np.ndarray[float, ndim=2] vecs2,
|
||||||
|
np.ndarray[float, ndim=1] out):
|
||||||
|
cdef int xi, yi, ii, jj
|
||||||
|
cdef float outx
|
||||||
|
cdef int lenxy = xx.shape[0]
|
||||||
|
cdef int vecsize = vecs1.shape[1]
|
||||||
|
|
||||||
|
for ii in range(lenxy):
|
||||||
|
xi = xx[ii]
|
||||||
|
yi = yy[ii]
|
||||||
|
outx = 0.0
|
||||||
|
for jj in range(vecsize):
|
||||||
|
outx += vecs1[xi, jj] * vecs2[yi, jj]
|
||||||
|
out[ii] = 2.0 * (1.0 - outx) / (norm1[xi] + norm2[yi])
|
||||||
|
|
||||||
|
|
||||||
|
# Bounds checking and wraparound slow things down by about 2x
|
||||||
|
# Division by 0 checking has minimal speed impact
|
||||||
|
@cython.boundscheck(False) # turn off bounds-checking for entire function
|
||||||
|
@cython.wraparound(False) # turn off negative index wrapping for entire function
|
||||||
|
@cython.cdivision(True) # use c-style division (no division-by-zero check)
|
||||||
|
def make_sparse_costs(np.ndarray[float, ndim=3] vecs0, # intput: num aligns X num sents X dim
|
||||||
|
np.ndarray[float, ndim=3] vecs1, # input
|
||||||
|
np.ndarray[float, ndim=2] norms0, # intput: num aligns X num sents
|
||||||
|
np.ndarray[float, ndim=2] norms1, # input
|
||||||
|
x_y_path,
|
||||||
|
alignment_types,
|
||||||
|
int width_over2):
|
||||||
|
"""
|
||||||
|
Make features for DP, *for lines running across approximate path*, *for each alignment type*
|
||||||
|
x_offsets, y_offsets should not include (0,1), (1,0)
|
||||||
|
|
||||||
|
Basically, we take the feature matrix, rotate it 45 degress,
|
||||||
|
and compute a "wavy" matrix for the features.
|
||||||
|
It's like the diagonal but it moves around to hopefully always include the true path.
|
||||||
|
"""
|
||||||
|
|
||||||
|
cdef np.ndarray[int, ndim=2] x_y_path_ = np.array(x_y_path).astype(np.int32)
|
||||||
|
|
||||||
|
assert (vecs0.shape[0] == norms0.shape[0])
|
||||||
|
assert (vecs1.shape[0] == norms1.shape[0])
|
||||||
|
|
||||||
|
assert (vecs0.shape[1] == norms0.shape[1])
|
||||||
|
assert (vecs1.shape[1] == norms1.shape[1])
|
||||||
|
|
||||||
|
# check how many overlaps vectors were passed in
|
||||||
|
num_overlaps_in_vecs0 = vecs0.shape[0]
|
||||||
|
num_overlaps_in_vecs1 = vecs1.shape[0]
|
||||||
|
|
||||||
|
# check how many overlaps were requested
|
||||||
|
# edge case: alignment_types could be empty
|
||||||
|
# In that case, we should just return insertions/deletions
|
||||||
|
# and max_x_overlap == max_y_overlap == 0
|
||||||
|
max_x_overlap = max([0] + [x for x, y in alignment_types]) # add [0] in case alignment_types is empty
|
||||||
|
max_y_overlap = max([0] + [y for x, y in alignment_types]) # add [0] in case alignment_types is empty
|
||||||
|
|
||||||
|
# note: alignment types are specified 1-based, but vectors are stored 0-based
|
||||||
|
if max_x_overlap > num_overlaps_in_vecs0:
|
||||||
|
raise Exception('%d x overlaps requrested (via alignment_types), but vecs0 only has %d' % (
|
||||||
|
max_x_overlap, num_overlaps_in_vecs0))
|
||||||
|
if max_y_overlap > num_overlaps_in_vecs1:
|
||||||
|
raise Exception('%d y overlaps requrested (via alignment_types), but vecs1 only has %d' % (
|
||||||
|
max_y_overlap, num_overlaps_in_vecs1))
|
||||||
|
|
||||||
|
# number of sentences in each document
|
||||||
|
cdef int xsize = vecs0.shape[1]
|
||||||
|
cdef int ysize = vecs1.shape[1]
|
||||||
|
|
||||||
|
# vector diminsions should match
|
||||||
|
assert (vecs0.shape[2] == vecs1.shape[2])
|
||||||
|
|
||||||
|
cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
|
||||||
|
x_offsets, y_offsets = make_x_y_offsets(alignment_types)
|
||||||
|
|
||||||
|
# reserve outputs
|
||||||
|
a_len = x_y_path_.shape[0]
|
||||||
|
b_len = 2 * width_over2
|
||||||
|
cdef np.ndarray[float, ndim=3] a_b_feats = np.empty((len(alignment_types), a_len, b_len), dtype=np.float32)
|
||||||
|
cdef np.ndarray[int, ndim=1] b_offset = np.empty(a_len).astype(np.int32)
|
||||||
|
|
||||||
|
cdef int x, y, aa, bb, xx, yy, a_idx, b_idx, bb2, x_offset, y_offset, ii_align, x_offset_idx, y_offset_idx
|
||||||
|
cdef int vecsize = vecs0.shape[2]
|
||||||
|
cdef int num_alignments = x_offsets.shape[0]
|
||||||
|
|
||||||
|
cdef float sumx, feat
|
||||||
|
cdef float inf = np.inf
|
||||||
|
|
||||||
|
for ii in range(x_y_path_.shape[0]):
|
||||||
|
x = x_y_path_[ii, 0]
|
||||||
|
y = x_y_path_[ii, 1]
|
||||||
|
|
||||||
|
# convert xy to ab cords
|
||||||
|
aa = x + y
|
||||||
|
bb = y
|
||||||
|
|
||||||
|
a_idx = aa
|
||||||
|
b_offset[aa] = bb - width_over2
|
||||||
|
for b_idx, bb2 in enumerate(range(bb - width_over2, bb + width_over2)):
|
||||||
|
# convert ab to xy cords
|
||||||
|
xx = aa - bb2
|
||||||
|
yy = bb2
|
||||||
|
|
||||||
|
for ii_align in range(num_alignments):
|
||||||
|
x_offset = x_offsets[ii_align]
|
||||||
|
x_offset_idx = x_offset - 1 # overlaps start at 1, vectors stored 0-based
|
||||||
|
y_offset = y_offsets[ii_align]
|
||||||
|
y_offset_idx = y_offset - 1
|
||||||
|
|
||||||
|
if 0 <= xx < xsize and 0 <= yy < ysize:
|
||||||
|
sumx = 0.0
|
||||||
|
for jj in range(vecsize):
|
||||||
|
sumx += vecs0[x_offset_idx, xx, jj] * vecs1[y_offset_idx, yy, jj]
|
||||||
|
feat = 2.0 * x_offset * y_offset * (1.0 - sumx) / (
|
||||||
|
1e-6 + norms0[x_offset_idx, xx] + norms1[y_offset_idx, yy])
|
||||||
|
|
||||||
|
else:
|
||||||
|
feat = inf
|
||||||
|
|
||||||
|
a_b_feats[ii_align, a_idx, b_idx] = feat
|
||||||
|
|
||||||
|
return a_b_feats, b_offset
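# Worked example of the band bookkeeping above (hypothetical numbers): for a
# path point (x, y) = (7, 3) with width_over2 = 5, we get aa = x + y = 10,
# bb = y = 3, and b_offset[10] = bb - width_over2 = -2, so the window scanned
# for anti-diagonal 10 covers yy = -2 .. 7, i.e. 2 * width_over2 = 10 candidate
# cells centred on the search path (out-of-range cells are set to inf).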
|
||||||
|
|
||||||
|
|
||||||
|
def sparse_dp(np.ndarray[float, ndim=3] a_b_costs,
|
||||||
|
np.ndarray[int, ndim=1] b_offset_in,
|
||||||
|
alignment_types,
|
||||||
|
double del_penalty,
|
||||||
|
int x_in_size,
|
||||||
|
int y_in_size):
|
||||||
|
"""
|
||||||
|
Do DP along a path, using features saved off along path.
|
||||||
|
x_offsets, y_offsets should not include (0,1), (1,0)
|
||||||
|
|
||||||
|
xsize, ysize refer to the costs a_b_csum, but in x/y space
|
||||||
|
|
||||||
|
As in the simpler full-DP case,
|
||||||
|
we compute cumulative costs and backpointers on notes,
|
||||||
|
and there are COSTS associated with moving between them.
|
||||||
|
|
||||||
|
This means the size of the notes +1,+1 larger (in x,y) than the COSTS.
|
||||||
|
|
||||||
|
So the size of a_b_csum, a_b_xp, a_b_yp are all one larger in x and y compared to the costs
|
||||||
|
|
||||||
|
In order to save memory (and time, vs a sparse matrix with hashes to look up values), let:
|
||||||
|
a = x + y
|
||||||
|
b = x - y
|
||||||
|
|
||||||
|
b_offsets tells us how far from the left edge the features are computed for.
|
||||||
|
basically it's like we are computing along the diagonal,
|
||||||
|
but we shift the diagonal around based on our belief
|
||||||
|
about where the alignments are.
|
||||||
|
|
||||||
|
b_offsets is used for both costs AND csum, backpointers, so it needs to be
|
||||||
|
+2 longer (it is in the a-direction) than the costs (in the a direction)
|
||||||
|
|
||||||
|
"""
|
||||||
|
cdef np.ndarray[int, ndim=1] x_offsets, y_offsets
|
||||||
|
x_offsets, y_offsets = make_x_y_offsets(alignment_types)
|
||||||
|
|
||||||
|
# make x/y offsets, including (0,1) and (1,0), i.e. including insertion and deletion
|
||||||
|
x_offsets = np.concatenate([x_offsets, np.array([0, 1], dtype=np.int32)])
|
||||||
|
y_offsets = np.concatenate([y_offsets, np.array([1, 0], dtype=np.int32)])
|
||||||
|
|
||||||
|
cdef int a_in_size = a_b_costs.shape[1]
|
||||||
|
cdef int b_in_size = a_b_costs.shape[2]
|
||||||
|
|
||||||
|
cdef int a_out_size = a_in_size + 2
|
||||||
|
cdef int b_out_size = b_in_size
|
||||||
|
|
||||||
|
cdef int x_out_size = x_in_size + 1
|
||||||
|
cdef int y_out_size = y_in_size + 1
|
||||||
|
|
||||||
|
# costs are the costs of going between nodes.
|
||||||
|
# in x,y for the nodes, we basically add a buffer
|
||||||
|
# at x=0 and y=0, and shift the cost by (x=+1,y=+1)
|
||||||
|
# In a,b space, this means adding two points (for the buffer)
|
||||||
|
# at the beginning, and shifting by (a=+0,b=+1) since
|
||||||
|
# a=x+y and b=y
|
||||||
|
# for the first two points, we can simply replicate the
|
||||||
|
# original b_offset, since it should be -width_over2
|
||||||
|
# i.e. b_offset_in[0] == -width_over2
|
||||||
|
extra_two_points = np.array([b_offset_in[0], b_offset_in[0]], dtype=np.int32)
|
||||||
|
cdef np.ndarray[int, ndim=1] b_offset_out = np.concatenate([extra_two_points, b_offset_in + 1])
|
||||||
|
|
||||||
|
# outputs
|
||||||
|
# For anything being used in accumulation, use float64
|
||||||
|
cdef np.ndarray[double, ndim=2] a_b_csum = np.zeros((a_in_size + 2, b_in_size),
|
||||||
|
dtype=np.float64) + np.inf # error cumulative sum
|
||||||
|
cdef np.ndarray[int, ndim=2] a_b_xp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for x
|
||||||
|
cdef np.ndarray[int, ndim=2] a_b_yp = np.zeros((a_in_size + 2, b_in_size), dtype=np.int32) - 2 # backpointer for y
|
||||||
|
|
||||||
|
cdef int num_alignments = x_offsets.shape[0]
|
||||||
|
cdef double inf = np.inf
|
||||||
|
cdef int xx_out, yy_out, ii_align, x_offset, y_offset
|
||||||
|
cdef int aa_in_cost, bb_in_cost, aa_out, bb_out, aa_out_prev, bb_out_prev, xx_in_cost, yy_in_cost, xx_out_prev, yy_out_prev
|
||||||
|
|
||||||
|
cdef double alignment_cost, total_cost, prev_cost
|
||||||
|
|
||||||
|
# increasing in a is the same as going along diagonals in x/y, so DP order works
|
||||||
|
# (and any ordering is fine in b - nothing depends on values adjacent on diagonal in x/y)
|
||||||
|
for aa_out in range(a_in_size + 2):
|
||||||
|
for bb_out in range(b_in_size):
|
||||||
|
#xx_out, yy_out = ab2xy_w_offset(aa_out, bb_out, b_offset_out)
|
||||||
|
yy_out = bb_out + b_offset_out[aa_out]
|
||||||
|
xx_out = aa_out - yy_out
|
||||||
|
|
||||||
|
# edge case: all deletions in y-direction
|
||||||
|
if xx_out == 0 and 0 <= yy_out < y_out_size:
|
||||||
|
a_b_csum[aa_out, bb_out] = del_penalty * yy_out
|
||||||
|
a_b_xp[aa_out, bb_out] = 0
|
||||||
|
a_b_yp[aa_out, bb_out] = 1
|
||||||
|
|
||||||
|
# edge case: all deletions in x-direction
|
||||||
|
elif yy_out == 0 and 0 <= xx_out < x_out_size:
|
||||||
|
a_b_csum[aa_out, bb_out] = del_penalty * xx_out
|
||||||
|
a_b_xp[aa_out, bb_out] = 1
|
||||||
|
a_b_yp[aa_out, bb_out] = 0
|
||||||
|
|
||||||
|
else:
|
||||||
|
# initialize output to inf
|
||||||
|
a_b_csum[aa_out, bb_out] = inf
|
||||||
|
a_b_xp[aa_out, bb_out] = -42
|
||||||
|
a_b_yp[aa_out, bb_out] = -42
|
||||||
|
|
||||||
|
for ii_align in range(num_alignments):
|
||||||
|
x_offset = x_offsets[ii_align]
|
||||||
|
y_offset = y_offsets[ii_align]
|
||||||
|
|
||||||
|
# coords of location of alignment cost, in input x/y space
|
||||||
|
xx_in_cost = xx_out - 1 # features were front padded,
|
||||||
|
yy_in_cost = yy_out - 1 # so offset is always 1
|
||||||
|
|
||||||
|
# the coords of location of previous cumsum cost, in input x/y space
|
||||||
|
xx_out_prev = xx_out - x_offset
|
||||||
|
yy_out_prev = yy_out - y_offset
|
||||||
|
|
||||||
|
if 0 <= xx_in_cost < x_in_size and 0 <= yy_in_cost < y_in_size and 0 <= xx_out_prev < x_out_size and 0 <= yy_out_prev < y_out_size:
|
||||||
|
# convert x,y to a,b
|
||||||
|
aa_in_cost = xx_in_cost + yy_in_cost
|
||||||
|
bb_in_cost = yy_in_cost - b_offset_in[aa_in_cost]
|
||||||
|
|
||||||
|
aa_out_prev = xx_out_prev + yy_out_prev
|
||||||
|
bb_out_prev = yy_out_prev - b_offset_out[aa_out_prev]
|
||||||
|
|
||||||
|
if 0 <= aa_in_cost < a_in_size and 0 <= bb_in_cost < b_in_size and 0 <= aa_out_prev < a_out_size and 0 <= bb_out_prev < b_out_size:
|
||||||
|
if x_offset == 0 or y_offset == 0:
|
||||||
|
alignment_cost = del_penalty
|
||||||
|
else:
|
||||||
|
alignment_cost = a_b_costs[ii_align, aa_in_cost, bb_in_cost]
|
||||||
|
|
||||||
|
prev_cost = a_b_csum[aa_out_prev, bb_out_prev]
|
||||||
|
|
||||||
|
total_cost = prev_cost + alignment_cost
|
||||||
|
|
||||||
|
if total_cost < a_b_csum[aa_out, bb_out]:
|
||||||
|
a_b_csum[aa_out, bb_out] = total_cost
|
||||||
|
a_b_xp[aa_out, bb_out] = x_offset
|
||||||
|
a_b_yp[aa_out, bb_out] = y_offset
|
||||||
|
|
||||||
|
return a_b_csum, a_b_xp, a_b_yp, b_offset_out
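# The returned arrays feed sparse_traceback() in dp_utils.py: a_b_csum holds the
# cumulative cost at each node, while a_b_xp / a_b_yp hold, per node, the (x, y)
# step of the best incoming alignment (e.g. 1,1 for a 1-1 match, 1,0 / 0,1 for a
# deletion/insertion), so the traceback simply keeps subtracting those steps
# until it reaches (0, 0).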
|
||||||
665
bin/vecalign/dp_utils.py
Normal file
@@ -0,0 +1,665 @@
|
|||||||
|
"""
|
||||||
|
Copyright 2019 Brian Thompson
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
https://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from ast import literal_eval
|
||||||
|
from collections import OrderedDict
|
||||||
|
from math import ceil
|
||||||
|
from time import time
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import pyximport
|
||||||
|
pyximport.install(setup_args={'include_dirs':np.get_include()}, inplace=True, reload_support=True)
|
||||||
|
|
||||||
|
from dp_core import make_dense_costs, score_path, sparse_dp, make_sparse_costs, dense_dp
|
||||||
|
|
||||||
|
logger = logging.getLogger('vecalign') # set up in vecalign.py
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_line(line):
|
||||||
|
line = line.strip()
|
||||||
|
if len(line) == 0:
|
||||||
|
line = 'BLANK_LINE'
|
||||||
|
return line
|
||||||
|
|
||||||
|
|
||||||
|
def yield_overlaps(lines, num_overlaps):
|
||||||
|
lines = [preprocess_line(line) for line in lines]
|
||||||
|
for overlap in range(1, num_overlaps + 1):
|
||||||
|
for out_line in layer(lines, overlap):
|
||||||
|
# check must be here so all outputs are unique
|
||||||
|
out_line2 = out_line[:10000]  # truncate so we don't encode arbitrarily long sentences
|
||||||
|
yield out_line2
|
||||||
|
|
||||||
|
|
||||||
|
def read_in_embeddings(text_file, embed_file):
|
||||||
|
"""
|
||||||
|
Given a text file with candidate sentences and a corresponding embedding file,
|
||||||
|
make a mapping from candidate sentence to embedding index,
|
||||||
|
and a numpy array of the embeddings
|
||||||
|
"""
|
||||||
|
sent2line = dict()
|
||||||
|
with open(text_file, 'rt', encoding="utf-8") as fin:
|
||||||
|
for ii, line in enumerate(fin):
|
||||||
|
if line.strip() in sent2line:
|
||||||
|
raise Exception('got multiple embeddings for the same line')
|
||||||
|
sent2line[line.strip()] = ii
|
||||||
|
|
||||||
|
line_embeddings = np.fromfile(embed_file, dtype=np.float32, count=-1)
|
||||||
|
if line_embeddings.size == 0:
|
||||||
|
raise Exception('Got empty embedding file')
|
||||||
|
|
||||||
|
laser_embedding_size = line_embeddings.size // len(sent2line) # currently hardcoded to 1024
|
||||||
|
if laser_embedding_size != 1024:
|
||||||
|
logger.warning('expected an embedding size of 1024, got %s', laser_embedding_size)
|
||||||
|
logger.info('laser_embedding_size determined to be %d', laser_embedding_size)
|
||||||
|
line_embeddings.resize(line_embeddings.shape[0] // laser_embedding_size, laser_embedding_size)
|
||||||
|
return sent2line, line_embeddings
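# Sketch of the expected inputs (hypothetical sizes): the text file holds one
# (possibly overlapped) sentence per line, and the binary file holds the matching
# vectors as raw float32. After reading e.g. 2500 lines of LASER embeddings:
#   sent2line['How are you ?']  ->  17            # line index of that sentence
#   line_embeddings.shape       ->  (2500, 1024)  # one row per line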
|
||||||
|
|
||||||
|
|
||||||
|
def make_doc_embedding(sent2line, line_embeddings, lines, num_overlaps):
|
||||||
|
"""
|
||||||
|
lines: sentences in input document to embed
|
||||||
|
sent2line, line_embeddings: precomputed embeddings for lines (and overlaps of lines)
|
||||||
|
"""
|
||||||
|
|
||||||
|
lines = [preprocess_line(line) for line in lines]
|
||||||
|
|
||||||
|
vecsize = line_embeddings.shape[1]
|
||||||
|
|
||||||
|
vecs0 = np.empty((num_overlaps, len(lines), vecsize), dtype=np.float32)
|
||||||
|
|
||||||
|
for ii, overlap in enumerate(range(1, num_overlaps + 1)):
|
||||||
|
for jj, out_line in enumerate(layer(lines, overlap)):
|
||||||
|
try:
|
||||||
|
line_id = sent2line[out_line]
|
||||||
|
except KeyError:
|
||||||
|
logger.warning('Failed to find overlap=%d line "%s". Will use random vector.', overlap, out_line)
|
||||||
|
line_id = None
|
||||||
|
|
||||||
|
if line_id is not None:
|
||||||
|
vec = line_embeddings[line_id]
|
||||||
|
else:
|
||||||
|
vec = np.random.random(vecsize) - 0.5
|
||||||
|
vec = vec / np.linalg.norm(vec)
|
||||||
|
|
||||||
|
vecs0[ii, jj, :] = vec
|
||||||
|
|
||||||
|
return vecs0
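# The result is indexed as vecs0[overlap - 1, sentence_index, :]. For example
# (hypothetical sizes), with num_overlaps=4 and a 250-line document,
# vecs0.shape == (4, 250, 1024): row 0 holds single-sentence vectors and row k
# holds (k+1)-sentence overlaps, front-padded with 'PAD' by layer().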
|
||||||
|
|
||||||
|
|
||||||
|
def make_norm1(vecs0):
|
||||||
|
"""
|
||||||
|
make vectors norm==1 so that cosine distance can be computed via dot product
|
||||||
|
"""
|
||||||
|
for ii in range(vecs0.shape[0]):
|
||||||
|
for jj in range(vecs0.shape[1]):
|
||||||
|
norm = np.sqrt(np.square(vecs0[ii, jj, :]).sum())
|
||||||
|
vecs0[ii, jj, :] = vecs0[ii, jj, :] / (norm + 1e-5)
|
||||||
|
|
||||||
|
|
||||||
|
def layer(lines, num_overlaps, comb=' '):
|
||||||
|
"""
|
||||||
|
make front-padded overlapping sentences
|
||||||
|
"""
|
||||||
|
if num_overlaps < 1:
|
||||||
|
raise Exception('num_overlaps must be >= 1')
|
||||||
|
out = ['PAD', ] * min(num_overlaps - 1, len(lines))
|
||||||
|
for ii in range(len(lines) - num_overlaps + 1):
|
||||||
|
out.append(comb.join(lines[ii:ii + num_overlaps]))
|
||||||
|
return out
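# Example of the front padding (hypothetical sentences):
#   >>> layer(['Hello .', 'How are you ?', 'Fine .'], 2)
#   ['PAD', 'Hello . How are you ?', 'How are you ? Fine .']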
|
||||||
|
|
||||||
|
|
||||||
|
def read_alignments(fin):
|
||||||
|
alignments = []
|
||||||
|
with open(fin, 'rt', encoding="utf-8") as infile:
|
||||||
|
for line in infile:
|
||||||
|
fields = [x.strip() for x in line.split(':') if len(x.strip())]
|
||||||
|
if len(fields) < 2:
|
||||||
|
raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
|
||||||
|
try:
|
||||||
|
src = literal_eval(fields[0])
|
||||||
|
tgt = literal_eval(fields[1])
|
||||||
|
except:
|
||||||
|
raise Exception('Failed to parse line "%s"' % line.strip())
|
||||||
|
alignments.append((src, tgt))
|
||||||
|
|
||||||
|
# I know bluealign files have a few entries missing,
|
||||||
|
# but I don't fix them, in order to be consistent with previously reported scores
|
||||||
|
return alignments
|
||||||
|
|
||||||
|
|
||||||
|
def print_alignments(alignments, scores=None, file=sys.stdout):
|
||||||
|
if scores is not None:
|
||||||
|
for (x, y), s in zip(alignments, scores):
|
||||||
|
print('%s:%s:%.6f' % (x, y, s), file=file)
|
||||||
|
else:
|
||||||
|
for x, y in alignments:
|
||||||
|
print('%s:%s' % (x, y), file=file)
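# Example output (hypothetical indices and scores), one alignment per line in the
# same "src:tgt[:score]" format that read_alignments() parses:
#   [0]:[0]:0.032000
#   [1, 2]:[1]:0.081000
#   [3]:[]:0.000000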
|
||||||
|
|
||||||
|
|
||||||
|
class DeletionKnob(object):
|
||||||
|
"""
|
||||||
|
A good deletion penalty is dependent on normalization, and probably language, domain, etc, etc
|
||||||
|
I want a way to control deletion penalty that generalizes well...
|
||||||
|
Sampling costs and using a percentile seems to work fairly well.
|
||||||
|
"""
|
||||||
|
def __init__(self, samp, res_min, res_max):
|
||||||
|
|
||||||
|
self.res_min = res_min
|
||||||
|
self.res_max = res_max
|
||||||
|
|
||||||
|
if self.res_min >= self.res_max:
|
||||||
|
logger.warning('res_max <= res_min, increasing it')
|
||||||
|
self.res_max = self.res_min + 1e-4
|
||||||
|
|
||||||
|
num_bins = 1000
|
||||||
|
num_pts = 30
|
||||||
|
|
||||||
|
self.hist, self.bin_edges = np.histogram(samp, bins=num_bins,
|
||||||
|
range=[self.res_min, self.res_max],
|
||||||
|
density=True)
|
||||||
|
|
||||||
|
dx = self.bin_edges[1] - self.bin_edges[0]
|
||||||
|
self.cdf = np.cumsum(self.hist) * dx
|
||||||
|
|
||||||
|
interp_points = [(0, self.res_min), ]
|
||||||
|
for knob_val in np.linspace(0, 1, num_pts - 1)[1:-1]:
|
||||||
|
cdf_idx = np.searchsorted(self.cdf, knob_val)
|
||||||
|
cdf_val = self.res_min + cdf_idx / float(num_bins) * (self.res_max - self.res_min)
|
||||||
|
interp_points.append((knob_val, cdf_val))
|
||||||
|
interp_points.append((1, self.res_max))
|
||||||
|
self.x, self.y = zip(*interp_points)
|
||||||
|
|
||||||
|
def percentile_frac_to_del_penalty(self, knob_val):
|
||||||
|
del_pen = np.interp([knob_val], self.x, self.y)[0]
|
||||||
|
return del_pen
|
||||||
|
|
||||||
|
|
||||||
|
def make_alignment_types(max_alignment_size):
|
||||||
|
# return a list of all (n, m) alignment types with n + m <= max_alignment_size
|
||||||
|
alignment_types = []
|
||||||
|
for x in range(1, max_alignment_size):
|
||||||
|
for y in range(1, max_alignment_size):
|
||||||
|
if x + y <= max_alignment_size:
|
||||||
|
alignment_types.append((x, y))
|
||||||
|
return alignment_types
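# Example: all alignment types with n + m <= 4, i.e. what --alignment_max_size 4 yields:
#   >>> make_alignment_types(4)
#   [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (3, 1)]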
|
||||||
|
|
||||||
|
|
||||||
|
def ab2xy_w_offset(aa, bb_idx, bb_offset):
|
||||||
|
bb_from_side = bb_idx + bb_offset[aa]
|
||||||
|
xx = aa - bb_from_side
|
||||||
|
yy = bb_from_side
|
||||||
|
return (xx, yy)
|
||||||
|
|
||||||
|
|
||||||
|
def xy2ab_w_offset(xx, yy, bb_offset):
|
||||||
|
aa = xx + yy
|
||||||
|
bb_from_side = yy
|
||||||
|
bb = bb_from_side - bb_offset[aa]
|
||||||
|
return aa, bb
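# Round-trip example (hypothetical offsets; the real code passes an int32 array,
# but a plain list behaves the same for illustration):
#   >>> b_offset = [0, 0, -1, -1, -2]
#   >>> xy2ab_w_offset(3, 1, b_offset)
#   (4, 3)
#   >>> ab2xy_w_offset(4, 3, b_offset)
#   (3, 1)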
|
||||||
|
|
||||||
|
|
||||||
|
def process_scores(scores, alignments):
|
||||||
|
# floating point sometimes gives negative numbers, which is a little unnerving ...
|
||||||
|
scores = np.clip(scores, a_min=0, a_max=None)
|
||||||
|
|
||||||
|
for ii, (x_algn, y_algn) in enumerate(alignments):
|
||||||
|
# deletion penalty is pretty arbitrary, just report 0
|
||||||
|
if len(x_algn) == 0 or len(y_algn) == 0:
|
||||||
|
scores[ii] = 0.0
|
||||||
|
# report scores un-normalized by alignment sizes
|
||||||
|
# (still normalized with random vectors, though)
|
||||||
|
else:
|
||||||
|
scores[ii] = scores[ii] / len(x_algn) / len(y_algn)
|
||||||
|
|
||||||
|
return scores
|
||||||
|
|
||||||
|
|
||||||
|
def sparse_traceback(a_b_csum, a_b_xp, a_b_yp, b_offset, xsize, ysize):
|
||||||
|
alignments = []
|
||||||
|
xx = xsize
|
||||||
|
yy = ysize
|
||||||
|
|
||||||
|
cum_costs = []
|
||||||
|
|
||||||
|
while True:
|
||||||
|
aa, bb = xy2ab_w_offset(xx, yy, b_offset)
|
||||||
|
|
||||||
|
cum_costs.append(a_b_csum[aa, bb])
|
||||||
|
|
||||||
|
xp = a_b_xp[aa, bb]
|
||||||
|
yp = a_b_yp[aa, bb]
|
||||||
|
|
||||||
|
if xx == yy == 0:
|
||||||
|
break
|
||||||
|
|
||||||
|
if xx < 0 or yy < 0:
|
||||||
|
raise Exception('traceback bug')
|
||||||
|
|
||||||
|
x_side = list(range(xx - xp, xx))
|
||||||
|
y_side = list(range(yy - yp, yy))
|
||||||
|
alignments.append((x_side, y_side))
|
||||||
|
|
||||||
|
xx = xx - xp
|
||||||
|
yy = yy - yp
|
||||||
|
|
||||||
|
alignments.reverse()
|
||||||
|
cum_costs.reverse()
|
||||||
|
costs = np.array(cum_costs[1:]) - np.array(cum_costs[:-1])
|
||||||
|
# "costs" are scaled by x_alignment_size * y_alignment_size
|
||||||
|
# and the cost of a deletion is del_penalty
|
||||||
|
# "scores": 0 for deletion/insertion,
|
||||||
|
# and cosine distance, *not* scaled
|
||||||
|
# by len(x_alignment)*len(y_alignment)
|
||||||
|
scores = process_scores(scores=costs, alignments=alignments)
|
||||||
|
|
||||||
|
return alignments, scores
|
||||||
|
|
||||||
|
|
||||||
|
def dense_traceback(x_y_tb):
|
||||||
|
xsize, ysize = x_y_tb.shape
|
||||||
|
|
||||||
|
xx = xsize - 1
|
||||||
|
yy = ysize - 1
|
||||||
|
|
||||||
|
alignments = []
|
||||||
|
while True:
|
||||||
|
if xx == yy == 0:
|
||||||
|
break
|
||||||
|
bp = x_y_tb[xx, yy]
|
||||||
|
if bp == 0:
|
||||||
|
xp, yp = 1, 1
|
||||||
|
alignments.append(([xx - 1], [yy - 1]))
|
||||||
|
elif bp == 1:
|
||||||
|
xp, yp = 0, 1
|
||||||
|
alignments.append(([], [yy - 1]))
|
||||||
|
elif bp == 2:
|
||||||
|
xp, yp = 1, 0
|
||||||
|
alignments.append(([xx - 1], []))
|
||||||
|
else:
|
||||||
|
raise Exception('got unknown value')
|
||||||
|
|
||||||
|
xx = xx - xp
|
||||||
|
yy = yy - yp
|
||||||
|
|
||||||
|
alignments.reverse()
|
||||||
|
|
||||||
|
return alignments
|
||||||
|
|
||||||
|
|
||||||
|
def append_slant(path, xwidth, ywidth):
|
||||||
|
"""
|
||||||
|
Append quantized approximation to a straight line
|
||||||
|
from current x,y to a point at (x+xwidth, y+ywidth)
|
||||||
|
"""
|
||||||
|
NN = xwidth + ywidth
|
||||||
|
xstart, ystart = path[-1]
|
||||||
|
for ii in range(1, NN + 1):
|
||||||
|
x = xstart + round(xwidth * ii / NN)
|
||||||
|
y = ystart + round(ywidth * ii / NN)
|
||||||
|
# In the case of ties we want them to round differently,
|
||||||
|
# so explicitly make sure we take a step of 1, not 0 or 2
|
||||||
|
lastx, lasty = path[-1]
|
||||||
|
delta = x + y - lastx - lasty
|
||||||
|
if delta == 1:
|
||||||
|
path.append((x, y))
|
||||||
|
elif delta == 2:
|
||||||
|
path.append((x - 1, y))
|
||||||
|
elif delta == 0:
|
||||||
|
path.append((x + 1, y))
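# Example (hypothetical widths): exactly one step in x XOR y per point, even
# though the target point is reached along a slanted line:
#   >>> path = [(0, 0)]
#   >>> append_slant(path, 2, 1)
#   >>> path
#   [(0, 0), (1, 0), (1, 1), (2, 1)]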
|
||||||
|
|
||||||
|
|
||||||
|
def alignment_to_search_path(algn):
|
||||||
|
"""
|
||||||
|
Given an alignment, make searchpath.
|
||||||
|
Searchpath must step exactly one position in x XOR y at each time step.
|
||||||
|
|
||||||
|
In the case of a block of deletions, the order found by DP is not meaningful.
|
||||||
|
To make things consistent and to improve the probability of recovering
|
||||||
|
from search errors, we search an approximately straight line
|
||||||
|
through a block of deletions. We do the same through a many-many
|
||||||
|
alignment, even though we currently don't refine a many-many alignment...
|
||||||
|
"""
|
||||||
|
path = [(0, 0), ]
|
||||||
|
xdel, ydel = 0, 0
|
||||||
|
ydel = 0
|
||||||
|
for x, y in algn:
|
||||||
|
if len(x) and len(y):
|
||||||
|
append_slant(path, xdel, ydel)
|
||||||
|
xdel, ydel = 0, 0
|
||||||
|
append_slant(path, len(x), len(y))
|
||||||
|
elif len(x):
|
||||||
|
xdel += len(x)
|
||||||
|
elif len(y):
|
||||||
|
ydel += len(y)
|
||||||
|
|
||||||
|
append_slant(path, xdel, ydel)
|
||||||
|
|
||||||
|
return path
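# Example (hypothetical alignment): a single 1-2 alignment becomes a staircase of
# unit steps, which is what the banded search above expects:
#   >>> alignment_to_search_path([([0], [0, 1])])
#   [(0, 0), (0, 1), (1, 1), (1, 2)]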
|
||||||
|
|
||||||
|
|
||||||
|
def extend_alignments(course_alignments, size0, size1):
|
||||||
|
"""
|
||||||
|
extend alignments to include new endpoints size0, size1
|
||||||
|
if alignments are larger than size0/size1, raise exception
|
||||||
|
"""
|
||||||
|
# could be a string of deletions or insertions at end, so cannot just grab last one
|
||||||
|
xmax = 0 # maximum x value in course_alignments
|
||||||
|
ymax = 0 # maximum y value in course_alignments
|
||||||
|
for x, y in course_alignments:
|
||||||
|
for xval in x:
|
||||||
|
xmax = max(xmax, xval)
|
||||||
|
for yval in y:
|
||||||
|
ymax = max(ymax, yval)
|
||||||
|
|
||||||
|
if xmax > size0 or ymax > size1:
|
||||||
|
raise Exception('asked to extend alignments but already bigger than requested')
|
||||||
|
|
||||||
|
# do not duplicate xmax/ymax, do include size0/size1
|
||||||
|
extra_x = list(range(xmax + 1, size0 + 1))
|
||||||
|
extra_y = list(range(ymax + 1, size1 + 1))
|
||||||
|
|
||||||
|
logger.debug('extending alignments in x by %d and y by %d', len(extra_x), len(extra_y))
|
||||||
|
|
||||||
|
if len(extra_x) == 0:
|
||||||
|
for yval in extra_y:
|
||||||
|
course_alignments.append(([], [yval]))
|
||||||
|
elif len(extra_y) == 0:
|
||||||
|
for xval in extra_x:
|
||||||
|
course_alignments.append(([xval], []))
|
||||||
|
else:
|
||||||
|
course_alignments.append((extra_x, extra_y))
|
||||||
|
|
||||||
|
|
||||||
|
def upsample_alignment(algn):
|
||||||
|
def upsample_one_alignment(xx):
|
||||||
|
return list(range(min(xx) * 2, (max(xx) + 1) * 2))
|
||||||
|
|
||||||
|
new_algn = []
|
||||||
|
for xx, yy in algn:
|
||||||
|
if len(xx) == 0:
|
||||||
|
for yyy in upsample_one_alignment(yy):
|
||||||
|
new_algn.append(([], [yyy]))
|
||||||
|
elif len(yy) == 0:
|
||||||
|
for xxx in upsample_one_alignment(xx):
|
||||||
|
new_algn.append(([xxx], []))
|
||||||
|
else:
|
||||||
|
new_algn.append((upsample_one_alignment(xx), upsample_one_alignment(yy)))
|
||||||
|
return new_algn
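# Example (hypothetical alignment): indices at depth d+1 map to index ranges at
# the next-finer depth d:
#   >>> upsample_alignment([([0], [0]), ([1], [1, 2])])
#   [([0, 1], [0, 1]), ([2, 3], [2, 3, 4, 5])]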
|
||||||
|
|
||||||
|
|
||||||
|
def make_del_knob(e_laser,
|
||||||
|
f_laser,
|
||||||
|
e_laser_norms,
|
||||||
|
f_laser_norms,
|
||||||
|
sample_size):
|
||||||
|
e_size = e_laser.shape[0]
|
||||||
|
f_size = f_laser.shape[0]
|
||||||
|
|
||||||
|
if e_size > 0 and f_size > 0 and sample_size > 0:
|
||||||
|
|
||||||
|
if e_size * f_size < sample_size:
|
||||||
|
# don't sample, just compute the full matrix
|
||||||
|
sample_size = e_size * f_size
|
||||||
|
x_idxs = np.zeros(sample_size, dtype=np.int32)
|
||||||
|
y_idxs = np.zeros(sample_size, dtype=np.int32)
|
||||||
|
c = 0
|
||||||
|
for ii in range(e_size):
|
||||||
|
for jj in range(f_size):
|
||||||
|
x_idxs[c] = ii
|
||||||
|
y_idxs[c] = jj
|
||||||
|
c += 1
|
||||||
|
else:
|
||||||
|
# get random samples
|
||||||
|
x_idxs = np.random.choice(range(e_size), size=sample_size, replace=True).astype(np.int32)
|
||||||
|
y_idxs = np.random.choice(range(f_size), size=sample_size, replace=True).astype(np.int32)
|
||||||
|
|
||||||
|
# output
|
||||||
|
random_scores = np.empty(sample_size, dtype=np.float32)
|
||||||
|
|
||||||
|
score_path(x_idxs, y_idxs,
|
||||||
|
e_laser_norms, f_laser_norms,
|
||||||
|
e_laser, f_laser,
|
||||||
|
random_scores, )
|
||||||
|
|
||||||
|
min_score = 0
|
||||||
|
max_score = max(random_scores) # could bump this up... but its probably fine
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Not much we can do here...
|
||||||
|
random_scores = np.array([0.0, 0.5, 1.0]) # ???
|
||||||
|
min_score = 0
|
||||||
|
max_score = 1 # ????
|
||||||
|
|
||||||
|
del_knob = DeletionKnob(random_scores, min_score, max_score)
|
||||||
|
|
||||||
|
return del_knob
|
||||||
|
|
||||||
|
|
||||||
|
def compute_norms(vecs0, vecs1, num_samples, overlaps_to_use=None):
|
||||||
|
# overlaps_to_use = 10 # 10 matches before
|
||||||
|
|
||||||
|
overlaps1, size1, dim = vecs1.shape
|
||||||
|
overlaps0, size0, dim0 = vecs0.shape
|
||||||
|
assert (dim == dim0)
|
||||||
|
|
||||||
|
if overlaps_to_use is not None:
|
||||||
|
if overlaps_to_use > overlaps1:
|
||||||
|
raise Exception('Cannot use more overlaps than provided. You may want to re-run overlap.py with a larger -n value')
|
||||||
|
else:
|
||||||
|
overlaps_to_use = overlaps1
|
||||||
|
|
||||||
|
samps_per_overlap = ceil(num_samples / overlaps_to_use)
|
||||||
|
|
||||||
|
if size1 and samps_per_overlap:
|
||||||
|
# sample the other side (from all overlaps) to compare against this side
|
||||||
|
vecs1_rand_sample = np.empty((samps_per_overlap * overlaps_to_use, dim), dtype=np.float32)
|
||||||
|
for overlap_ii in range(overlaps_to_use):
|
||||||
|
idxs = np.random.choice(range(size1), size=samps_per_overlap, replace=True)
|
||||||
|
random_vecs = vecs1[overlap_ii, idxs, :]
|
||||||
|
vecs1_rand_sample[overlap_ii * samps_per_overlap:(overlap_ii + 1) * samps_per_overlap, :] = random_vecs
|
||||||
|
|
||||||
|
norms0 = np.empty((overlaps0, size0), dtype=np.float32)
|
||||||
|
for overlap_ii in range(overlaps0):
|
||||||
|
e_laser = vecs0[overlap_ii, :, :]
|
||||||
|
sim = np.matmul(e_laser, vecs1_rand_sample.T)
|
||||||
|
norms0[overlap_ii, :] = 1.0 - sim.mean(axis=1)
|
||||||
|
|
||||||
|
else: # no samples, no normalization
|
||||||
|
norms0 = np.ones((overlaps0, size0)).astype(np.float32)
|
||||||
|
|
||||||
|
return norms0
|
||||||
|
|
||||||
|
|
||||||
|
def downsample_vectors(vecs1):
|
||||||
|
a, b, c = vecs1.shape
|
||||||
|
half = np.empty((a, b // 2, c), dtype=np.float32)
|
||||||
|
for ii in range(a):
|
||||||
|
# average consecutive vectors
|
||||||
|
for jj in range(0, b - b % 2, 2):
|
||||||
|
v1 = vecs1[ii, jj, :]
|
||||||
|
v2 = vecs1[ii, jj + 1, :]
|
||||||
|
half[ii, jj // 2, :] = v1 + v2
|
||||||
|
# compute mean for all vectors
|
||||||
|
mean = np.mean(half[ii, :, :], axis=0)
|
||||||
|
for jj in range(0, b - b % 2, 2):
|
||||||
|
# remove mean
|
||||||
|
half[ii, jj // 2, :] = half[ii, jj // 2, :] - mean
|
||||||
|
# make vectors norm==1 so dot product is cosine distance
|
||||||
|
make_norm1(half)
|
||||||
|
return half
|
||||||
|
|
||||||
|
|
||||||
|
def vecalign(vecs0,
|
||||||
|
vecs1,
|
||||||
|
final_alignment_types,
|
||||||
|
del_percentile_frac,
|
||||||
|
width_over2,
|
||||||
|
max_size_full_dp,
|
||||||
|
costs_sample_size,
|
||||||
|
num_samps_for_norm,
|
||||||
|
norms0=None,
|
||||||
|
norms1=None):
|
||||||
|
if width_over2 < 3:
|
||||||
|
logger.warning('width_over2 was set to %d, which does not make sense. increasing to 3.', width_over2)
|
||||||
|
width_over2 = 3
|
||||||
|
|
||||||
|
# make sure input embeddings are norm==1
|
||||||
|
make_norm1(vecs0)
|
||||||
|
make_norm1(vecs1)
|
||||||
|
|
||||||
|
# save off runtime stats for summary
|
||||||
|
runtimes = OrderedDict()
|
||||||
|
|
||||||
|
# Determine stack depth
|
||||||
|
s0, s1 = vecs0.shape[1], vecs1.shape[1]
|
||||||
|
max_depth = 0
|
||||||
|
while s0 * s1 > max_size_full_dp ** 2:
|
||||||
|
max_depth += 1
|
||||||
|
s0 = s0 // 2
|
||||||
|
s1 = s1 // 2
|
||||||
|
|
||||||
|
# init recursion stack
|
||||||
|
# depth is 0-based (full size is 0, 1 is half, 2 is quarter, etc)
|
||||||
|
stack = {0: {'v0': vecs0, 'v1': vecs1}}
|
||||||
|
|
||||||
|
# downsample sentence vectors
|
||||||
|
t0 = time()
|
||||||
|
for depth in range(1, max_depth + 1):
|
||||||
|
stack[depth] = {'v0': downsample_vectors(stack[depth - 1]['v0']),
|
||||||
|
'v1': downsample_vectors(stack[depth - 1]['v1'])}
|
||||||
|
runtimes['Downsample embeddings'] = time() - t0
|
||||||
|
|
||||||
|
# compute norms for all depths, add sizes, add alignment types
|
||||||
|
t0 = time()
|
||||||
|
for depth in stack:
|
||||||
|
stack[depth]['size0'] = stack[depth]['v0'].shape[1]
|
||||||
|
stack[depth]['size1'] = stack[depth]['v1'].shape[1]
|
||||||
|
stack[depth]['alignment_types'] = final_alignment_types if depth == 0 else [(1, 1)]
|
||||||
|
|
||||||
|
if depth == 0 and norms0 is not None:
|
||||||
|
if norms0.shape != vecs0.shape[:2]:
|
||||||
|
print('norms0.shape:', norms0.shape)
|
||||||
|
print('vecs0.shape[:2]:', vecs0.shape[:2])
|
||||||
|
raise Exception('norms0 wrong shape')
|
||||||
|
stack[depth]['n0'] = norms0
|
||||||
|
else:
|
||||||
|
stack[depth]['n0'] = compute_norms(stack[depth]['v0'], stack[depth]['v1'], num_samps_for_norm)
|
||||||
|
|
||||||
|
if depth == 0 and norms1 is not None:
|
||||||
|
if norms1.shape != vecs1.shape[:2]:
|
||||||
|
print('norms1.shape:', norms1.shape)
|
||||||
|
print('vecs1.shape[:2]:', vecs1.shape[:2])
|
||||||
|
raise Exception('norms1 wrong shape')
|
||||||
|
stack[depth]['n1'] = norms1
|
||||||
|
else:
|
||||||
|
stack[depth]['n1'] = compute_norms(stack[depth]['v1'], stack[depth]['v0'], num_samps_for_norm)
|
||||||
|
|
||||||
|
runtimes['Normalize embeddings'] = time() - t0
|
||||||
|
|
||||||
|
# Compute deletion penalty for all depths
|
||||||
|
t0 = time()
|
||||||
|
for depth in stack:
|
||||||
|
stack[depth]['del_knob'] = make_del_knob(e_laser=stack[depth]['v0'][0, :, :],
|
||||||
|
f_laser=stack[depth]['v1'][0, :, :],
|
||||||
|
e_laser_norms=stack[depth]['n0'][0, :],
|
||||||
|
f_laser_norms=stack[depth]['n1'][0, :],
|
||||||
|
sample_size=costs_sample_size)
|
||||||
|
stack[depth]['del_penalty'] = stack[depth]['del_knob'].percentile_frac_to_del_penalty(del_percentile_frac)
|
||||||
|
logger.debug('del_penalty at depth %d: %f', depth, stack[depth]['del_penalty'])
|
||||||
|
runtimes['Compute deletion penalties'] = time() - t0
|
||||||
|
tt = time() - t0
|
||||||
|
logger.debug('%d x %d full DP make features: %.6fs (%.3e per dot product)',
|
||||||
|
stack[max_depth]['size0'], stack[max_depth]['size1'], tt,
|
||||||
|
tt / (stack[max_depth]['size0'] + 1e-6) / (stack[max_depth]['size1'] + 1e-6))
|
||||||
|
# full DP at maximum recursion depth
|
||||||
|
t0 = time()
|
||||||
|
stack[max_depth]['costs_1to1'] = make_dense_costs(stack[max_depth]['v0'],
|
||||||
|
stack[max_depth]['v1'],
|
||||||
|
stack[max_depth]['n0'],
|
||||||
|
stack[max_depth]['n1'])
|
||||||
|
|
||||||
|
runtimes['Full DP make features'] = time() - t0
|
||||||
|
t0 = time()
|
||||||
|
_, stack[max_depth]['x_y_tb'] = dense_dp(stack[max_depth]['costs_1to1'], stack[max_depth]['del_penalty'])
|
||||||
|
stack[max_depth]['alignments'] = dense_traceback(stack[max_depth]['x_y_tb'])
|
||||||
|
runtimes['Full DP'] = time() - t0
|
||||||
|
|
||||||
|
# upsample the path up to the top resolution
|
||||||
|
compute_costs_times = []
|
||||||
|
dp_times = []
|
||||||
|
upsample_depths = [0, ] if max_depth == 0 else list(reversed(range(0, max_depth)))
|
||||||
|
for depth in upsample_depths:
|
||||||
|
if max_depth > 0:  # upsample previous alignment to current resolution
|
||||||
|
course_alignments = upsample_alignment(stack[depth + 1]['alignments'])
|
||||||
|
# features may have been truncated when downsampling, so alignments may need to be extended
|
||||||
|
extend_alignments(course_alignments, stack[depth]['size0'], stack[depth]['size1']) # in-place
|
||||||
|
else: # We did a full size 1-1 search, so search same size with more alignment types
|
||||||
|
course_alignments = stack[0]['alignments']
|
||||||
|
|
||||||
|
# convert coarse alignments to a search path
|
||||||
|
stack[depth]['searchpath'] = alignment_to_search_path(course_alignments)
|
||||||
|
|
||||||
|
# compute costs for sparse DP
|
||||||
|
t0 = time()
|
||||||
|
stack[depth]['a_b_costs'], stack[depth]['b_offset'] = make_sparse_costs(stack[depth]['v0'], stack[depth]['v1'],
|
||||||
|
stack[depth]['n0'], stack[depth]['n1'],
|
||||||
|
stack[depth]['searchpath'],
|
||||||
|
stack[depth]['alignment_types'],
|
||||||
|
width_over2)
|
||||||
|
|
||||||
|
tt = time() - t0
|
||||||
|
num_dot_products = len(stack[depth]['b_offset']) * len(stack[depth]['alignment_types']) * width_over2 * 2
|
||||||
|
logger.debug('%d x %d sparse DP (%d alignment types, %d window) make features: %.6fs (%.3e per dot product)',
|
||||||
|
stack[max_depth]['size0'], stack[max_depth]['size1'],
|
||||||
|
len(stack[depth]['alignment_types']), width_over2 * 2,
|
||||||
|
tt, tt / (num_dot_products + 1e-6))
|
||||||
|
|
||||||
|
compute_costs_times.append(time() - t0)
|
||||||
|
t0 = time()
|
||||||
|
# perform sparse DP
|
||||||
|
stack[depth]['a_b_csum'], stack[depth]['a_b_xp'], stack[depth]['a_b_yp'], \
|
||||||
|
stack[depth]['new_b_offset'] = sparse_dp(stack[depth]['a_b_costs'], stack[depth]['b_offset'],
|
||||||
|
stack[depth]['alignment_types'], stack[depth]['del_penalty'],
|
||||||
|
stack[depth]['size0'], stack[depth]['size1'])
|
||||||
|
|
||||||
|
# perform traceback to get alignments and alignment scores
|
||||||
|
# for debugging, avoid overwriting stack[depth]['alignments']
|
||||||
|
akey = 'final_alignments' if depth == 0 else 'alignments'
|
||||||
|
stack[depth][akey], stack[depth]['alignment_scores'] = sparse_traceback(stack[depth]['a_b_csum'],
|
||||||
|
stack[depth]['a_b_xp'],
|
||||||
|
stack[depth]['a_b_yp'],
|
||||||
|
stack[depth]['new_b_offset'],
|
||||||
|
stack[depth]['size0'],
|
||||||
|
stack[depth]['size1'])
|
||||||
|
dp_times.append(time() - t0)
|
||||||
|
|
||||||
|
runtimes['Upsample DP compute costs'] = sum(compute_costs_times[:-1])
|
||||||
|
runtimes['Upsample DP'] = sum(dp_times[:-1])
|
||||||
|
|
||||||
|
runtimes['Final DP compute costs'] = compute_costs_times[-1]
|
||||||
|
runtimes['Final DP'] = dp_times[-1]
|
||||||
|
|
||||||
|
# log time stats
|
||||||
|
max_key_str_len = max([len(key) for key in runtimes])
|
||||||
|
for key in runtimes:
|
||||||
|
if runtimes[key] > 5e-5:
|
||||||
|
logger.info(key + ' took ' + '.' * (max_key_str_len + 5 - len(key)) + ('%.4fs' % runtimes[key]).rjust(7))
|
||||||
|
|
||||||
|
return stack
|
||||||
61
bin/vecalign/overlap.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Copyright 2019 Brian Thompson
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
https://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from dp_utils import yield_overlaps
|
||||||
|
|
||||||
|
|
||||||
|
def go(output_file, input_files, num_overlaps):
|
||||||
|
output = set()
|
||||||
|
for fin in input_files:
|
||||||
|
lines = open(fin, 'rt', encoding="utf-8").readlines()
|
||||||
|
for out_line in yield_overlaps(lines, num_overlaps):
|
||||||
|
output.add(out_line)
|
||||||
|
|
||||||
|
# for reproducibility
|
||||||
|
output = list(output)
|
||||||
|
output.sort()
|
||||||
|
|
||||||
|
with open(output_file, 'wt', encoding="utf-8") as fout:
|
||||||
|
for line in output:
|
||||||
|
fout.write(line + '\n')
|
||||||
|
|
||||||
|
|
||||||
|
def _main():
|
||||||
|
parser = argparse.ArgumentParser('Create text file containing overlapping sentences.',
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
|
|
||||||
|
parser.add_argument('-i', '--inputs', type=str, nargs='+',
|
||||||
|
help='input text file(s).')
|
||||||
|
|
||||||
|
parser.add_argument('-o', '--output', type=str,
|
||||||
|
help='output text file containing overlapping sentences')
|
||||||
|
|
||||||
|
parser.add_argument('-n', '--num_overlaps', type=int, default=4,
|
||||||
|
help='Maximum number of allowed overlaps.')
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
go(output_file=args.output,
|
||||||
|
num_overlaps=args.num_overlaps,
|
||||||
|
input_files=args.inputs)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
_main()
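# Example invocation (hypothetical paths): build the overlap file for one side of
# a job before embedding it, allowing overlaps of up to 4 consecutive sentences:
#   python overlap.py -i corpus.fr.txt -o overlaps.fr.txt -n 4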
|
||||||
170
bin/vecalign/score.py
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Copyright 2019 Brian Thompson
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
https://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from dp_utils import read_alignments
|
||||||
|
|
||||||
|
"""
|
||||||
|
Faster implementation of lax and strict precision and recall, based on
|
||||||
|
https://www.aclweb.org/anthology/W11-4624/.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _precision(goldalign, testalign):
|
||||||
|
"""
|
||||||
|
Computes tpstrict, fpstrict, tplax, fplax for gold/test alignments
|
||||||
|
"""
|
||||||
|
tpstrict = 0 # true positive strict counter
|
||||||
|
tplax = 0 # true positive lax counter
|
||||||
|
fpstrict = 0 # false positive strict counter
|
||||||
|
fplax = 0 # false positive lax counter
|
||||||
|
|
||||||
|
# convert to sets, remove alignments empty on both sides
|
||||||
|
testalign = set([(tuple(x), tuple(y)) for x, y in testalign if len(x) or len(y)])
|
||||||
|
goldalign = set([(tuple(x), tuple(y)) for x, y in goldalign if len(x) or len(y)])
|
||||||
|
|
||||||
|
# mappings from source test sentence idxs to
|
||||||
|
# target gold sentence idxs for which the source test sentence
|
||||||
|
# was found in corresponding source gold alignment
|
||||||
|
src_id_to_gold_tgt_ids = defaultdict(set)
|
||||||
|
for gold_src, gold_tgt in goldalign:
|
||||||
|
for gold_src_id in gold_src:
|
||||||
|
for gold_tgt_id in gold_tgt:
|
||||||
|
src_id_to_gold_tgt_ids[gold_src_id].add(gold_tgt_id)
|
||||||
|
|
||||||
|
for (test_src, test_target) in testalign:
|
||||||
|
if (test_src, test_target) == ((), ()):
|
||||||
|
continue
|
||||||
|
if (test_src, test_target) in goldalign:
|
||||||
|
# strict match
|
||||||
|
tpstrict += 1
|
||||||
|
tplax += 1
|
||||||
|
else:
|
||||||
|
# For anything with partial gold/test overlap on the source,
|
||||||
|
# see if there is also partial overlap on the gold/test target
|
||||||
|
# If so, it's a lax match
|
||||||
|
target_ids = set()
|
||||||
|
for src_test_id in test_src:
|
||||||
|
for tgt_id in src_id_to_gold_tgt_ids[src_test_id]:
|
||||||
|
target_ids.add(tgt_id)
|
||||||
|
if set(test_target).intersection(target_ids):
|
||||||
|
fpstrict += 1
|
||||||
|
tplax += 1
|
||||||
|
else:
|
||||||
|
fpstrict += 1
|
||||||
|
fplax += 1
|
||||||
|
|
||||||
|
return np.array([tpstrict, fpstrict, tplax, fplax], dtype=np.int32)
|
||||||
|
|
||||||
|
|
||||||
|
def score_multiple(gold_list, test_list, value_for_div_by_0=0.0):
|
||||||
|
# accumulate counts for all gold/test files
|
||||||
|
pcounts = np.array([0, 0, 0, 0], dtype=np.int32)
|
||||||
|
rcounts = np.array([0, 0, 0, 0], dtype=np.int32)
|
||||||
|
for goldalign, testalign in zip(gold_list, test_list):
|
||||||
|
pcounts += _precision(goldalign=goldalign, testalign=testalign)
|
||||||
|
# recall is precision with insertions/deletions removed and the gold/test arguments swapped
|
||||||
|
test_no_del = [(x, y) for x, y in testalign if len(x) and len(y)]
|
||||||
|
gold_no_del = [(x, y) for x, y in goldalign if len(x) and len(y)]
|
||||||
|
rcounts += _precision(goldalign=test_no_del, testalign=gold_no_del)
|
||||||
|
|
||||||
|
# Compute results
|
||||||
|
# pcounts: tpstrict, fpstrict, tplax, fplax
|
||||||
|
# rcounts: tpstrict, fnstrict, tplax, fnlax
|
||||||
|
|
||||||
|
if pcounts[0] + pcounts[1] == 0:
|
||||||
|
pstrict = value_for_div_by_0
|
||||||
|
else:
|
||||||
|
pstrict = pcounts[0] / float(pcounts[0] + pcounts[1])
|
||||||
|
|
||||||
|
if pcounts[2] + pcounts[3] == 0:
|
||||||
|
plax = value_for_div_by_0
|
||||||
|
else:
|
||||||
|
plax = pcounts[2] / float(pcounts[2] + pcounts[3])
|
||||||
|
|
||||||
|
if rcounts[0] + rcounts[1] == 0:
|
||||||
|
rstrict = value_for_div_by_0
|
||||||
|
else:
|
||||||
|
rstrict = rcounts[0] / float(rcounts[0] + rcounts[1])
|
||||||
|
|
||||||
|
if rcounts[2] + rcounts[3] == 0:
|
||||||
|
rlax = value_for_div_by_0
|
||||||
|
else:
|
||||||
|
rlax = rcounts[2] / float(rcounts[2] + rcounts[3])
|
||||||
|
|
||||||
|
if (pstrict + rstrict) == 0:
|
||||||
|
fstrict = value_for_div_by_0
|
||||||
|
else:
|
||||||
|
fstrict = 2 * (pstrict * rstrict) / (pstrict + rstrict)
|
||||||
|
|
||||||
|
if (plax + rlax) == 0:
|
||||||
|
flax = value_for_div_by_0
|
||||||
|
else:
|
||||||
|
flax = 2 * (plax * rlax) / (plax + rlax)
|
||||||
|
|
||||||
|
result = dict(recall_strict=rstrict,
|
||||||
|
recall_lax=rlax,
|
||||||
|
precision_strict=pstrict,
|
||||||
|
precision_lax=plax,
|
||||||
|
f1_strict=fstrict,
|
||||||
|
f1_lax=flax)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def log_final_scores(res):
|
||||||
|
print(' ---------------------------------', file=sys.stderr)
|
||||||
|
print('| | Strict | Lax |', file=sys.stderr)
|
||||||
|
print('| Precision | {precision_strict:.3f} | {precision_lax:.3f} |'.format(**res), file=sys.stderr)
|
||||||
|
print('| Recall | {recall_strict:.3f} | {recall_lax:.3f} |'.format(**res), file=sys.stderr)
|
||||||
|
print('| F1 | {f1_strict:.3f} | {f1_lax:.3f} |'.format(**res), file=sys.stderr)
|
||||||
|
print(' ---------------------------------', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
'Compute strict/lax precision and recall for one or more pairs of gold/test alignments',
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
|
|
||||||
|
parser.add_argument('-t', '--test', type=str, nargs='+', required=True,
|
||||||
|
help='one or more test alignment files')
|
||||||
|
|
||||||
|
parser.add_argument('-g', '--gold', type=str, nargs='+', required=True,
|
||||||
|
help='one or more gold alignment files')
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if len(args.test) != len(args.gold):
|
||||||
|
raise Exception('number of gold/test files must be the same')
|
||||||
|
|
||||||
|
gold_list = [read_alignments(x) for x in args.gold]
|
||||||
|
test_list = [read_alignments(x) for x in args.test]
|
||||||
|
|
||||||
|
res = score_multiple(gold_list=gold_list, test_list=test_list)
|
||||||
|
log_final_scores(res)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
165
bin/vecalign/vecalign.py
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
Copyright 2019 Brian Thompson
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
https://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import pickle
|
||||||
|
from math import ceil
|
||||||
|
from random import seed as seed
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
logger = logging.getLogger('vecalign')
|
||||||
|
logger.setLevel(logging.WARNING)
|
||||||
|
logFormatter = logging.Formatter("%(asctime)s %(levelname)-5.5s %(message)s")
|
||||||
|
consoleHandler = logging.StreamHandler()
|
||||||
|
consoleHandler.setFormatter(logFormatter)
|
||||||
|
logger.addHandler(consoleHandler)
|
||||||
|
|
||||||
|
from dp_utils import make_alignment_types, print_alignments, read_alignments, \
|
||||||
|
read_in_embeddings, make_doc_embedding, vecalign
|
||||||
|
|
||||||
|
from score import score_multiple, log_final_scores
|
||||||
|
|
||||||
|
|
||||||
|
def _main():
|
||||||
|
# make runs consistent
|
||||||
|
seed(42)
|
||||||
|
np.random.seed(42)
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser('Sentence alignment using sentence embeddings and FastDTW',
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
|
|
||||||
|
#parser.add_argument('-s', '--src', type=str, nargs='+', required=True,
|
||||||
|
# help='preprocessed source file to align')
|
||||||
|
|
||||||
|
#parser.add_argument('-t', '--tgt', type=str, nargs='+', required=True,
|
||||||
|
# help='preprocessed target file to align')
|
||||||
|
|
||||||
|
parser.add_argument('--job', type=str, required=True, help='Job file for alignment task.')
|
||||||
|
|
||||||
|
parser.add_argument('-g', '--gold_alignment', type=str, nargs='+', required=False,
|
||||||
|
help='gold alignment file(s) (optional)')
|
||||||
|
|
||||||
|
parser.add_argument('--src_embed', type=str, nargs=2, required=True,
|
||||||
|
help='Source embeddings. Requires two arguments: first is a text file, second is a binary embeddings file.')
|
||||||
|
|
||||||
|
parser.add_argument('--tgt_embed', type=str, nargs=2, required=True,
|
||||||
|
help='Target embeddings. Requires two arguments: first is a text file, second is a binary embeddings file.')
|
||||||
|
|
||||||
|
parser.add_argument('-a', '--alignment_max_size', type=int, default=5,
|
||||||
|
help='Searches for alignments up to size N-M, where N+M <= this value. Note that the embeddings must support the requested number of overlaps')
|
||||||
|
|
||||||
|
parser.add_argument('-d', '--del_percentile_frac', type=float, default=0.2,
|
||||||
|
help='Deletion penalty is set to this percentile (as a fraction) of the cost matrix distribution. Should be between 0 and 1.')
|
||||||
|
|
||||||
|
parser.add_argument('-v', '--verbose', help='sets console logging to logging.INFO instead of logging.WARNING',
|
||||||
|
action='store_true')
|
||||||
|
|
||||||
|
parser.add_argument('--max_size_full_dp', type=int, default=300,
|
||||||
|
help='Maximum size N for which it is acceptable to run full N^2 dynamic programming.')
|
||||||
|
|
||||||
|
parser.add_argument('--costs_sample_size', type=int, default=20000,
|
||||||
|
help='Sample size to estimate costs distribution, used to set deletion penalty in conjunction with --del_percentile_frac.')
|
||||||
|
|
||||||
|
parser.add_argument('--num_samps_for_norm', type=int, default=100,
|
||||||
|
help='Number of samples used for normalizing embeddings')
|
||||||
|
|
||||||
|
parser.add_argument('--search_buffer_size', type=int, default=5,
|
||||||
|
help='Width (one side) of search buffer. Larger values make the search more likely to recover from errors but increase runtime.')
|
||||||
|
|
||||||
|
parser.add_argument('--debug_save_stack', type=str,
|
||||||
|
help='Write stack to pickle file for debug purposes')
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
#if len(args.src) != len(args.tgt):
|
||||||
|
# raise Exception('number of source files must match number of target files')
|
||||||
|
|
||||||
|
#if args.gold_alignment is not None:
|
||||||
|
# if len(args.gold_alignment) != len(args.src):
|
||||||
|
# raise Exception('number of gold alignment files, if provided, must match number of source and target files')
|
||||||
|
|
||||||
|
if args.verbose:
|
||||||
|
import logging
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
if args.alignment_max_size < 2:
|
||||||
|
logger.warning('Alignment_max_size < 2. Increasing to 2 so that 1-1 alignments will be considered')
|
||||||
|
args.alignment_max_size = 2
|
||||||
|
|
||||||
|
src_sent2line, src_line_embeddings = read_in_embeddings(args.src_embed[0], args.src_embed[1])
|
||||||
|
tgt_sent2line, tgt_line_embeddings = read_in_embeddings(args.tgt_embed[0], args.tgt_embed[1])
|
||||||
|
|
||||||
|
width_over2 = ceil(args.alignment_max_size / 2.0) + args.search_buffer_size
|
||||||
|
|
||||||
|
test_alignments = []
|
||||||
|
stack_list = []
|
||||||
|
|
||||||
|
# read in alignment jobs
|
||||||
|
job = read_job(args.job)
|
||||||
|
#for src_file, tgt_file in zip(args.src, args.tgt):
|
||||||
|
for rec in job:
|
||||||
|
#logger.info('Aligning src="%s" to tgt="%s"', src_file, tgt_file)
|
||||||
|
src_file, tgt_file, align_file = rec.split("\t")
|
||||||
|
print("Aligning {} to {}".format(src_file, tgt_file))
|
||||||
|
src_lines = open(src_file, 'rt', encoding="utf-8").readlines()
|
||||||
|
vecs0 = make_doc_embedding(src_sent2line, src_line_embeddings, src_lines, args.alignment_max_size)
|
||||||
|
|
||||||
|
tgt_lines = open(tgt_file, 'rt', encoding="utf-8").readlines()
|
||||||
|
vecs1 = make_doc_embedding(tgt_sent2line, tgt_line_embeddings, tgt_lines, args.alignment_max_size)
|
||||||
|
|
||||||
|
final_alignment_types = make_alignment_types(args.alignment_max_size)
|
||||||
|
logger.debug('Considering alignment types %s', final_alignment_types)
|
||||||
|
|
||||||
|
stack = vecalign(vecs0=vecs0,
|
||||||
|
vecs1=vecs1,
|
||||||
|
final_alignment_types=final_alignment_types,
|
||||||
|
del_percentile_frac=args.del_percentile_frac,
|
||||||
|
width_over2=width_over2,
|
||||||
|
max_size_full_dp=args.max_size_full_dp,
|
||||||
|
costs_sample_size=args.costs_sample_size,
|
||||||
|
num_samps_for_norm=args.num_samps_for_norm)
|
||||||
|
|
||||||
|
# write final alignments to stdout
|
||||||
|
#print_alignments(stack[0]['final_alignments'], stack[0]['alignment_scores'])
|
||||||
|
out_f = open(align_file, 'w', encoding="utf-8")
|
||||||
|
#print_alignments(stack[0]['final_alignments'], stack[0]['alignment_scores'],file=out_f)
|
||||||
|
print_alignments(stack[0]['final_alignments'],file=out_f)
|
||||||
|
|
||||||
|
#test_alignments.append(stack[0]['final_alignments'])
|
||||||
|
#stack_list.append(stack)
|
||||||
|
|
||||||
|
#if args.gold_alignment is not None:
|
||||||
|
# gold_list = [read_alignments(x) for x in args.gold_alignment]
|
||||||
|
# res = score_multiple(gold_list=gold_list, test_list=test_alignments)
|
||||||
|
# log_final_scores(res)
|
||||||
|
|
||||||
|
#if args.debug_save_stack:
|
||||||
|
# pickle.dump(stack_list, open(args.debug_save_stack, 'wb'))
|
||||||
|
|
||||||
|
def read_job(file):
|
||||||
|
job = []
|
||||||
|
with open(file, 'r', encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
if not line.startswith("#"):
|
||||||
|
job.append(line.strip())
|
||||||
|
return job
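# Sketch of the job file format this expects (hypothetical paths): one task per
# line, tab-separated, with '#' lines treated as comments:
#   # src_file <TAB> tgt_file <TAB> output_alignment_file
#   data/book1.de.txt <TAB> data/book1.en.txt <TAB> data/book1.align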
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
_main()
|
||||||