"""
|
|
Copyright 2019 Brian Thompson
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
https://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
"""

import logging
import sys
from ast import literal_eval
from collections import OrderedDict
from math import ceil
from time import time

import numpy as np

import pyximport
pyximport.install(setup_args={'include_dirs': np.get_include()}, inplace=True, reload_support=True)

from dp_core import make_dense_costs, score_path, sparse_dp, make_sparse_costs, dense_dp

logger = logging.getLogger('vecalign')  # set up in vecalign.py


def preprocess_line(line):
    line = line.strip()
    if len(line) == 0:
        line = 'BLANK_LINE'
    return line


def yield_overlaps(lines, num_overlaps):
    lines = [preprocess_line(line) for line in lines]
    for overlap in range(1, num_overlaps + 1):
        for out_line in layer(lines, overlap):
            # check must be here so all outputs are unique
            out_line2 = out_line[:10000]  # limit line length so we don't encode arbitrarily long sentences
            yield out_line2
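
# Illustrative example (not part of the original source): yield_overlaps simply chains
# the output of layer() for overlap sizes 1..num_overlaps, truncating each line to
# 10000 characters. For instance:
#   >>> list(yield_overlaps(['a', 'b', 'c'], 2))
#   ['a', 'b', 'c', 'PAD', 'a b', 'b c']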


def read_in_embeddings(text_file, embed_file):
    """
    Given a text file with candidate sentences and a corresponding embedding file,
    make a mapping from candidate sentence to embedding index,
    and a numpy array of the embeddings
    """
    sent2line = dict()
    with open(text_file, 'rt', encoding="utf-8") as fin:
        for ii, line in enumerate(fin):
            if line.strip() in sent2line:
                raise Exception('got multiple embeddings for the same line')
            sent2line[line.strip()] = ii

    line_embeddings = np.fromfile(embed_file, dtype=np.float32, count=-1)
    if line_embeddings.size == 0:
        raise Exception('Got empty embedding file')

    laser_embedding_size = line_embeddings.size // len(sent2line)  # currently hardcoded to 1024
    if laser_embedding_size != 1024:
        logger.warning('expected an embedding size of 1024, got %s', laser_embedding_size)
    logger.info('laser_embedding_size determined to be %d', laser_embedding_size)
    line_embeddings.resize(line_embeddings.shape[0] // laser_embedding_size, laser_embedding_size)
    return sent2line, line_embeddings
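
# Usage sketch (illustrative, not in the original file). The embedding file is the raw
# float32 output of embedding the overlap file, one vector per text line, so a call
# might look like the following (file names here are hypothetical):
#   sent2line, embeddings = read_in_embeddings('overlaps.en', 'overlaps.en.emb')
#   embeddings.shape  # -> (number of overlap lines, 1024) for LASER embeddings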


def make_doc_embedding(sent2line, line_embeddings, lines, num_overlaps):
    """
    lines: sentences in input document to embed
    sent2line, line_embeddings: precomputed embeddings for lines (and overlaps of lines)
    """

    lines = [preprocess_line(line) for line in lines]

    vecsize = line_embeddings.shape[1]

    vecs0 = np.empty((num_overlaps, len(lines), vecsize), dtype=np.float32)

    for ii, overlap in enumerate(range(1, num_overlaps + 1)):
        for jj, out_line in enumerate(layer(lines, overlap)):
            try:
                line_id = sent2line[out_line]
            except KeyError:
                logger.warning('Failed to find overlap=%d line "%s". Will use random vector.', overlap, out_line)
                line_id = None

            if line_id is not None:
                vec = line_embeddings[line_id]
            else:
                vec = np.random.random(vecsize) - 0.5
                vec = vec / np.linalg.norm(vec)

            vecs0[ii, jj, :] = vec

    return vecs0
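
# Illustrative note (not in the original source): the result is a 3-D array of shape
# (num_overlaps, len(lines), embedding_dim). Row 0 holds the single-sentence vectors,
# row 1 the front-padded 2-sentence overlap vectors, and so on, e.g. (variable names
# hypothetical):
#   doc_vecs = make_doc_embedding(sent2line, embeddings, doc_lines, num_overlaps=4)
#   doc_vecs.shape  # -> (4, len(doc_lines), 1024) for 1024-dim embeddings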


def make_norm1(vecs0):
    """
    make vectors norm==1 so that cosine distance can be computed via dot product
    """
    for ii in range(vecs0.shape[0]):
        for jj in range(vecs0.shape[1]):
            norm = np.sqrt(np.square(vecs0[ii, jj, :]).sum())
            vecs0[ii, jj, :] = vecs0[ii, jj, :] / (norm + 1e-5)


def layer(lines, num_overlaps, comb=' '):
    """
    make front-padded overlapping sentences
    """
    if num_overlaps < 1:
        raise Exception('num_overlaps must be >= 1')
    out = ['PAD', ] * min(num_overlaps - 1, len(lines))
    for ii in range(len(lines) - num_overlaps + 1):
        out.append(comb.join(lines[ii:ii + num_overlaps]))
    return out
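
# Illustrative example (not part of the original source): for an overlap size of 2,
# each output entry is the concatenation of a sentence with the one before it, with
# 'PAD' filling the front so the output has the same length as the input:
#   >>> layer(['a', 'b', 'c'], 2)
#   ['PAD', 'a b', 'b c']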


def read_alignments(fin):
    alignments = []
    with open(fin, 'rt', encoding="utf-8") as infile:
        for line in infile:
            fields = [x.strip() for x in line.split(':') if len(x.strip())]
            if len(fields) < 2:
                raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
            try:
                src = literal_eval(fields[0])
                tgt = literal_eval(fields[1])
            except:
                raise Exception('Failed to parse line "%s"' % line.strip())
            alignments.append((src, tgt))

    # I know bluealign files have a few entries missing,
    # but I don't fix them, in order to be consistent with previously reported scores
    return alignments
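
# Illustrative note (not in the original source): the expected file format is the same
# ':'-separated format that print_alignments writes, one alignment per line; a trailing
# score field, if present, is ignored. For example, the line
#   [0, 1]:[0]:0.123456
# parses to the tuple ([0, 1], [0]).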


def print_alignments(alignments, scores=None, file=sys.stdout):
    if scores is not None:
        for (x, y), s in zip(alignments, scores):
            print('%s:%s:%.6f' % (x, y, s), file=file)
    else:
        for x, y in alignments:
            print('%s:%s' % (x, y), file=file)


class DeletionKnob(object):
    """
    A good deletion penalty is dependent on normalization, and probably language, domain, etc.
    I want a way to control the deletion penalty that generalizes well...
    Sampling costs and using a percentile of them seems to work fairly well.
    """
    def __init__(self, samp, res_min, res_max):

        self.res_min = res_min
        self.res_max = res_max

        if self.res_min >= self.res_max:
            logger.warning('res_max <= res_min, increasing it')
            self.res_max = self.res_min + 1e-4

        num_bins = 1000
        num_pts = 30

        self.hist, self.bin_edges = np.histogram(samp, bins=num_bins,
                                                 range=[self.res_min, self.res_max],
                                                 density=True)

        dx = self.bin_edges[1] - self.bin_edges[0]
        self.cdf = np.cumsum(self.hist) * dx

        interp_points = [(0, self.res_min), ]
        for knob_val in np.linspace(0, 1, num_pts - 1)[1:-1]:
            cdf_idx = np.searchsorted(self.cdf, knob_val)
            cdf_val = self.res_min + cdf_idx / float(num_bins) * (self.res_max - self.res_min)
            interp_points.append((knob_val, cdf_val))
        interp_points.append((1, self.res_max))
        self.x, self.y = zip(*interp_points)

    def percentile_frac_to_del_penalty(self, knob_val):
        del_pen = np.interp([knob_val], self.x, self.y)[0]
        return del_pen
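
# Usage sketch (illustrative, not in the original file): given a sample of 1-1 costs,
# the knob maps a percentile fraction to a deletion penalty, e.g. (names hypothetical):
#   knob = DeletionKnob(sampled_costs, res_min=0.0, res_max=1.0)
#   del_penalty = knob.percentile_frac_to_del_penalty(0.2)  # roughly the 20th-percentile cost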


def make_alignment_types(max_alignment_size):
    # return list of all (n, m) where n + m <= max_alignment_size
    alignment_types = []
    for x in range(1, max_alignment_size):
        for y in range(1, max_alignment_size):
            if x + y <= max_alignment_size:
                alignment_types.append((x, y))
    return alignment_types
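
# Illustrative example (not part of the original source): deletions/insertions are not
# included here; they are handled separately via the deletion penalty. For example:
#   >>> make_alignment_types(4)
#   [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (3, 1)]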


def ab2xy_w_offset(aa, bb_idx, bb_offset):
    bb_from_side = bb_idx + bb_offset[aa]
    xx = aa - bb_from_side
    yy = bb_from_side
    return (xx, yy)


def xy2ab_w_offset(xx, yy, bb_offset):
    aa = xx + yy
    bb_from_side = yy
    bb = bb_from_side - bb_offset[aa]
    return aa, bb
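
# Illustrative note (not in the original source): these two helpers convert between
# (x, y) matrix coordinates and the banded (a, b) representation used by the sparse DP,
# where a = x + y indexes the anti-diagonal and b is the position within the stored band,
# shifted by bb_offset[a]. They are inverses of each other, e.g.:
#   bb_offset = [0, 0, 1, 1, 2, 1]
#   xy2ab_w_offset(3, 2, bb_offset)  # -> (5, 1)
#   ab2xy_w_offset(5, 1, bb_offset)  # -> (3, 2)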


def process_scores(scores, alignments):
    # floating point sometimes gives negative numbers, which is a little unnerving ...
    scores = np.clip(scores, a_min=0, a_max=None)

    for ii, (x_algn, y_algn) in enumerate(alignments):
        # deletion penalty is pretty arbitrary, just report 0
        if len(x_algn) == 0 or len(y_algn) == 0:
            scores[ii] = 0.0
        # report scores un-scaled by alignment sizes
        # (still normalized with random vectors, though)
        else:
            scores[ii] = scores[ii] / len(x_algn) / len(y_algn)

    return scores


def sparse_traceback(a_b_csum, a_b_xp, a_b_yp, b_offset, xsize, ysize):
    alignments = []
    xx = xsize
    yy = ysize

    cum_costs = []

    while True:
        aa, bb = xy2ab_w_offset(xx, yy, b_offset)

        cum_costs.append(a_b_csum[aa, bb])

        xp = a_b_xp[aa, bb]
        yp = a_b_yp[aa, bb]

        if xx == yy == 0:
            break

        if xx < 0 or yy < 0:
            raise Exception('traceback bug')

        x_side = list(range(xx - xp, xx))
        y_side = list(range(yy - yp, yy))
        alignments.append((x_side, y_side))

        xx = xx - xp
        yy = yy - yp

    alignments.reverse()
    cum_costs.reverse()
    costs = np.array(cum_costs[1:]) - np.array(cum_costs[:-1])
    # "costs" are scaled by x_alignment_size * y_alignment_size
    #   and the cost of a deletion is del_penalty
    # "scores": 0 for deletion/insertion,
    #   and cosine distance, *not* scaled
    #   by len(x_alignment)*len(y_alignment)
    scores = process_scores(scores=costs, alignments=alignments)

    return alignments, scores


def dense_traceback(x_y_tb):
    xsize, ysize = x_y_tb.shape

    xx = xsize - 1
    yy = ysize - 1

    alignments = []
    while True:
        if xx == yy == 0:
            break
        bp = x_y_tb[xx, yy]
        if bp == 0:
            xp, yp = 1, 1
            alignments.append(([xx - 1], [yy - 1]))
        elif bp == 1:
            xp, yp = 0, 1
            alignments.append(([], [yy - 1]))
        elif bp == 2:
            xp, yp = 1, 0
            alignments.append(([xx - 1], []))
        else:
            raise Exception('got unknown value')

        xx = xx - xp
        yy = yy - yp

    alignments.reverse()

    return alignments


def append_slant(path, xwidth, ywidth):
    """
    Append quantized approximation to a straight line
    from current x,y to a point at (x+xwidth, y+ywidth)
    """
    NN = xwidth + ywidth
    xstart, ystart = path[-1]
    for ii in range(1, NN + 1):
        x = xstart + round(xwidth * ii / NN)
        y = ystart + round(ywidth * ii / NN)
        # In the case of ties we want them to round differently,
        # so explicitly make sure we take a step of 1, not 0 or 2
        lastx, lasty = path[-1]
        delta = x + y - lastx - lasty
        if delta == 1:
            path.append((x, y))
        elif delta == 2:
            path.append((x - 1, y))
        elif delta == 0:
            path.append((x + 1, y))
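
# Illustrative example (not part of the original source): append_slant extends the path
# in place, one unit step (in x or y) at a time, approximating a straight line:
#   path = [(0, 0)]
#   append_slant(path, 2, 3)
#   path  # -> [(0, 0), (0, 1), (1, 1), (1, 2), (2, 2), (2, 3)]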


def alignment_to_search_path(algn):
    """
    Given an alignment, make searchpath.
    Searchpath must step exactly one position in x XOR y at each time step.

    In the case of a block of deletions, the order found by DP is not meaningful.
    To make things consistent and to improve the probability of recovering
    from search errors, we search an approximately straight line
    through a block of deletions. We do the same through a many-many
    alignment, even though we currently don't refine a many-many alignment...
    """
    path = [(0, 0), ]
    xdel, ydel = 0, 0
    for x, y in algn:
        if len(x) and len(y):
            append_slant(path, xdel, ydel)
            xdel, ydel = 0, 0
            append_slant(path, len(x), len(y))
        elif len(x):
            xdel += len(x)
        elif len(y):
            ydel += len(y)

    append_slant(path, xdel, ydel)

    return path
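
# Illustrative example (not part of the original source): for a simple 1-1, 1-1 alignment
# the search path steps through every intermediate cell, e.g.:
#   >>> alignment_to_search_path([([0], [0]), ([1], [1])])
#   [(0, 0), (1, 0), (1, 1), (2, 1), (2, 2)]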


def extend_alignments(course_alignments, size0, size1):
    """
    extend alignments to include new endpoints size0, size1
    if alignments are larger than size0/size1, raise exception
    """
    # could be a string of deletions or insertions at end, so cannot just grab last one
    xmax = 0  # maximum x value in course_alignments
    ymax = 0  # maximum y value in course_alignments
    for x, y in course_alignments:
        for xval in x:
            xmax = max(xmax, xval)
        for yval in y:
            ymax = max(ymax, yval)

    if xmax > size0 or ymax > size1:
        raise Exception('asked to extend alignments but already bigger than requested')

    # do not duplicate xmax/ymax, do include size0/size1
    extra_x = list(range(xmax + 1, size0 + 1))
    extra_y = list(range(ymax + 1, size1 + 1))

    logger.debug('extending alignments in x by %d and y by %d', len(extra_x), len(extra_y))

    if len(extra_x) == 0:
        for yval in extra_y:
            course_alignments.append(([], [yval]))
    elif len(extra_y) == 0:
        for xval in extra_x:
            course_alignments.append(([xval], []))
    else:
        course_alignments.append((extra_x, extra_y))


def upsample_alignment(algn):
    def upsample_one_alignment(xx):
        return list(range(min(xx) * 2, (max(xx) + 1) * 2))

    new_algn = []
    for xx, yy in algn:
        if len(xx) == 0:
            for yyy in upsample_one_alignment(yy):
                new_algn.append(([], [yyy]))
        elif len(yy) == 0:
            for xxx in upsample_one_alignment(xx):
                new_algn.append(([xxx], []))
        else:
            new_algn.append((upsample_one_alignment(xx), upsample_one_alignment(yy)))
    return new_algn
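
# Illustrative example (not part of the original source): each index at the coarse
# (downsampled) resolution expands to the two indices it was built from, e.g.:
#   >>> upsample_alignment([([0], [0]), ([1], [1, 2])])
#   [([0, 1], [0, 1]), ([2, 3], [2, 3, 4, 5])]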


def make_del_knob(e_laser,
                  f_laser,
                  e_laser_norms,
                  f_laser_norms,
                  sample_size):
    e_size = e_laser.shape[0]
    f_size = f_laser.shape[0]

    if e_size > 0 and f_size > 0 and sample_size > 0:

        if e_size * f_size < sample_size:
            # don't sample, just compute the full matrix
            sample_size = e_size * f_size
            x_idxs = np.zeros(sample_size, dtype=np.int32)
            y_idxs = np.zeros(sample_size, dtype=np.int32)
            c = 0
            for ii in range(e_size):
                for jj in range(f_size):
                    x_idxs[c] = ii
                    y_idxs[c] = jj
                    c += 1
        else:
            # get random samples
            x_idxs = np.random.choice(range(e_size), size=sample_size, replace=True).astype(np.int32)
            y_idxs = np.random.choice(range(f_size), size=sample_size, replace=True).astype(np.int32)

        # output
        random_scores = np.empty(sample_size, dtype=np.float32)

        score_path(x_idxs, y_idxs,
                   e_laser_norms, f_laser_norms,
                   e_laser, f_laser,
                   random_scores, )

        min_score = 0
        max_score = max(random_scores)  # could bump this up... but it's probably fine

    else:
        # Not much we can do here...
        random_scores = np.array([0.0, 0.5, 1.0])  # ???
        min_score = 0
        max_score = 1  # ????

    del_knob = DeletionKnob(random_scores, min_score, max_score)

    return del_knob


def compute_norms(vecs0, vecs1, num_samples, overlaps_to_use=None):
    # overlaps_to_use = 10 # 10 matches before

    overlaps1, size1, dim = vecs1.shape
    overlaps0, size0, dim0 = vecs0.shape
    assert (dim == dim0)

    if overlaps_to_use is not None:
        if overlaps_to_use > overlaps1:
            raise Exception('Cannot use more overlaps than provided. You may want to re-run make_verlaps.py with a larger -n value')
    else:
        overlaps_to_use = overlaps1

    samps_per_overlap = ceil(num_samples / overlaps_to_use)

    if size1 and samps_per_overlap:
        # sample the other side (from all overlaps) to compare to this side
        vecs1_rand_sample = np.empty((samps_per_overlap * overlaps_to_use, dim), dtype=np.float32)
        for overlap_ii in range(overlaps_to_use):
            idxs = np.random.choice(range(size1), size=samps_per_overlap, replace=True)
            random_vecs = vecs1[overlap_ii, idxs, :]
            vecs1_rand_sample[overlap_ii * samps_per_overlap:(overlap_ii + 1) * samps_per_overlap, :] = random_vecs

        norms0 = np.empty((overlaps0, size0), dtype=np.float32)
        for overlap_ii in range(overlaps0):
            e_laser = vecs0[overlap_ii, :, :]
            sim = np.matmul(e_laser, vecs1_rand_sample.T)
            norms0[overlap_ii, :] = 1.0 - sim.mean(axis=1)

    else:  # no samples, no normalization
        norms0 = np.ones((overlaps0, size0)).astype(np.float32)

    return norms0


def downsample_vectors(vecs1):
    a, b, c = vecs1.shape
    half = np.empty((a, b // 2, c), dtype=np.float32)
    for ii in range(a):
        # average consecutive vectors
        for jj in range(0, b - b % 2, 2):
            v1 = vecs1[ii, jj, :]
            v2 = vecs1[ii, jj + 1, :]
            half[ii, jj // 2, :] = v1 + v2
        # compute mean for all vectors
        mean = np.mean(half[ii, :, :], axis=0)
        for jj in range(0, b - b % 2, 2):
            # remove mean
            half[ii, jj // 2, :] = half[ii, jj // 2, :] - mean
    # make vectors norm==1 so dot product is cosine distance
    make_norm1(half)
    return half
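
# Illustrative note (not in the original source): downsampling halves the sentence axis,
# so an input of shape (num_overlaps, N, dim) yields (num_overlaps, N // 2, dim); each
# output vector is built from two consecutive input vectors (summed, mean-removed, then
# renormalized to unit length).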


def vecalign(vecs0,
             vecs1,
             final_alignment_types,
             del_percentile_frac,
             width_over2,
             max_size_full_dp,
             costs_sample_size,
             num_samps_for_norm,
             norms0=None,
             norms1=None):
    if width_over2 < 3:
        logger.warning('width_over2 was set to %d, which does not make sense. increasing to 3.', width_over2)
        width_over2 = 3

    # make sure input embeddings are norm==1
    make_norm1(vecs0)
    make_norm1(vecs1)

    # save off runtime stats for summary
    runtimes = OrderedDict()

    # Determine stack depth
    s0, s1 = vecs0.shape[1], vecs1.shape[1]
    max_depth = 0
    while s0 * s1 > max_size_full_dp ** 2:
        max_depth += 1
        s0 = s0 // 2
        s1 = s1 // 2

    # init recursion stack
    # depth is 0-based (full size is 0, 1 is half, 2 is quarter, etc)
    stack = {0: {'v0': vecs0, 'v1': vecs1}}

    # downsample sentence vectors
    t0 = time()
    for depth in range(1, max_depth + 1):
        stack[depth] = {'v0': downsample_vectors(stack[depth - 1]['v0']),
                        'v1': downsample_vectors(stack[depth - 1]['v1'])}
    runtimes['Downsample embeddings'] = time() - t0

    # compute norms for all depths, add sizes, add alignment types
    t0 = time()
    for depth in stack:
        stack[depth]['size0'] = stack[depth]['v0'].shape[1]
        stack[depth]['size1'] = stack[depth]['v1'].shape[1]
        stack[depth]['alignment_types'] = final_alignment_types if depth == 0 else [(1, 1)]

        if depth == 0 and norms0 is not None:
            if norms0.shape != vecs0.shape[:2]:
                print('norms0.shape:', norms0.shape)
                print('vecs0.shape[:2]:', vecs0.shape[:2])
                raise Exception('norms0 wrong shape')
            stack[depth]['n0'] = norms0
        else:
            stack[depth]['n0'] = compute_norms(stack[depth]['v0'], stack[depth]['v1'], num_samps_for_norm)

        if depth == 0 and norms1 is not None:
            if norms1.shape != vecs1.shape[:2]:
                print('norms1.shape:', norms1.shape)
                print('vecs1.shape[:2]:', vecs1.shape[:2])
                raise Exception('norms1 wrong shape')
            stack[depth]['n1'] = norms1
        else:
            stack[depth]['n1'] = compute_norms(stack[depth]['v1'], stack[depth]['v0'], num_samps_for_norm)

    runtimes['Normalize embeddings'] = time() - t0

    # Compute deletion penalty for all depths
    t0 = time()
    for depth in stack:
        stack[depth]['del_knob'] = make_del_knob(e_laser=stack[depth]['v0'][0, :, :],
                                                 f_laser=stack[depth]['v1'][0, :, :],
                                                 e_laser_norms=stack[depth]['n0'][0, :],
                                                 f_laser_norms=stack[depth]['n1'][0, :],
                                                 sample_size=costs_sample_size)
        stack[depth]['del_penalty'] = stack[depth]['del_knob'].percentile_frac_to_del_penalty(del_percentile_frac)
        logger.debug('del_penalty at depth %d: %f', depth, stack[depth]['del_penalty'])
    runtimes['Compute deletion penalties'] = time() - t0
    tt = time() - t0
    logger.debug('%d x %d full DP make features: %.6fs (%.3e per dot product)',
                 stack[max_depth]['size0'], stack[max_depth]['size1'], tt,
                 tt / (stack[max_depth]['size0'] + 1e-6) / (stack[max_depth]['size1'] + 1e-6))
    # full DP at maximum recursion depth
    t0 = time()
    stack[max_depth]['costs_1to1'] = make_dense_costs(stack[max_depth]['v0'],
                                                      stack[max_depth]['v1'],
                                                      stack[max_depth]['n0'],
                                                      stack[max_depth]['n1'])

    runtimes['Full DP make features'] = time() - t0
    t0 = time()
    _, stack[max_depth]['x_y_tb'] = dense_dp(stack[max_depth]['costs_1to1'], stack[max_depth]['del_penalty'])
    stack[max_depth]['alignments'] = dense_traceback(stack[max_depth]['x_y_tb'])
    runtimes['Full DP'] = time() - t0

    # upsample the path up to the top resolution
    compute_costs_times = []
    dp_times = []
    upsample_depths = [0, ] if max_depth == 0 else list(reversed(range(0, max_depth)))
    for depth in upsample_depths:
        if max_depth > 0:  # upsample previous alignment to current resolution
            course_alignments = upsample_alignment(stack[depth + 1]['alignments'])
            # features may have been truncated when downsampling, so alignment may need to be extended
            extend_alignments(course_alignments, stack[depth]['size0'], stack[depth]['size1'])  # in-place
        else:  # We did a full size 1-1 search, so search same size with more alignment types
            course_alignments = stack[0]['alignments']

        # convert coarse alignments to a searchpath
        stack[depth]['searchpath'] = alignment_to_search_path(course_alignments)

        # compute costs for sparse DP
        t0 = time()
        stack[depth]['a_b_costs'], stack[depth]['b_offset'] = make_sparse_costs(stack[depth]['v0'], stack[depth]['v1'],
                                                                                stack[depth]['n0'], stack[depth]['n1'],
                                                                                stack[depth]['searchpath'],
                                                                                stack[depth]['alignment_types'],
                                                                                width_over2)

        tt = time() - t0
        num_dot_products = len(stack[depth]['b_offset']) * len(stack[depth]['alignment_types']) * width_over2 * 2
        logger.debug('%d x %d sparse DP (%d alignment types, %d window) make features: %.6fs (%.3e per dot product)',
                     stack[max_depth]['size0'], stack[max_depth]['size1'],
                     len(stack[depth]['alignment_types']), width_over2 * 2,
                     tt, tt / (num_dot_products + 1e-6))

        compute_costs_times.append(time() - t0)
        t0 = time()
        # perform sparse DP
        stack[depth]['a_b_csum'], stack[depth]['a_b_xp'], stack[depth]['a_b_yp'], \
            stack[depth]['new_b_offset'] = sparse_dp(stack[depth]['a_b_costs'], stack[depth]['b_offset'],
                                                     stack[depth]['alignment_types'], stack[depth]['del_penalty'],
                                                     stack[depth]['size0'], stack[depth]['size1'])

        # perform traceback to get alignments and alignment scores
        # for debugging, avoid overwriting stack[depth]['alignments']
        akey = 'final_alignments' if depth == 0 else 'alignments'
        stack[depth][akey], stack[depth]['alignment_scores'] = sparse_traceback(stack[depth]['a_b_csum'],
                                                                                stack[depth]['a_b_xp'],
                                                                                stack[depth]['a_b_yp'],
                                                                                stack[depth]['new_b_offset'],
                                                                                stack[depth]['size0'],
                                                                                stack[depth]['size1'])
        dp_times.append(time() - t0)

    runtimes['Upsample DP compute costs'] = sum(compute_costs_times[:-1])
    runtimes['Upsample DP'] = sum(dp_times[:-1])

    runtimes['Final DP compute costs'] = compute_costs_times[-1]
    runtimes['Final DP'] = dp_times[-1]

    # log time stats
    max_key_str_len = max([len(key) for key in runtimes])
    for key in runtimes:
        if runtimes[key] > 5e-5:
            logger.info(key + ' took ' + '.' * (max_key_str_len + 5 - len(key)) + ('%.4fs' % runtimes[key]).rjust(7))

    return stack
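
# End-to-end usage sketch (illustrative only, not part of the original file; file names
# and parameter values are hypothetical). A driver script might wire these pieces
# together along these lines:
#   sent2line_e, emb_e = read_in_embeddings('overlaps.en', 'overlaps.en.emb')
#   sent2line_f, emb_f = read_in_embeddings('overlaps.fr', 'overlaps.fr.emb')
#   vecs0 = make_doc_embedding(sent2line_e, emb_e, src_lines, num_overlaps=4)
#   vecs1 = make_doc_embedding(sent2line_f, emb_f, tgt_lines, num_overlaps=4)
#   stack = vecalign(vecs0, vecs1,
#                    final_alignment_types=make_alignment_types(8),
#                    del_percentile_frac=0.2, width_over2=5,
#                    max_size_full_dp=300, costs_sample_size=20000,
#                    num_samps_for_norm=100)
#   print_alignments(stack[0]['final_alignments'], stack[0]['alignment_scores'])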