Baseline alignment systems
ext-lib/vecalign/dp_utils.py (new file, 665 lines)
@@ -0,0 +1,665 @@
"""
Copyright 2019 Brian Thompson

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import logging
import sys
from ast import literal_eval
from collections import OrderedDict
from math import ceil
from time import time

import numpy as np

import pyximport
pyximport.install(setup_args={'include_dirs': np.get_include()}, inplace=True, reload_support=True)

from dp_core import make_dense_costs, score_path, sparse_dp, make_sparse_costs, dense_dp

logger = logging.getLogger('vecalign')  # set up in vecalign.py


def preprocess_line(line):
    line = line.strip()
    if len(line) == 0:
        line = 'BLANK_LINE'
    return line


def yield_overlaps(lines, num_overlaps):
    lines = [preprocess_line(line) for line in lines]
    for overlap in range(1, num_overlaps + 1):
        for out_line in layer(lines, overlap):
            # check must be here so all outputs are unique
            out_line2 = out_line[:10000]  # limit line length so we don't encode arbitrarily long sentences
            yield out_line2


def read_in_embeddings(text_file, embed_file):
    """
    Given a text file with candidate sentences and a corresponding embedding file,
    make a mapping from candidate sentence to embedding index,
    and a numpy array of the embeddings.
    """
    sent2line = dict()
    with open(text_file, 'rt', encoding="utf-8") as fin:
        for ii, line in enumerate(fin):
            if line.strip() in sent2line:
                raise Exception('got multiple embeddings for the same line')
            sent2line[line.strip()] = ii

    line_embeddings = np.fromfile(embed_file, dtype=np.float32, count=-1)
    if line_embeddings.size == 0:
        raise Exception('Got empty embedding file')

    laser_embedding_size = line_embeddings.size // len(sent2line)  # currently hardcoded to 1024
    if laser_embedding_size != 1024:
        logger.warning('expected an embedding size of 1024, got %s', laser_embedding_size)
    logger.info('laser_embedding_size determined to be %d', laser_embedding_size)
    line_embeddings.resize(line_embeddings.shape[0] // laser_embedding_size, laser_embedding_size)
    return sent2line, line_embeddings
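
# Illustrative usage only (not part of the original file); the file names are hypothetical.
# The embedding file is assumed to be raw float32 vectors written back-to-back, one per
# line of the text file (the layout np.fromfile expects above):
#   sent2line, embs = read_in_embeddings('overlaps.en', 'overlaps.en.emb')
#   embs.shape  # e.g. (number of lines in overlaps.en, 1024)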


def make_doc_embedding(sent2line, line_embeddings, lines, num_overlaps):
    """
    lines: sentences in input document to embed
    sent2line, line_embeddings: precomputed embeddings for lines (and overlaps of lines)
    """

    lines = [preprocess_line(line) for line in lines]

    vecsize = line_embeddings.shape[1]

    vecs0 = np.empty((num_overlaps, len(lines), vecsize), dtype=np.float32)

    for ii, overlap in enumerate(range(1, num_overlaps + 1)):
        for jj, out_line in enumerate(layer(lines, overlap)):
            try:
                line_id = sent2line[out_line]
            except KeyError:
                logger.warning('Failed to find overlap=%d line "%s". Will use random vector.', overlap, out_line)
                line_id = None

            if line_id is not None:
                vec = line_embeddings[line_id]
            else:
                vec = np.random.random(vecsize) - 0.5
                vec = vec / np.linalg.norm(vec)

            vecs0[ii, jj, :] = vec

    return vecs0


def make_norm1(vecs0):
    """
    make vectors norm==1 so that cosine distance can be computed via dot product
    """
    for ii in range(vecs0.shape[0]):
        for jj in range(vecs0.shape[1]):
            norm = np.sqrt(np.square(vecs0[ii, jj, :]).sum())
            vecs0[ii, jj, :] = vecs0[ii, jj, :] / (norm + 1e-5)


def layer(lines, num_overlaps, comb=' '):
    """
    make front-padded overlapping sentences
    """
    if num_overlaps < 1:
        raise Exception('num_overlaps must be >= 1')
    out = ['PAD', ] * min(num_overlaps - 1, len(lines))
    for ii in range(len(lines) - num_overlaps + 1):
        out.append(comb.join(lines[ii:ii + num_overlaps]))
    return out
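
# Illustrative example (not part of the original file): with 3 sentences, the output is
# front-padded so it stays aligned with the input lines:
#   layer(['a', 'b', 'c'], 1)  ->  ['a', 'b', 'c']
#   layer(['a', 'b', 'c'], 2)  ->  ['PAD', 'a b', 'b c']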


def read_alignments(fin):
    alignments = []
    with open(fin, 'rt', encoding="utf-8") as infile:
        for line in infile:
            fields = [x.strip() for x in line.split(':') if len(x.strip())]
            if len(fields) < 2:
                raise Exception('Got line "%s", which does not have at least two ":" separated fields' % line.strip())
            try:
                src = literal_eval(fields[0])
                tgt = literal_eval(fields[1])
            except Exception:
                raise Exception('Failed to parse line "%s"' % line.strip())
            alignments.append((src, tgt))

    # I know bluealign files have a few entries missing,
    # but I don't fix them in order to be consistent with previously reported scores
    return alignments


def print_alignments(alignments, scores=None, file=sys.stdout):
    if scores is not None:
        for (x, y), s in zip(alignments, scores):
            print('%s:%s:%.6f' % (x, y, s), file=file)
    else:
        for x, y in alignments:
            print('%s:%s' % (x, y), file=file)
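
# Illustrative output format (not part of the original file): each line is
# "src_indices:tgt_indices[:score]", which read_alignments() above can parse back, e.g.
#   [0]:[0]:0.021340
#   [1, 2]:[1]:0.084523
#   [3]:[]:0.000000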


class DeletionKnob(object):
    """
    A good deletion penalty depends on normalization, and probably on language, domain, etc.
    I want a way to control the deletion penalty that generalizes well...
    Sampling costs and using a percentile seems to work fairly well.
    """

    def __init__(self, samp, res_min, res_max):

        self.res_min = res_min
        self.res_max = res_max

        if self.res_min >= self.res_max:
            logger.warning('res_max <= res_min, increasing it')
            self.res_max = self.res_min + 1e-4

        num_bins = 1000
        num_pts = 30

        self.hist, self.bin_edges = np.histogram(samp, bins=num_bins,
                                                 range=[self.res_min, self.res_max],
                                                 density=True)

        dx = self.bin_edges[1] - self.bin_edges[0]
        self.cdf = np.cumsum(self.hist) * dx

        interp_points = [(0, self.res_min), ]
        for knob_val in np.linspace(0, 1, num_pts - 1)[1:-1]:
            cdf_idx = np.searchsorted(self.cdf, knob_val)
            cdf_val = self.res_min + cdf_idx / float(num_bins) * (self.res_max - self.res_min)
            interp_points.append((knob_val, cdf_val))
        interp_points.append((1, self.res_max))
        self.x, self.y = zip(*interp_points)

    def percentile_frac_to_del_penalty(self, knob_val):
        del_pen = np.interp([knob_val], self.x, self.y)[0]
        return del_pen
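
# Minimal usage sketch (not part of the original file); `sampled_costs` is a hypothetical
# 1-D array of sampled alignment costs, e.g. filled by score_path():
#   knob = DeletionKnob(sampled_costs, res_min=0.0, res_max=1.0)
#   del_penalty = knob.percentile_frac_to_del_penalty(0.2)  # cost at roughly the 20th percentile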


def make_alignment_types(max_alignment_size):
    # return list of all (n, m) alignment types with n, m >= 1 and n + m <= max_alignment_size
    alignment_types = []
    for x in range(1, max_alignment_size):
        for y in range(1, max_alignment_size):
            if x + y <= max_alignment_size:
                alignment_types.append((x, y))
    return alignment_types
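
# Illustrative example (not part of the original file):
#   make_alignment_types(4)  ->  [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (3, 1)]
# i.e. every (n, m) with n, m >= 1 and n + m <= 4; (1, 1) covers 1-1 alignments,
# larger types allow 1-many and many-many alignments.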


def ab2xy_w_offset(aa, bb_idx, bb_offset):
    bb_from_side = bb_idx + bb_offset[aa]
    xx = aa - bb_from_side
    yy = bb_from_side
    return (xx, yy)


def xy2ab_w_offset(xx, yy, bb_offset):
    aa = xx + yy
    bb_from_side = yy
    bb = bb_from_side - bb_offset[aa]
    return aa, bb
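
# Worked example (not part of the original file): (aa, bb) index the banded cost matrix
# by anti-diagonal. aa = xx + yy is the anti-diagonal index, and bb_offset[aa] is
# (roughly) where the search band starts on that anti-diagonal. If bb_offset[5] == 2:
#   xy2ab_w_offset(3, 2, bb_offset)  ->  (5, 0)
#   ab2xy_w_offset(5, 0, bb_offset)  ->  (3, 2)   # round-trips back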


def process_scores(scores, alignments):
    # floating point sometimes gives negative numbers, which is a little unnerving ...
    scores = np.clip(scores, a_min=0, a_max=None)

    for ii, (x_algn, y_algn) in enumerate(alignments):
        # deletion penalty is pretty arbitrary, just report 0
        if len(x_algn) == 0 or len(y_algn) == 0:
            scores[ii] = 0.0
        # report scores un-normalized by alignment sizes
        # (still normalized with random vectors, though)
        else:
            scores[ii] = scores[ii] / len(x_algn) / len(y_algn)

    return scores


def sparse_traceback(a_b_csum, a_b_xp, a_b_yp, b_offset, xsize, ysize):
    alignments = []
    xx = xsize
    yy = ysize

    cum_costs = []

    while True:
        aa, bb = xy2ab_w_offset(xx, yy, b_offset)

        cum_costs.append(a_b_csum[aa, bb])

        xp = a_b_xp[aa, bb]
        yp = a_b_yp[aa, bb]

        if xx == yy == 0:
            break

        if xx < 0 or yy < 0:
            raise Exception('traceback bug')

        x_side = list(range(xx - xp, xx))
        y_side = list(range(yy - yp, yy))
        alignments.append((x_side, y_side))

        xx = xx - xp
        yy = yy - yp

    alignments.reverse()
    cum_costs.reverse()
    costs = np.array(cum_costs[1:]) - np.array(cum_costs[:-1])
    # "costs" are scaled by x_alignment_size * y_alignment_size
    #     and the cost of a deletion is del_penalty
    # "scores": 0 for deletion/insertion,
    #     and cosine distance, *not* scaled
    #     by len(x_alignment)*len(y_alignment)
    scores = process_scores(scores=costs, alignments=alignments)

    return alignments, scores


def dense_traceback(x_y_tb):
    xsize, ysize = x_y_tb.shape

    xx = xsize - 1
    yy = ysize - 1

    alignments = []
    while True:
        if xx == yy == 0:
            break
        bp = x_y_tb[xx, yy]
        if bp == 0:
            xp, yp = 1, 1
            alignments.append(([xx - 1], [yy - 1]))
        elif bp == 1:
            xp, yp = 0, 1
            alignments.append(([], [yy - 1]))
        elif bp == 2:
            xp, yp = 1, 0
            alignments.append(([xx - 1], []))
        else:
            raise Exception('got unknown value')

        xx = xx - xp
        yy = yy - yp

    alignments.reverse()

    return alignments


def append_slant(path, xwidth, ywidth):
    """
    Append a quantized approximation of a straight line
    from the current (x, y) to the point (x + xwidth, y + ywidth)
    """
    NN = xwidth + ywidth
    xstart, ystart = path[-1]
    for ii in range(1, NN + 1):
        x = xstart + round(xwidth * ii / NN)
        y = ystart + round(ywidth * ii / NN)
        # In the case of ties we want them to round differently,
        # so explicitly make sure we take a step of 1, not 0 or 2
        lastx, lasty = path[-1]
        delta = x + y - lastx - lasty
        if delta == 1:
            path.append((x, y))
        elif delta == 2:
            path.append((x - 1, y))
        elif delta == 0:
            path.append((x + 1, y))


def alignment_to_search_path(algn):
    """
    Given an alignment, make a search path.
    The search path must step exactly one position in x XOR y at each time step.

    In the case of a block of deletions, the order found by DP is not meaningful.
    To make things consistent and to improve the probability of recovering
    from search errors, we search an approximately straight line
    through a block of deletions. We do the same through a many-many
    alignment, even though we currently don't refine a many-many alignment...
    """
    path = [(0, 0), ]
    xdel, ydel = 0, 0
    for x, y in algn:
        if len(x) and len(y):
            append_slant(path, xdel, ydel)
            xdel, ydel = 0, 0
            append_slant(path, len(x), len(y))
        elif len(x):
            xdel += len(x)
        elif len(y):
            ydel += len(y)

    append_slant(path, xdel, ydel)

    return path
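
# Worked example (not part of the original file): a 1-2 alignment becomes a path that
# moves exactly one step in x or y at a time:
#   alignment_to_search_path([([0], [0, 1])])  ->  [(0, 0), (0, 1), (1, 1), (1, 2)]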


def extend_alignments(course_alignments, size0, size1):
    """
    extend alignments to include the new endpoints size0, size1
    if alignments are already larger than size0/size1, raise an exception
    """
    # there could be a string of deletions or insertions at the end, so we cannot just grab the last entry
    xmax = 0  # maximum x value in course_alignments
    ymax = 0  # maximum y value in course_alignments
    for x, y in course_alignments:
        for xval in x:
            xmax = max(xmax, xval)
        for yval in y:
            ymax = max(ymax, yval)

    if xmax > size0 or ymax > size1:
        raise Exception('asked to extend alignments but already bigger than requested')

    # do not duplicate xmax/ymax, do include size0/size1
    extra_x = list(range(xmax + 1, size0 + 1))
    extra_y = list(range(ymax + 1, size1 + 1))

    logger.debug('extending alignments in x by %d and y by %d', len(extra_x), len(extra_y))

    if len(extra_x) == 0:
        for yval in extra_y:
            course_alignments.append(([], [yval]))
    elif len(extra_y) == 0:
        for xval in extra_x:
            course_alignments.append(([xval], []))
    else:
        course_alignments.append((extra_x, extra_y))


def upsample_alignment(algn):
    def upsample_one_alignment(xx):
        return list(range(min(xx) * 2, (max(xx) + 1) * 2))

    new_algn = []
    for xx, yy in algn:
        if len(xx) == 0:
            for yyy in upsample_one_alignment(yy):
                new_algn.append(([], [yyy]))
        elif len(yy) == 0:
            for xxx in upsample_one_alignment(xx):
                new_algn.append(([xxx], []))
        else:
            new_algn.append((upsample_one_alignment(xx), upsample_one_alignment(yy)))
    return new_algn
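
# Illustrative example (not part of the original file): indices are doubled when moving
# to the next-finer resolution, so one coarse sentence covers two fine sentences:
#   upsample_alignment([([0], [0, 1])])  ->  [([0, 1], [0, 1, 2, 3])]
#   upsample_alignment([([2], [])])      ->  [([4], []), ([5], [])]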


def make_del_knob(e_laser,
                  f_laser,
                  e_laser_norms,
                  f_laser_norms,
                  sample_size):
    e_size = e_laser.shape[0]
    f_size = f_laser.shape[0]

    if e_size > 0 and f_size > 0 and sample_size > 0:

        if e_size * f_size < sample_size:
            # don't sample, just compute the full matrix
            sample_size = e_size * f_size
            x_idxs = np.zeros(sample_size, dtype=np.int32)
            y_idxs = np.zeros(sample_size, dtype=np.int32)
            c = 0
            for ii in range(e_size):
                for jj in range(f_size):
                    x_idxs[c] = ii
                    y_idxs[c] = jj
                    c += 1
        else:
            # get random samples
            x_idxs = np.random.choice(range(e_size), size=sample_size, replace=True).astype(np.int32)
            y_idxs = np.random.choice(range(f_size), size=sample_size, replace=True).astype(np.int32)

        # output
        random_scores = np.empty(sample_size, dtype=np.float32)

        score_path(x_idxs, y_idxs,
                   e_laser_norms, f_laser_norms,
                   e_laser, f_laser,
                   random_scores, )

        min_score = 0
        max_score = max(random_scores)  # could bump this up... but it's probably fine

    else:
        # Not much we can do here...
        random_scores = np.array([0.0, 0.5, 1.0])  # ???
        min_score = 0
        max_score = 1  # ????

    del_knob = DeletionKnob(random_scores, min_score, max_score)

    return del_knob


def compute_norms(vecs0, vecs1, num_samples, overlaps_to_use=None):
    # overlaps_to_use = 10  # 10 matches before

    overlaps1, size1, dim = vecs1.shape
    overlaps0, size0, dim0 = vecs0.shape
    assert (dim == dim0)

    if overlaps_to_use is not None:
        if overlaps_to_use > overlaps1:
            raise Exception('Cannot use more overlaps than provided. You may want to re-run make_verlaps.py with a larger -n value')
    else:
        overlaps_to_use = overlaps1

    samps_per_overlap = ceil(num_samples / overlaps_to_use)

    if size1 and samps_per_overlap:
        # sample the other side (from all overlaps) to compare to this side
        vecs1_rand_sample = np.empty((samps_per_overlap * overlaps_to_use, dim), dtype=np.float32)
        for overlap_ii in range(overlaps_to_use):
            idxs = np.random.choice(range(size1), size=samps_per_overlap, replace=True)
            random_vecs = vecs1[overlap_ii, idxs, :]
            vecs1_rand_sample[overlap_ii * samps_per_overlap:(overlap_ii + 1) * samps_per_overlap, :] = random_vecs

        norms0 = np.empty((overlaps0, size0), dtype=np.float32)
        for overlap_ii in range(overlaps0):
            e_laser = vecs0[overlap_ii, :, :]
            sim = np.matmul(e_laser, vecs1_rand_sample.T)
            norms0[overlap_ii, :] = 1.0 - sim.mean(axis=1)

    else:  # no samples, no normalization
        norms0 = np.ones((overlaps0, size0)).astype(np.float32)

    return norms0


def downsample_vectors(vecs1):
    a, b, c = vecs1.shape
    half = np.empty((a, b // 2, c), dtype=np.float32)
    for ii in range(a):
        # average consecutive vectors
        for jj in range(0, b - b % 2, 2):
            v1 = vecs1[ii, jj, :]
            v2 = vecs1[ii, jj + 1, :]
            half[ii, jj // 2, :] = v1 + v2
        # compute mean for all vectors
        mean = np.mean(half[ii, :, :], axis=0)
        for jj in range(0, b - b % 2, 2):
            # remove mean
            half[ii, jj // 2, :] = half[ii, jj // 2, :] - mean
    # make vectors norm==1 so dot product is cosine distance
    make_norm1(half)
    return half
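
# Illustrative shape example (not part of the original file): consecutive sentence
# vectors are merged pairwise, so each level of the pyramid halves the document length:
#   vecs.shape == (num_overlaps, 10, 1024)  ->  downsample_vectors(vecs).shape == (num_overlaps, 5, 1024)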


def vecalign(vecs0,
             vecs1,
             final_alignment_types,
             del_percentile_frac,
             width_over2,
             max_size_full_dp,
             costs_sample_size,
             num_samps_for_norm,
             norms0=None,
             norms1=None):
    if width_over2 < 3:
        logger.warning('width_over2 was set to %d, which does not make sense. increasing to 3.', width_over2)
        width_over2 = 3

    # make sure input embeddings are norm==1
    make_norm1(vecs0)
    make_norm1(vecs1)

    # save off runtime stats for summary
    runtimes = OrderedDict()

    # Determine stack depth
    s0, s1 = vecs0.shape[1], vecs1.shape[1]
    max_depth = 0
    while s0 * s1 > max_size_full_dp ** 2:
        max_depth += 1
        s0 = s0 // 2
        s1 = s1 // 2

    # init recursion stack
    # depth is 0-based (full size is 0, 1 is half, 2 is quarter, etc)
    stack = {0: {'v0': vecs0, 'v1': vecs1}}

    # downsample sentence vectors
    t0 = time()
    for depth in range(1, max_depth + 1):
        stack[depth] = {'v0': downsample_vectors(stack[depth - 1]['v0']),
                        'v1': downsample_vectors(stack[depth - 1]['v1'])}
    runtimes['Downsample embeddings'] = time() - t0

    # compute norms for all depths, add sizes, add alignment types
    t0 = time()
    for depth in stack:
        stack[depth]['size0'] = stack[depth]['v0'].shape[1]
        stack[depth]['size1'] = stack[depth]['v1'].shape[1]
        stack[depth]['alignment_types'] = final_alignment_types if depth == 0 else [(1, 1)]

        if depth == 0 and norms0 is not None:
            if norms0.shape != vecs0.shape[:2]:
                print('norms0.shape:', norms0.shape)
                print('vecs0.shape[:2]:', vecs0.shape[:2])
                raise Exception('norms0 wrong shape')
            stack[depth]['n0'] = norms0
        else:
            stack[depth]['n0'] = compute_norms(stack[depth]['v0'], stack[depth]['v1'], num_samps_for_norm)

        if depth == 0 and norms1 is not None:
            if norms1.shape != vecs1.shape[:2]:
                print('norms1.shape:', norms1.shape)
                print('vecs1.shape[:2]:', vecs1.shape[:2])
                raise Exception('norms1 wrong shape')
            stack[depth]['n1'] = norms1
        else:
            stack[depth]['n1'] = compute_norms(stack[depth]['v1'], stack[depth]['v0'], num_samps_for_norm)

    runtimes['Normalize embeddings'] = time() - t0

    # Compute deletion penalty for all depths
    t0 = time()
    for depth in stack:
        stack[depth]['del_knob'] = make_del_knob(e_laser=stack[depth]['v0'][0, :, :],
                                                 f_laser=stack[depth]['v1'][0, :, :],
                                                 e_laser_norms=stack[depth]['n0'][0, :],
                                                 f_laser_norms=stack[depth]['n1'][0, :],
                                                 sample_size=costs_sample_size)
        stack[depth]['del_penalty'] = stack[depth]['del_knob'].percentile_frac_to_del_penalty(del_percentile_frac)
        logger.debug('del_penalty at depth %d: %f', depth, stack[depth]['del_penalty'])
    runtimes['Compute deletion penalties'] = time() - t0

    # full DP at maximum recursion depth
    t0 = time()
    stack[max_depth]['costs_1to1'] = make_dense_costs(stack[max_depth]['v0'],
                                                      stack[max_depth]['v1'],
                                                      stack[max_depth]['n0'],
                                                      stack[max_depth]['n1'])
    tt = time() - t0
    logger.debug('%d x %d full DP make features: %.6fs (%.3e per dot product)',
                 stack[max_depth]['size0'], stack[max_depth]['size1'], tt,
                 tt / (stack[max_depth]['size0'] + 1e-6) / (stack[max_depth]['size1'] + 1e-6))
    runtimes['Full DP make features'] = time() - t0

    t0 = time()
    _, stack[max_depth]['x_y_tb'] = dense_dp(stack[max_depth]['costs_1to1'], stack[max_depth]['del_penalty'])
    stack[max_depth]['alignments'] = dense_traceback(stack[max_depth]['x_y_tb'])
    runtimes['Full DP'] = time() - t0

    # upsample the path up to the top resolution
    compute_costs_times = []
    dp_times = []
    upsample_depths = [0, ] if max_depth == 0 else list(reversed(range(0, max_depth)))
    for depth in upsample_depths:
        if max_depth > 0:  # upsample previous alignment to current resolution
            course_alignments = upsample_alignment(stack[depth + 1]['alignments'])
            # features may have been truncated when downsampling, so the alignment may need to be extended
            extend_alignments(course_alignments, stack[depth]['size0'], stack[depth]['size1'])  # in-place
        else:  # we did a full-size 1-1 search, so search the same size with more alignment types
            course_alignments = stack[0]['alignments']

        # convert coarse alignments to a search path
        stack[depth]['searchpath'] = alignment_to_search_path(course_alignments)

        # compute costs for sparse DP
        t0 = time()
        stack[depth]['a_b_costs'], stack[depth]['b_offset'] = make_sparse_costs(stack[depth]['v0'], stack[depth]['v1'],
                                                                                stack[depth]['n0'], stack[depth]['n1'],
                                                                                stack[depth]['searchpath'],
                                                                                stack[depth]['alignment_types'],
                                                                                width_over2)

        tt = time() - t0
        num_dot_products = len(stack[depth]['b_offset']) * len(stack[depth]['alignment_types']) * width_over2 * 2
        logger.debug('%d x %d sparse DP (%d alignment types, %d window) make features: %.6fs (%.3e per dot product)',
                     stack[max_depth]['size0'], stack[max_depth]['size1'],
                     len(stack[depth]['alignment_types']), width_over2 * 2,
                     tt, tt / (num_dot_products + 1e-6))

        compute_costs_times.append(time() - t0)
        t0 = time()
        # perform sparse DP
        stack[depth]['a_b_csum'], stack[depth]['a_b_xp'], stack[depth]['a_b_yp'], \
            stack[depth]['new_b_offset'] = sparse_dp(stack[depth]['a_b_costs'], stack[depth]['b_offset'],
                                                     stack[depth]['alignment_types'], stack[depth]['del_penalty'],
                                                     stack[depth]['size0'], stack[depth]['size1'])

        # perform traceback to get alignments and alignment scores
        # for debugging, avoid overwriting stack[depth]['alignments']
        akey = 'final_alignments' if depth == 0 else 'alignments'
        stack[depth][akey], stack[depth]['alignment_scores'] = sparse_traceback(stack[depth]['a_b_csum'],
                                                                                stack[depth]['a_b_xp'],
                                                                                stack[depth]['a_b_yp'],
                                                                                stack[depth]['new_b_offset'],
                                                                                stack[depth]['size0'],
                                                                                stack[depth]['size1'])
        dp_times.append(time() - t0)

    runtimes['Upsample DP compute costs'] = sum(compute_costs_times[:-1])
    runtimes['Upsample DP'] = sum(dp_times[:-1])

    runtimes['Final DP compute costs'] = compute_costs_times[-1]
    runtimes['Final DP'] = dp_times[-1]

    # log time stats
    max_key_str_len = max([len(key) for key in runtimes])
    for key in runtimes:
        if runtimes[key] > 5e-5:
            logger.info(key + ' took ' + '.' * (max_key_str_len + 5 - len(key)) + ('%.4fs' % runtimes[key]).rjust(7))

    return stack
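
# Minimal end-to-end sketch (not part of the original file); parameter values here are
# hypothetical, and vecs0/vecs1 are overlap embeddings built with make_doc_embedding():
#   stack = vecalign(vecs0, vecs1,
#                    final_alignment_types=make_alignment_types(5),
#                    del_percentile_frac=0.2,
#                    width_over2=4,
#                    max_size_full_dp=300,
#                    costs_sample_size=20000,
#                    num_samps_for_norm=100)
#   print_alignments(stack[0]['final_alignments'], scores=stack[0]['alignment_scores'])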