From 7e1a7e795a317d8a68e19a5189d057e2fc4e2cb5 Mon Sep 17 00:00:00 2001 From: nlpfun Date: Mon, 29 Nov 2021 17:19:35 +0800 Subject: [PATCH] Update bert_align.py --- bin/bert_align.py | 442 +++++++++++++++++++++++++++++----------------- 1 file changed, 281 insertions(+), 161 deletions(-) diff --git a/bin/bert_align.py b/bin/bert_align.py index d61814b..622e498 100644 --- a/bin/bert_align.py +++ b/bin/bert_align.py @@ -1,4 +1,4 @@ -# 2021/11/27 +# 2021/11/29 # bfsujason@163.com """ @@ -15,7 +15,6 @@ python bin/bert_align.py \ """ import os -import sys import time import torch import faiss @@ -42,99 +41,100 @@ def main(): parser.add_argument('--margin', action='store_true', help='Margin-based cosine similarity') args = parser.parse_args() - # fixed parameters to determine the - # window size for the first-pass alignment - min_win_size = 10 - max_win_size = 600 - win_per_100 = 8 - - # read in embeddings - src_sent2line, src_line_embeddings = read_in_embeddings(args.src_embed[0], args.src_embed[1]) - tgt_sent2line, tgt_line_embeddings = read_in_embeddings(args.tgt_embed[0], args.tgt_embed[1]) - embedding_size = src_line_embeddings.shape[1] + # Read in source and target embeddings. + src_sent2line, src_line_embeddings = \ + read_in_embeddings(args.src_embed[0], args.src_embed[1]) + tgt_sent2line, tgt_line_embeddings = \ + read_in_embeddings(args.tgt_embed[0], args.tgt_embed[1]) + # Perform stentence alignment. make_dir(args.out) jobs = create_jobs(args.meta, args.src, args.tgt, args.out) - - # start alignment - for rec in jobs: - src_file, tgt_file, align_file = rec.split("\t") + for job in jobs: + src_file, tgt_file, out_file = job.split('\t') print("Aligning {} to {}".format(src_file, tgt_file)) - - # read in source and target sentences + + # Convert source and target texts into feature matrix. + t_0 = time.time() src_lines = open(src_file, 'rt', encoding="utf-8").readlines() tgt_lines = open(tgt_file, 'rt', encoding="utf-8").readlines() - - # convert source and target texts into embeddings - # and calculate sentence length - t_0 = time.time() - src_vecs, src_lens = doc2feats(src_sent2line, src_line_embeddings, src_lines, args.max_align - 1) - tgt_vecs, tgt_lens = doc2feats(tgt_sent2line, tgt_line_embeddings, tgt_lines, args.max_align - 1) + src_vecs, src_lens = \ + doc2feats(src_sent2line, src_line_embeddings, src_lines, args.max_align - 1) + tgt_vecs, tgt_lens = \ + doc2feats(tgt_sent2line, tgt_line_embeddings, tgt_lines, args.max_align - 1) char_ratio = np.sum(src_lens[0,]) / np.sum(tgt_lens[0,]) - print("Reading embeddings takes {:.3f}".format(time.time() - t_0)) - - # using faiss, find in the target text - # the k nearest neighbors of each source sentence + print("Vectorizing soure and target texts takes {:.3f} seconds.".format(time.time() - t_0)) + + # Find the top_k similar target sentences for each source sentence. t_1 = time.time() - if torch.cuda.is_available(): # GPU version - res = faiss.StandardGpuResources() - index = faiss.IndexFlatIP(embedding_size) - gpu_index = faiss.index_cpu_to_gpu(res, 0, index) - gpu_index.add(tgt_vecs[0,:]) - xq = src_vecs[0,:] - D,I = gpu_index.search(xq,args.top_k) - else: # CPU version - index = faiss.IndexFlatIP(embedding_size) # use inter product to build index - index.add(tgt_vecs[0,:]) - xq = src_vecs[0,:] - D,I = index.search(xq, args.top_k) - print("Finding top-k neighbors takes {:.3f}".format(time.time() - t_1)) + D, I = find_top_k_sents(src_vecs[0,:], tgt_vecs[0,:], k=args.top_k) + print("Finding top-k sentences takes {:.3f} seconds.".format(time.time() - t_1)) - # find 1-to-1 alignment + # Find optimal 1-1 alignments using dynamic programming. t_2 = time.time() - src_len = len(src_lines) - tgt_len = len(tgt_lines) - first_alignment_types = make_alignment_types(2) # 0-0, 1-0 and 1-1 - first_w, first_search_path = find_first_search_path(src_len, tgt_len, min_win_size, max_win_size, win_per_100) - first_pointers = first_pass_align(src_len, tgt_len, first_w, first_search_path, first_alignment_types, D, I, args.top_k) - first_alignment = first_back_track(src_len, tgt_len, first_pointers, first_search_path, first_alignment_types) - print("First pass alignment takes {:.3f}".format(time.time() - t_2)) - - # find m-to-n alignment + m = len(src_lines) + n = len(tgt_lines) + first_alignment_types = get_alignment_types(2) # 0-1, 1-0, 1-1 + first_w, first_path = find_first_search_path(m, n) + first_pointers = first_pass_align(m, n, first_w, + first_path, first_alignment_types, + D, I, args.top_k) + first_alignment = first_back_track(m, n, + first_pointers, first_path, + first_alignment_types) + print("First-pass alignment takes {:.3f} seconds.".format(time.time() - t_2)) + + # Find optimal m-to-n alignments using dynamic programming. t_3 = time.time() - second_w, second_search_path = find_second_search_path(first_alignment, args.win, src_len, tgt_len) - second_alignment_types = make_alignment_types(args.max_align) - second_pointers = second_pass_align(src_vecs, tgt_vecs, src_lens, tgt_lens, second_w, second_search_path, second_alignment_types, char_ratio, args.skip, margin=args.margin) - second_alignment = second_back_track(src_len, tgt_len, second_pointers, second_search_path, second_alignment_types) + second_alignment_types = get_alignment_types(args.max_align) + second_w, second_path = find_second_path(first_alignment, args.win, m, n) + second_pointers = second_pass_align(src_vecs, tgt_vecs, src_lens, tgt_lens, + second_w, second_path, second_alignment_types, + char_ratio, args.skip, margin=args.margin) + second_alignment = second_back_track(m, n, second_pointers, + second_path, second_alignment_types) print("Second pass alignment takes {:.3f}".format(time.time() - t_3)) - # save alignment - print_alignments(second_alignment, align_file) - -def second_back_track(i, j, b, search_path, a_types): - alignment = [] - while ( i !=0 and j != 0 ): - j_offset = j - search_path[i][0] - a = b[i][j_offset] - s = a_types[a][0] - t = a_types[a][1] - src_range = [i - offset - 1 for offset in range(s)][::-1] - tgt_range = [j - offset - 1 for offset in range(t)][::-1] - alignment.append((src_range, tgt_range)) + # save alignment results + print_alignments(second_alignment, out_file) - i = i-s - j = j-t - - return alignment[::-1] +def print_alignments(alignments, out): + with open(out, 'wt', encoding='utf-8') as f: + for x, y in alignments: + f.write("{}:{}\n".format(x, y)) @nb.jit(nopython=True, fastmath=True, cache=True) -def second_pass_align(src_vecs, tgt_vecs, src_lens, tgt_lens, w, search_path, align_types, char_ratio, skip, margin=False): +def second_pass_align(src_vecs, + tgt_vecs, + src_lens, + tgt_lens, + w, + search_path, + align_types, + char_ratio, + skip, + margin=False): + """ + Perform the second-pass alignment to extract n-m bitext segments. + Args: + src_vecs: numpy array of shape (max_align-1, num_src_sents, embedding_size). + tgt_vecs: numpy array of shape (max_align-1, num_tgt_sents, embedding_size) + src_lens: numpy array of shape (max_align-1, num_src_sents). + tgt_lens: numpy array of shape (max_align-1, num_tgt_sents). + w: int. Predefined window size for the second-pass alignment. + search_path: numpy array. Second-pass alignment search path. + align_types: numpy array. Second-pass alignment types. + char_ratio: float. Ratio between source length to target length. + skip: float. Cost for instertion and deletion. + margin: boolean. Set to true if choosing modified cosine similarity score. + Returns: + pointers: numpy array recording best alignments for each DP cell. + """ src_len = src_vecs.shape[1] tgt_len = tgt_vecs.shape[1] - # intialize sum matrix + # Intialize cost and backpointer matrix cost = np.zeros((src_len + 1, w)) - #back = np.zeros((tgt_len + 1, w), dtype=nb.int64) back = np.zeros((src_len + 1, w), dtype=nb.int64) cost[0][0] = 0 back[0][0] = -1 @@ -171,7 +171,11 @@ def second_pass_align(src_vecs, tgt_vecs, src_lens, tgt_lens, w, search_path, al tgt_v = tgt_vecs[a_2-1,j-1,:] src_l = src_lens[a_1-1, i-1] tgt_l = tgt_lens[a_2-1, j-1] - cur_score = get_score(src_v, tgt_v, a_1, a_2, i, j, src_vecs, tgt_vecs, src_len, tgt_len, margin=margin) + cur_score = get_score(src_v, tgt_v, + a_1, a_2, i, j, + src_vecs, tgt_vecs, + src_len, tgt_len, + margin=margin) tgt_l = tgt_l * char_ratio min_len = min(src_l, tgt_l) max_len = max(src_l, tgt_l) @@ -189,15 +193,36 @@ def second_pass_align(src_vecs, tgt_vecs, src_lens, tgt_lens, w, search_path, al return back +def second_back_track(i, j, b, search_path, a_types): + alignment = [] + while ( i !=0 and j != 0 ): + j_offset = j - search_path[i][0] + a = b[i][j_offset] + s = a_types[a][0] + t = a_types[a][1] + src_range = [i - offset - 1 for offset in range(s)][::-1] + tgt_range = [j - offset - 1 for offset in range(t)][::-1] + alignment.append((src_range, tgt_range)) + + i = i-s + j = j-t + + return alignment[::-1] + @nb.jit(nopython=True, fastmath=True, cache=True) -def get_score(src_v, tgt_v, a_1, a_2, i, j, src_vecs, tgt_vecs, src_len, tgt_len, margin=False): +def get_score(src_v, tgt_v, + a_1, a_2, + i, j, + src_vecs, tgt_vecs, + src_len, tgt_len, + margin=False): similarity = nb_dot(src_v, tgt_v) if margin: tgt_neighbor_ave_sim = get_neighbor_sim(src_v, a_2, j, tgt_len, tgt_vecs) src_neighbor_ave_sim = get_neighbor_sim(tgt_v, a_1, i, src_len, src_vecs) neighbor_ave_sim = (tgt_neighbor_ave_sim + src_neighbor_ave_sim)/2 similarity -= neighbor_ave_sim - + return similarity @nb.jit(nopython=True, fastmath=True, cache=True) @@ -222,17 +247,24 @@ def get_neighbor_sim(vec, a, j, len, db): neighbor_ave_sim = neighbor_left_sim + neighbor_right_sim else: neighbor_ave_sim = (neighbor_left_sim + neighbor_right_sim) / 2 - + return neighbor_ave_sim @nb.jit(nopython=True, fastmath=True, cache=True) def nb_dot(x, y): return np.dot(x,y) -def find_second_search_path(align, w, src_len, tgt_len): +def find_second_path(align, w, src_len, tgt_len): ''' Convert 1-1 alignment from first-pass to the path for second-pass alignment. - The index along X-axis and Y-axis must be consecutive. + The indices along X-axis and Y-axis must be consecutive. + Args: + align: list of tuples. First-pass alignment results. + w: int. Predefined window size for the second path. + src_len: int. Number of source sentences. + tgt_len: int. Number of target sentences. + Returns: + path: numpy array for the second search path. ''' last_bead_src = align[-1][0] last_bead_tgt = align[-1][1] @@ -246,7 +278,7 @@ def find_second_search_path(align, w, src_len, tgt_len): align.pop() align.append((src_len, tgt_len)) - prev_src, prev_tgt = 0,0 + prev_src, prev_tgt = 0, 0 path = [] max_w = -np.inf for src, tgt in align: @@ -262,13 +294,23 @@ def find_second_search_path(align, w, src_len, tgt_len): return max_w + 1, np.array(path) def first_back_track(i, j, b, search_path, a_types): + """ + Retrieve 1-1 alignments from the first-pass DP table. + Args: + i: int. Number of source sentences. + j: int. Number of target sentences. + search_path: numpy array. First-pass search path. + a_types: numpy array. First-pass alignment types. + Returns: + alignment: list of tuples for 1-1 alignments. + """ alignment = [] while ( i !=0 and j != 0 ): j_offset = j - search_path[i][0] a = b[i][j_offset] s = a_types[a][0] t = a_types[a][1] - if a == 2: + if a == 2: # best 1-1 alignment alignment.append((i, j)) i = i-s @@ -277,9 +319,29 @@ def first_back_track(i, j, b, search_path, a_types): return alignment[::-1] @nb.jit(nopython=True, fastmath=True, cache=True) -def first_pass_align(src_len, tgt_len, w, search_path, align_types, dist, index, top_k): - - #initialize cost and backpointer matrix +def first_pass_align(src_len, + tgt_len, + w, + search_path, + align_types, + dist, + index, + top_k): + """ + Perform the first-pass alignment to extract 1-1 bitext segments. + Args: + src_len: int. Number of source sentences. + tgt_len: int. Number of target sentences. + w: int. Window size for the first-pass alignment. + search_path: numpy array. Search path for the first-pass alignment. + align_types: numpy array. Alignment types for the first-pass alignment. + dist: numpy array. Distance matrix for top-k similar vecs. + index: numpy array. Index matrix for top-k similar vecs. + top_k: int. Number of most similar top-k vecs. + Returns: + pointers: numpy array recording best alignments for each DP cell. + """ + # Initialize cost and backpointer matrix. cost = np.zeros((src_len + 1, 2 * w + 1)) pointers = np.zeros((src_len + 1, 2 * w + 1), dtype=nb.int64) cost[0][0] = 0 @@ -323,29 +385,92 @@ def first_pass_align(src_len, tgt_len, w, search_path, align_types, dist, index, return pointers -@nb.jit(nopython=True, fastmath=True, cache=True) -def find_first_search_path(src_len, tgt_len, min_win_size, max_win_size, win_per_100): +def find_first_search_path(src_len, + tgt_len, + min_win_size = 250, + percent=0.06): + """ + Find the window size and search path for the first-pass alignment. + Args: + src_len: int. Number of source sentences. + tgt_len: int. Number of target sentences. + min_win_size: int. Minimum window size. + percent. float. Percent of longer sentences. + Returns: + win_size: int. Window size along the diagonal of the DP table. + search_path: numpy array of shape (src_len + 1, 2), containing the start + and end index of target sentences for each source sentence. + One extra row is added in the search_path for calculation of + deletions and omissions. + """ + win_size = max(min_win_size, int(max(src_len, tgt_len) * percent)) + search_path = [] yx_ratio = tgt_len / src_len - win_size_1 = int(yx_ratio * tgt_len * win_per_100 / 100) - win_size_2 = int(abs(tgt_len - src_len) * 3/4) - w_1 = min(max(min_win_size, max(win_size_1, win_size_2)), max_win_size) - w_2 = int(max(src_len, tgt_len) * 0.06) - w = max(w_1, w_2) - search_path = np.zeros((src_len + 1, 2), dtype=nb.int64) for i in range(0, src_len + 1): center = int(yx_ratio * i) - w_start = max(0, center - w) - w_end = min(center + w, tgt_len) - search_path[i] = [w_start, w_end] - - return w, search_path + win_start = max(0, center - win_size) + win_end = min(center + win_size, tgt_len) + search_path.append([win_start, win_end]) + return win_size, np.array(search_path) + +def get_alignment_types(max_alignment_size): + """ + Get all the possible alignment types. + Args: + max_alignment_size: int. Source sentences number + + Target sentences number <= this value. + Returns: + alignment_types: numpy array. + """ + alignment_types = [[0,1], [1,0]] + for x in range(1, max_alignment_size): + for y in range(1, max_alignment_size): + if x + y <= max_alignment_size: + alignment_types.append([x, y]) + return np.array(alignment_types) + +def find_top_k_sents(src_vecs, tgt_vecs, k=3): + """ + Find the top_k similar vecs in tgt_vecs for each vec in src_vecs. + Args: + src_vecs: numpy array of shape (num_src_sents, embedding_size) + tgt_vecs: numpy array of shape (num_tgt_sents, embedding_size) + k: int. Number of most similar target sentences. + Returns: + D: numpy array. Similarity score matrix of shape (num_src_sents, k). + I: numpy array. Target index matrix of shape (num_src_sents, k). + """ + embedding_size = src_vecs.shape[1] + if torch.cuda.is_available(): # GPU version + res = faiss.StandardGpuResources() + index = faiss.IndexFlatIP(embedding_size) + gpu_index = faiss.index_cpu_to_gpu(res, 0, index) + gpu_index.add(tgt_vecs) + D, I = gpu_index.search(src_vecs, k) + else: # CPU version + index = faiss.IndexFlatIP(embedding_size) + index.add(tgt_vecs) + D, I = index.search(src_vecs, k) + return D, I def doc2feats(sent2line, line_embeddings, lines, num_overlaps): + """ + Convert texts into feature matrix. + Args: + sent2line: dict. Map each sentence to its ID. + line_embeddings: numpy array of sentence embeddings. + lines: list of sentences. + num_overlaps: int. Maximum number of overlapping sentences allowed. + Returns: + vecs0: numpy array of shape (num_overlaps, num_lines, size_embedding) + for overlapping sentence embeddings. + vecs1: numpy array of shape (num_overlap, num_lines) + for overlapping sentence lengths. + """ lines = [preprocess_line(line) for line in lines] vecsize = line_embeddings.shape[1] vecs0 = np.empty((num_overlaps, len(lines), vecsize), dtype=np.float32) vecs1 = np.empty((num_overlaps, len(lines)), dtype=np.int) - for ii, overlap in enumerate(range(1, num_overlaps + 1)): for jj, out_line in enumerate(layer(lines, overlap)): try: @@ -353,97 +478,92 @@ def doc2feats(sent2line, line_embeddings, lines, num_overlaps): except KeyError: logger.warning('Failed to find overlap=%d line "%s". Will use random vector.', overlap, out_line) line_id = None - if line_id is not None: vec = line_embeddings[line_id] else: vec = np.random.random(vecsize) - 0.5 - vec = vec / np.linalg.norm(vec) - + vec = vec / np.linalg.norm(vec) vecs0[ii, jj, :] = vec vecs1[ii, jj] = len(out_line.encode("utf-8")) - return vecs0, vecs1 -def preprocess_line(line): - line = line.strip() - if len(line) == 0: - line = 'BLANK_LINE' - - return line - def layer(lines, num_overlaps, comb=' '): """ - make front-padded overlapping sentences + Make front-padded overlapping sentences. """ if num_overlaps < 1: raise Exception('num_overlaps must be >= 1') out = ['PAD', ] * min(num_overlaps - 1, len(lines)) for ii in range(len(lines) - num_overlaps + 1): - out.append(comb.join(lines[ii:ii + num_overlaps])) - + out.append(comb.join(lines[ii:ii + num_overlaps])) return out +def preprocess_line(line): + """ + Clean each line of the text. + """ + line = line.strip() + if len(line) == 0: + line = 'BLANK_LINE' + return line + def read_in_embeddings(text_file, embed_file): + """ + Read in the overlap lines and line embeddings. + Args: + text_file: str. Overlap file path. + embed_file: str. Embedding file path. + Returns: + sent2line: dict. Map overlap sentences to line IDs. + line_embeddings: numpy array of the shape (num_lines, embedding_size). + For sentence-transformers, the embedding_size is 768. + """ sent2line = dict() - with open(text_file, 'rt', encoding="utf-8") as fin: - for ii, line in enumerate(fin): - if line.strip() in sent2line: - raise Exception('got multiple embeddings for the same line') - sent2line[line.strip()] = ii - - line_embeddings = np.fromfile(embed_file, dtype=np.float32, count=-1) - if line_embeddings.size == 0: - raise Exception('Got empty embedding file') - + with open(text_file, 'rt', encoding="utf-8") as f: + for i, line in enumerate(f): + sent2line[line.strip()] = i + line_embeddings = np.fromfile(embed_file, dtype=np.float32) embedding_size = line_embeddings.size // len(sent2line) line_embeddings.resize(line_embeddings.shape[0] // embedding_size, embedding_size) - return sent2line, line_embeddings -def make_alignment_types(max_alignment_size): - # Return list of all (n,m) where n+m <= this - alignment_types = [] - for x in range(1, max_alignment_size): - for y in range(1, max_alignment_size): - if x + y <= max_alignment_size: - alignment_types.append([x, y]) - alignment_types = [[0,1], [1,0]] + alignment_types - - return np.array(alignment_types) - -def create_jobs(meta, src, tgt, out): +def create_jobs(meta_data_file, src_dir, tgt_dir, alignment_dir): + """ + Creat a job list consisting of source, target and alignment file paths. + """ jobs = [] - fns = get_fns(meta) - for file in fns: - src_path = os.path.abspath(os.path.join(src, file)) - tgt_path = os.path.abspath(os.path.join(tgt, file)) - - out_path = os.path.abspath(os.path.join(out, file + '.align')) - jobs.append('\t'.join([src_path, tgt_path, out_path])) - + text_ids = get_text_ids(meta_data_file) + for id in text_ids: + src_path = os.path.abspath(os.path.join(src_dir, id)) + tgt_path = os.path.abspath(os.path.join(tgt_dir, id)) + out_path = os.path.abspath(os.path.join(alignment_dir, id + '.align')) + jobs.append('\t'.join([src_path, tgt_path, out_path])) return jobs -def get_fns(meta): - fns = [] - with open(meta, 'rt', encoding='utf-8') as f: +def get_text_ids(meta_data_file): + """ + Get the text IDs to be aligned. + Args: + meta_data_file: str. TSV file with the first column being text ID. + Returns: + text_ids: list. + """ + text_ids = [] + with open(meta_data_file, 'rt', encoding='utf-8') as f: next(f) # skip header for line in f: recs = line.strip().split('\t') - fns.append(recs[0]) + text_ids.append(recs[0]) + return text_ids - return fns - -def print_alignments(alignments, out): - with open(out, 'wt', encoding='utf-8') as f: - for x, y in alignments: - f.write("{}:{}\n".format(x, y)) - -def make_dir(path): - if os.path.isdir(path): - shutil.rmtree(path) - os.makedirs(path, exist_ok=True) - +def make_dir(auto_alignment_path): + """ + Make an empty diretory for saving automatic alignment results. + """ + if os.path.isdir(auto_alignment_path): + shutil.rmtree(auto_alignment_path) + os.makedirs(auto_alignment_path, exist_ok=True) + if __name__ == '__main__': t_0 = time.time() main()