# Files
# bertalign/ext-lib/bleualign/bleualign.py
# 2021-11-28 13:59:28 +08:00
#
# 111 lines
# 3.1 KiB
# Python

# 2021/11/27
# bfsujason@163.com
"""
Usage:
python ext-lib/bleualign/bleualign.py \
-m data/mac/test/meta_data.tsv \
-s data/mac/test/zh \
-t data/mac/test/en \
-o data/mac/test/auto
"""
import os
import sys
import time
import shutil
import argparse
def main():
    """Align every document listed in the metadata file with Bleualign.

    Parses the command line, recreates the output directory, writes one
    job line per document to ``bleualign.job`` inside it, runs the batch
    aligner script on that job file, and finally merges the per-side
    alignment files into ``.align`` files.
    """
    parser = argparse.ArgumentParser(description='Sentence alignment using Bleualign')
    # The four mandatory options share the same shape, so declare them
    # from a table (order is kept so ``--help`` output is unchanged).
    for short_flag, long_flag, help_text in (
        ('-s', '--src', 'Source directory.'),
        ('-t', '--tgt', 'Target directory.'),
        ('-o', '--out', 'Output directory.'),
        ('-m', '--meta', 'Metadata file.'),
    ):
        parser.add_argument(short_flag, long_flag, type=str, required=True, help=help_text)
    parser.add_argument('--tok', action='store_true', help='Use tokenized source trans and target text.')
    args = parser.parse_args()

    make_dir(args.out)

    job_lines = create_jobs(args.meta, args.src, args.tgt, args.out, args.tok)
    job_path = os.path.abspath(os.path.join(args.out, 'bleualign.job'))
    write_jobs(job_lines, job_path)

    # NOTE: this relative path is resolved against the current working
    # directory, so the script must be launched from the directory that
    # contains ``ext-lib/`` (as in the usage example of this module).
    aligner_script = os.path.abspath('ext-lib/bleualign/batch_align.py')
    run_bleualign(aligner_script, job_path)

    convert_format(args.out)
def convert_format(dir):
    """Merge the per-side alignment files in a directory into ``.align`` files.

    For every ``<id>.align-s`` file found in *dir*, the matching
    ``<id>.align-t`` file is read alongside it, the pair is written to
    ``<id>.align``, and both input files are removed.

    Args:
        dir: Directory holding the ``.align-s`` / ``.align-t`` files.

    Raises:
        FileNotFoundError: If an ``.align-s`` file has no ``.align-t``
            counterpart.
    """
    src_suffix = '.align-s'
    for name in os.listdir(dir):
        if not name.endswith(src_suffix):
            continue
        # Strip the known suffix rather than cutting at the first dot, so
        # that document ids which themselves contain dots stay intact.
        file_id = name[:-len(src_suffix)]
        src = os.path.join(dir, name)
        tgt = os.path.join(dir, file_id + '.align-t')
        out = os.path.join(dir, file_id + '.align')
        _convert_format(src, tgt, out)
        # The per-side files are only intermediate products.
        os.unlink(src)
        os.unlink(tgt)
def _convert_format(src, tgt, path):
    """Combine a source-side and a target-side alignment file into one file.

    Line *i* of the output is ``<src entry i>:<tgt entry i>``, where each
    entry is the list parsed from line *i* of the corresponding input.
    Pairing stops at the end of the shorter input.

    Args:
        src: Path to the source-side alignment file.
        tgt: Path to the target-side alignment file.
        path: Path of the combined file to write (UTF-8).
    """
    src_rows = read_alignment(src)
    tgt_rows = read_alignment(tgt)
    merged = ["{}:{}\n".format(left, right) for left, right in zip(src_rows, tgt_rows)]
    with open(path, 'wt', encoding='utf-8') as out:
        out.writelines(merged)
def read_alignment(file):
    """Read one side of an alignment: a list of integers per line.

    Args:
        file: Path to a UTF-8 text file whose lines hold comma-separated
            integers.

    Returns:
        A list with one entry per line; each entry is the list of ints
        parsed from that line. A blank line yields an empty list.

    Raises:
        ValueError: If a non-blank line contains a field that is not an
            integer.
    """
    alignment = []
    with open(file, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line would otherwise reach int('') and abort the
            # whole conversion; it is read as an empty side instead
            # (presumably an unaligned sentence -- confirm against the
            # aligner's output format).
            alignment.append([int(x) for x in line.split(',')] if line else [])
    return alignment
def run_bleualign(bin, job):
    """Run the batch alignment script on a job file, then delete the job file.

    Args:
        bin: Path to the Python script to execute.
        job: Path to the job file, passed to the script as its only
            argument. It is removed once the script has finished.
    """
    import subprocess  # local import keeps this function self-contained

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact. sys.executable runs the script with the same
    # interpreter as this process instead of whatever "python" is on PATH.
    cmd = [sys.executable or 'python', bin, job]
    try:
        result = subprocess.run(cmd)
        if result.returncode != 0:
            # Failure stays non-fatal, as before, but is no longer silent.
            print("Bleualign exited with status {}.".format(result.returncode),
                  file=sys.stderr)
    finally:
        # The job file is a temporary artefact; remove it even if the
        # aligner could not be started.
        os.unlink(job)
def write_jobs(jobs, path):
    """Write the job lines to a UTF-8 file, one per line.

    Lines are joined with ``\\n``; no trailing newline is added.

    Args:
        jobs: Iterable of job strings.
        path: Destination file; overwritten if it exists.
    """
    payload = '\n'.join(jobs)
    with open(path, 'wt', encoding='utf-8') as handle:
        handle.write(payload)
def create_jobs(meta, src, tgt, out, is_tok):
    """Build one tab-separated job line per file named in the metadata.

    Each line holds four absolute paths, in this order: the ``.trans``
    file in *src*, the source file in *src*, the target file in *tgt*
    (with a ``.tok`` suffix when *is_tok* is true) and the ``.align``
    output path in *out*.

    Args:
        meta: Path to the metadata file listing the file names.
        src: Source directory.
        tgt: Target directory.
        out: Output directory.
        is_tok: Whether to point at the ``.tok`` target files.

    Returns:
        A list of job strings, in metadata order.
    """
    # Only the target side changes with the tokenisation switch.
    tgt_suffix = '.tok' if is_tok else ''
    jobs = []
    for name in get_fns(meta):
        fields = (
            os.path.join(src, name + '.trans'),
            os.path.join(src, name),
            os.path.join(tgt, name + tgt_suffix),
            os.path.join(out, name + '.align'),
        )
        jobs.append('\t'.join(os.path.abspath(field) for field in fields))
    return jobs
def get_fns(meta):
    """Return the first column of a tab-separated metadata file.

    The first line is treated as a header and skipped. Blank lines are
    ignored so that they do not turn into empty file names.

    Args:
        meta: Path to the UTF-8 metadata file.

    Returns:
        A list with the first tab-separated field of every non-blank
        data line, in file order; empty if the file has no data lines.
    """
    with open(meta, 'rt', encoding='utf-8') as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        return [line.strip().split('\t')[0] for line in f if line.strip()]
def make_dir(path):
    """Make sure *path* is an empty directory.

    An existing directory at *path* is deleted together with everything
    in it before the directory is created again.

    Args:
        path: Directory to (re)create; missing parents are created too.
    """
    already_there = os.path.isdir(path)
    if already_there:
        # Wipe the previous contents so the directory starts out empty.
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)
if __name__ == '__main__':
    # Time the whole run, argument parsing included.
    started = time.time()
    main()
    elapsed = time.time() - started
    print("It takes {:.3f} seconds to align all the sentences.".format(elapsed))