111 lines
3.1 KiB
Python
111 lines
3.1 KiB
Python
# 2021/11/27
# bfsujason@163.com
|
|
|
|
"""
|
|
Usage:
|
|
|
|
python ext-lib/bleualign/bleualign.py \
|
|
-m data/mac/test/meta_data.tsv \
|
|
-s data/mac/test/zh \
|
|
-t data/mac/test/en \
|
|
-o data/mac/test/auto
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import shutil
|
|
import argparse
|
|
|
|
def main():
    """Parse the command line and run the Bleualign pipeline end to end.

    The output directory is (re)created, one job line per file listed in the
    metadata file is written to ``bleualign.job`` inside it, the batch aligner
    script is run on that job file, and finally the per-side alignment files
    are merged into ``.align`` files.
    """
    cli = argparse.ArgumentParser(description='Sentence alignment using Bleualign')

    # All path-like options share the same shape: required string arguments.
    required_options = (
        ('-s', '--src', 'Source directory.'),
        ('-t', '--tgt', 'Target directory.'),
        ('-o', '--out', 'Output directory.'),
        ('-m', '--meta', 'Metadata file.'),
    )
    for short_flag, long_flag, help_text in required_options:
        cli.add_argument(short_flag, long_flag, type=str, required=True, help=help_text)
    cli.add_argument('--tok', action='store_true', help='Use tokenized source trans and target text.')
    opts = cli.parse_args()

    make_dir(opts.out)

    job_file = os.path.abspath(os.path.join(opts.out, 'bleualign.job'))
    write_jobs(
        create_jobs(opts.meta, opts.src, opts.tgt, opts.out, opts.tok),
        job_file,
    )

    # The aligner script path is resolved against the current working directory.
    aligner = os.path.abspath('ext-lib/bleualign/batch_align.py')
    run_bleualign(aligner, job_file)

    convert_format(opts.out)
|
|
|
|
def convert_format(dir):
    """Merge every source/target alignment pair in *dir* into one ``.align`` file.

    For each entry whose name ends in ``-s``, the matching ``<id>.align-t``
    file is read alongside it, the combined result is written to
    ``<id>.align``, and both per-side files are deleted.

    Args:
        dir: Directory containing the per-side alignment files.

    Raises:
        OSError: If a matching ``.align-t`` file is missing or a file cannot
            be read, written or removed.
    """
    for name in os.listdir(dir):
        if not name.endswith('-s'):
            continue

        # Drop only the final extension. Splitting on the first dot would
        # truncate IDs that themselves contain dots ("001.txt.align-s" ->
        # "001") and the target-side file would then not be found.
        file_id = name.rsplit('.', 1)[0]

        src = os.path.join(dir, name)
        tgt = os.path.join(dir, file_id + '.align-t')
        out = os.path.join(dir, file_id + '.align')

        _convert_format(src, tgt, out)

        # The per-side files are intermediate artefacts; remove them once merged.
        os.unlink(src)
        os.unlink(tgt)
|
|
|
|
def _convert_format(src, tgt, path):
    """Write paired source/target alignments to *path*, one pair per line.

    Both alignment files are read with ``read_alignment``; the i-th entries of
    each are written as ``<source list>:<target list>``. Pairing stops at the
    shorter of the two inputs.
    """
    pairs = zip(read_alignment(src), read_alignment(tgt))
    rendered = ["{}:{}\n".format(src_ids, tgt_ids) for src_ids, tgt_ids in pairs]
    with open(path, 'wt', encoding='utf-8') as out:
        out.writelines(rendered)
|
|
|
|
def read_alignment(file):
    """Read an alignment file into a list of integer index lists.

    Each line of *file* holds comma-separated integers. Exactly one list is
    returned per line, so that a source-side and a target-side file read with
    this function stay in lockstep.

    Args:
        file: Path to a UTF-8 text file.

    Returns:
        A list with one ``list[int]`` per input line. A blank line (or empty
        fields such as those left by a trailing comma) contributes no indices,
        so a blank line yields ``[]`` instead of raising on ``int('')``.

    Raises:
        ValueError: If a non-empty field is not a valid integer.
    """
    alignment = []
    with open(file, 'rt', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split(',')
            # Ignore empty fields; ''.split(',') returns [''] for a blank line.
            alignment.append([int(x) for x in fields if x.strip()])

    return alignment
|
|
|
|
def run_bleualign(bin, job):
    """Run the batch aligner script on a job file, then delete the job file.

    The script is launched with the interpreter running this program, and the
    command is passed as an argument list rather than a shell string, so paths
    containing spaces or shell metacharacters are handled correctly.

    Args:
        bin: Path to the aligner Python script.
        job: Path to the job file; it is removed afterwards, even if the
            aligner could not be started.

    Note:
        As before, a non-zero exit status of the aligner is not treated as an
        error here.
    """
    import subprocess

    # Fall back to a PATH lookup only if the interpreter path is unknown.
    interpreter = sys.executable or "python"
    try:
        subprocess.run([interpreter, bin, job], check=False)
    finally:
        os.unlink(job)
|
|
|
|
def write_jobs(jobs, path):
    """Write the job lines to *path*, newline-separated, as UTF-8 text.

    No newline is written after the last job.
    """
    # Build the full payload first so nothing is created if joining fails.
    payload = '\n'.join(jobs)
    with open(path, 'wt', encoding='utf-8') as job_file:
        job_file.write(payload)
|
|
|
|
def create_jobs(meta, src, tgt, out, is_tok):
    """Build one tab-separated job line per file name listed in *meta*.

    Each job line holds four absolute paths, in this order: the ``.trans``
    file in *src*, the source file in *src*, the target file in *tgt* (with a
    ``.tok`` suffix when *is_tok* is true) and the ``.align`` output file in
    *out*.
    """
    tgt_suffix = '.tok' if is_tok else ''

    def absolute(folder, name):
        # Resolve a file name inside a folder to an absolute path.
        return os.path.abspath(os.path.join(folder, name))

    return [
        '\t'.join([
            absolute(src, name + '.trans'),
            absolute(src, name),
            absolute(tgt, name + tgt_suffix),
            absolute(out, name + '.align'),
        ])
        for name in get_fns(meta)
    ]
|
|
|
|
def get_fns(meta):
    """Return the first column of every data row in a tab-separated file.

    The first line of *meta* is treated as a header and skipped.

    Args:
        meta: Path to a UTF-8, tab-separated metadata file.

    Returns:
        A list with the first field of each non-blank row, in file order.
        An empty file yields an empty list.
    """
    fns = []
    with open(meta, 'rt', encoding='utf-8') as f:
        # Skip the header; the default avoids StopIteration on an empty file.
        next(f, None)
        for line in f:
            line = line.strip()
            if not line:
                # A blank row would otherwise add an empty file name.
                continue
            fns.append(line.split('\t')[0])

    return fns
|
|
|
|
def make_dir(path):
    """Ensure *path* is an empty directory.

    An existing directory at *path* is removed together with its contents
    before the directory (and any missing parents) is created.
    """
    already_present = os.path.isdir(path)
    if already_present:
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)
|
|
|
|
if __name__ == '__main__':
    # Time the whole run and report the wall-clock duration.
    t_0 = time.time()
    main()
    elapsed = time.time() - t_0
    print("It takes {:.3f} seconds to align all the sentences.".format(elapsed))
|