Baseline alignment systems
This commit is contained in:
110
ext-lib/bleualign/bleualign.py
Normal file
110
ext-lib/bleualign/bleualign.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# 2021/11/27
|
||||
# bfsujason@163.com
|
||||
|
||||
"""
|
||||
Usage:
|
||||
|
||||
python ext-lib/bleualign/bleualign.py \
|
||||
-m data/mac/test/meta_data.tsv \
|
||||
-s data/mac/test/zh \
|
||||
-t data/mac/test/en \
|
||||
-o data/mac/test/auto
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import shutil
|
||||
import argparse
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Sentence alignment using Bleualign')
|
||||
parser.add_argument('-s', '--src', type=str, required=True, help='Source directory.')
|
||||
parser.add_argument('-t', '--tgt', type=str, required=True, help='Target directory.')
|
||||
parser.add_argument('-o', '--out', type=str, required=True, help='Output directory.')
|
||||
parser.add_argument('-m', '--meta', type=str, required=True, help='Metadata file.')
|
||||
parser.add_argument('--tok', action='store_true', help='Use tokenized source trans and target text.')
|
||||
args = parser.parse_args()
|
||||
|
||||
make_dir(args.out)
|
||||
|
||||
jobs = create_jobs(args.meta, args.src, args.tgt, args.out, args.tok)
|
||||
job_path = os.path.abspath(os.path.join(args.out, 'bleualign.job'))
|
||||
write_jobs(jobs, job_path)
|
||||
|
||||
bleualign_bin = os.path.abspath('ext-lib/bleualign/batch_align.py')
|
||||
run_bleualign(bleualign_bin, job_path)
|
||||
|
||||
convert_format(args.out)
|
||||
|
||||
def convert_format(dir):
|
||||
for file in os.listdir(dir):
|
||||
if file.endswith('-s'):
|
||||
file_id = file.split('.')[0]
|
||||
src = os.path.join(dir, file)
|
||||
tgt = os.path.join(dir, file_id + '.align-t')
|
||||
out = os.path.join(dir, file_id + '.align')
|
||||
_convert_format(src, tgt, out)
|
||||
os.unlink(src)
|
||||
os.unlink(tgt)
|
||||
|
||||
def _convert_format(src, tgt, path):
|
||||
src_align = read_alignment(src)
|
||||
tgt_align = read_alignment(tgt)
|
||||
with open(path, 'wt', encoding='utf-8') as f:
|
||||
for x, y in zip(src_align, tgt_align):
|
||||
f.write("{}:{}\n".format(x,y))
|
||||
|
||||
def read_alignment(file):
|
||||
alignment = []
|
||||
with open(file, 'rt', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
alignment.append([int(x) for x in line.split(',')])
|
||||
|
||||
return alignment
|
||||
|
||||
def run_bleualign(bin, job):
|
||||
cmd = "python {} {}".format(bin, job)
|
||||
os.system(cmd)
|
||||
os.unlink(job)
|
||||
|
||||
def write_jobs(jobs, path):
|
||||
jobs = '\n'.join(jobs)
|
||||
with open(path, 'wt', encoding='utf-8') as f:
|
||||
f.write(jobs)
|
||||
|
||||
def create_jobs(meta, src, tgt, out, is_tok):
|
||||
jobs = []
|
||||
fns = get_fns(meta)
|
||||
for file in fns:
|
||||
src_path = os.path.abspath(os.path.join(src, file))
|
||||
trans_path = os.path.abspath(os.path.join(src, file + '.trans'))
|
||||
if is_tok:
|
||||
tgt_path = os.path.abspath(os.path.join(tgt, file + '.tok'))
|
||||
else:
|
||||
tgt_path = os.path.abspath(os.path.join(tgt, file))
|
||||
out_path = os.path.abspath(os.path.join(out, file + '.align'))
|
||||
jobs.append('\t'.join([trans_path, src_path, tgt_path, out_path]))
|
||||
|
||||
return jobs
|
||||
|
||||
def get_fns(meta):
|
||||
fns = []
|
||||
with open(meta, 'rt', encoding='utf-8') as f:
|
||||
next(f) # skip header
|
||||
for line in f:
|
||||
recs = line.strip().split('\t')
|
||||
fns.append(recs[0])
|
||||
|
||||
return fns
|
||||
|
||||
def make_dir(path):
|
||||
if os.path.isdir(path):
|
||||
shutil.rmtree(path)
|
||||
os.makedirs(path, exist_ok=True)
|
||||
|
||||
if __name__ == '__main__':
|
||||
t_0 = time.time()
|
||||
main()
|
||||
print("It takes {:.3f} seconds to align all the sentences.".format(time.time() - t_0))
|
||||
Reference in New Issue
Block a user