# 2021/11/30
# bfsujason@163.com

# Convert sentence alignments (*.align files) into InterText XML, TSV or TMX.
#
# Usage:
# python bin/convert_format.py -p xi -s ..\data\split\zh zh -t ..\data\split\en en -a ..\data\zh-en -f intertext

import os
import re
import shutil
import argparse
from ast import literal_eval

# Output formats understood by main().
OUTPUT_FORMATS = ('intertext', 'tsv', 'tmx')


def main():
    """Convert every ``*.align`` file in the alignment directory.

    The output directory ``<alignment>/<format>`` is emptied first (see
    make_dir).  For each ``<id>.align`` file the source and target sentences
    are read from ``<src dir>/<id>`` and ``<tgt dir>/<id>``.
    """
    parser = argparse.ArgumentParser(description='winVecAlign: VecAlign for Windows OS')
    parser.add_argument('-p', '--prj', type=str, required=True, help='Project name.')
    parser.add_argument('-s', '--src', type=str, nargs=2, required=True, help='Source directory and language code.')
    parser.add_argument('-t', '--tgt', type=str, nargs=2, required=True, help='Target directory and language code.')
    parser.add_argument('-a', '--alignment', type=str, required=True, help='Auomatic alignment directory.')
    # Reject unknown formats up front: otherwise the output directory would
    # be wiped and nothing written into it.
    parser.add_argument('-f', '--format', type=str, required=True, choices=OUTPUT_FORMATS, help='Output format.')
    args = parser.parse_args()

    src_dir, src_lang = args.src
    tgt_dir, tgt_lang = args.tgt

    out_dir = os.path.join(args.alignment, args.format)
    make_dir(out_dir)

    # Sorted only to make the processing order deterministic.
    for name in sorted(os.listdir(args.alignment)):
        if not name.endswith('.align'):
            continue
        # The sentence files are named after the part before the first dot.
        file_id = name.split('.')[0]

        src_lines = read_lines(os.path.join(src_dir, file_id))
        tgt_lines = read_lines(os.path.join(tgt_dir, file_id))
        links = read_alignments(os.path.join(args.alignment, name))

        if args.format == 'intertext':
            prj_name = "{}{}{}".format(args.prj, src_lang, tgt_lang)
            src_name = "{}_{}".format(file_id, src_lang)
            tgt_name = "{}_{}".format(file_id, tgt_lang)

            toDoc = '.'.join([prj_name, tgt_name, 'xml'])
            fromDoc = '.'.join([prj_name, src_name, 'xml'])
            linkDoc = '.'.join([prj_name, src_name, tgt_name, 'xml'])

            write_sent_xml(src_lines, out_dir, fromDoc)
            write_sent_xml(tgt_lines, out_dir, toDoc)
            write_link_xml(links, out_dir, toDoc, fromDoc, linkDoc)
        elif args.format == 'tsv':
            tsvDoc = os.path.join(out_dir, args.prj + '.' + name + '.txt')
            write_tsv(src_lines, tgt_lines, links, tsvDoc)
        elif args.format == 'tmx':
            tmxDoc = os.path.join(out_dir, args.prj + '.' + name + '.tmx')
            write_tmx(src_lines, src_lang, tgt_lines, tgt_lang, links, tmxDoc)


def write_tsv(src_lines, tgt_lines, links, tsvDoc):
    """Write one ``source<TAB>target`` row per alignment bead to *tsvDoc*.

    Rows are joined with newlines; no trailing newline is written.
    """
    rows = [
        get_line(bead[0], src_lines) + "\t" + get_line(bead[1], tgt_lines)
        for bead in links
    ]
    with open(tsvDoc, 'wt', encoding="utf-8") as f:
        f.write("\n".join(rows))


def write_link_xml(links, dir, toDoc, fromDoc, linkDoc):
    """Write the alignment link file *linkDoc* into directory *dir*.

    Each bead becomes one ``<link>`` element whose type is
    ``<target count>-<source count>`` and whose xtargets lists the target
    ids, a semicolon, then the source ids.

    NOTE(review): the element and attribute names below were restored from
    the InterText alignment format; confirm them against the tool that
    consumes these files.
    """
    xml_head = "<linkGrp toDoc='{}' fromDoc='{}'>\n".format(toDoc, fromDoc)
    xml_tail = "\n</linkGrp>\n"
    xml_body = []
    for bead in links:
        src_type, src_id = get_link_type_and_id(bead[0])
        tgt_type, tgt_id = get_link_type_and_id(bead[1])
        xml_body.append(
            "<link type='{}-{}' xtargets='{};{}' status='man'/>".format(
                tgt_type, src_type, tgt_id, src_id))

    fp = os.path.join(dir, linkDoc)
    with open(fp, 'wt', encoding="utf-8") as f:
        f.write(xml_head + "\n".join(xml_body) + xml_tail)


def get_link_type_and_id(bead):
    """Return ``(count, ids)`` for one side of a bead.

    *count* is the number of sentence indices in *bead*; *ids* is the
    space-separated list of 1-based ids prefixed with ``1:`` (an empty
    string for an empty bead).
    """
    ids = ' '.join("1:{}".format(idx + 1) for idx in bead)
    return len(bead), ids


def write_sent_xml(lines, dir, doc):
    """Write *lines* as a sentence-level XML document *doc* in *dir*.

    Every sentence gets the id ``1:<n>`` (1-based), matching the ids that
    get_link_type_and_id produces.  ``&`` is replaced by ``and`` and angle
    brackets by apostrophes, so the text needs no XML escaping.

    NOTE(review): the wrapper elements were restored from the InterText
    document format; confirm them against the consuming tool.
    """
    xml_head = "<text>\n<body>\n<p id='1'>\n"
    xml_tail = "\n</p>\n</body>\n</text>"
    xml_body = []
    for idx, line in enumerate(lines):
        line = re.sub('&', 'and', line)
        line = re.sub('<|>', '\'', line)
        xml_body.append(" <s id='1:{}'>{}</s>".format(idx + 1, line))

    fp = os.path.join(dir, doc)
    with open(fp, 'wt', encoding="utf-8") as f:
        f.write(xml_head + "\n".join(xml_body) + xml_tail)


def write_tmx(src_lines, src_lang, tgt_lines, tgt_lang, links, tmxDoc):
    """Write the aligned sentence pairs to *tmxDoc* as a TMX document.

    Language codes are mapped through LANG.TMX; a code missing from that
    table is written unchanged instead of raising KeyError.

    NOTE(review): the header attributes other than ``srclang`` are assumed
    values for the attributes TMX 1.4 requires; adjust them if the original
    header differed.
    """
    src_tag = LANG.TMX.get(src_lang, src_lang)
    tgt_tag = LANG.TMX.get(tgt_lang, tgt_lang)

    tmx_head = """<?xml version="1.0" encoding="UTF-8"?>
<tmx version="1.4">
<header creationtool="winVecAlign" creationtoolversion="1.0" segtype="sentence" o-tmf="PlainText" adminlang="en-US" srclang="{}" datatype="plaintext"/>
<body>""".format(src_tag)
    tmx_tail = """</body>
</tmx>"""
    tmx_body = []
    for bead in links:
        src_line = convert_line(get_line(bead[0], src_lines))
        tgt_line = convert_line(get_line(bead[1], tgt_lines))
        tu = """<tu>
<tuv xml:lang="{}"><seg>{}</seg></tuv>
<tuv xml:lang="{}"><seg>{}</seg></tuv>
</tu>""".format(src_tag, src_line, tgt_tag, tgt_line)
        tmx_body.append(tu)

    with open(tmxDoc, 'wt', encoding="utf-8") as f:
        f.write(tmx_head + "\n" + '\n'.join(tmx_body) + "\n" + tmx_tail)


def convert_line(line):
    """Escape ``&``, ``<`` and ``>`` so *line* is safe as XML text."""
    # The ampersand must be handled first, otherwise the entities produced
    # for the angle brackets would be escaped a second time.
    line = line.replace("&", "&amp;")
    line = line.replace("<", "&lt;")
    line = line.replace(">", "&gt;")
    return line


def get_line(bead, lines):
    """Return the sentences of *bead* joined by single spaces.

    The slice runs from the first to the last index of *bead* inclusive, so
    the indices are treated as one contiguous range.  An empty bead yields
    an empty string.
    """
    if not bead:
        return ''
    return ' '.join(lines[bead[0]:bead[-1] + 1])


def read_lines(path):
    """Return the lines of the UTF-8 file *path*, stripped of whitespace."""
    with open(path, 'rt', encoding="utf-8") as f:
        return [line.strip() for line in f]


def read_alignments(path):
    """Parse an alignment file into a list of ``(src, tgt)`` tuples.

    Each line holds at least two ``:``-separated fields; the first two are
    evaluated as Python literals (lists of sentence indices) and any further
    fields are ignored.  Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line has fewer than two fields or its
            fields are not valid literals.
    """
    alignments = []
    with open(path, 'rt', encoding="utf-8") as infile:
        for line in infile:
            text = line.strip()
            if not text:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            fields = [x.strip() for x in text.split(':') if x.strip()]
            if len(fields) < 2:
                raise ValueError('Got line "%s", which does not have at least two ":" separated fields' % text)
            try:
                src = literal_eval(fields[0])
                tgt = literal_eval(fields[1])
            except (ValueError, SyntaxError, TypeError) as err:
                raise ValueError('Failed to parse line "%s"' % text) from err
            alignments.append((src, tgt))

    return alignments


# Map ISO 639-1 to TMX language code.
class LANG(object):
    TMX = {
        'zh': 'zh-CN',
        'en': 'en-US',
        # NOTE(review): 'UAE' is not a two-letter region code ('AE' is);
        # left unchanged because downstream tools may depend on this value.
        'ar': 'ar-UAE',
        'de': 'de-DE',
        'fr': 'fr-FR',
        'nl': 'nl-NL',
        'it': 'it-IT',
        'ja': 'ja-JP',
        'ru': 'ru-RU',
        'pl': 'pl-PL',
        'es': 'es-ES',
    }


def make_dir(converted_alignment_path):
    """
    Make an empty directory for saving converted alignments.

    An existing directory at that path is deleted together with its contents.
    """
    if os.path.isdir(converted_alignment_path):
        shutil.rmtree(converted_alignment_path)
    os.makedirs(converted_alignment_path, exist_ok=True)


if __name__ == '__main__':
    main()