diff --git a/bin/convert_format.py b/bin/convert_format.py
new file mode 100644
index 0000000..8d4abdc
--- /dev/null
+++ b/bin/convert_format.py
@@ -0,0 +1,188 @@
+# 2021/11/30
+# bfsujason@163.com
+
+# Usage:
+# python bin\convert_format.py -p xi -s ..\data\split\zh zh -t ..\data\split\en en -a ..\data\zh-en -f intertext
+
+import os
+import re
+import shutil
+import argparse
+from ast import literal_eval
+
def main():
    """Convert every ``*.align`` file of a project to InterText, TSV or TMX.

    Command-line arguments:
        -p/--prj        project name, used as prefix of every output file name.
        -s/--src        source directory followed by the source language code.
        -t/--tgt        target directory followed by the target language code.
        -a/--alignment  directory that holds the ``*.align`` files.
        -f/--format     output format: ``intertext``, ``tsv`` or ``tmx``.

    Output is written to ``<alignment>/<format>``; that directory is emptied
    first by ``make_dir``.  For each ``*.align`` file, the sentence files with
    the same ID (the file name up to its first dot) are read from the source
    and target directories.
    """
    parser = argparse.ArgumentParser(description='winVecAlign: VecAlign for Windows OS')
    parser.add_argument('-p', '--prj', type=str, required=True, help='Project name.')
    parser.add_argument('-s', '--src', type=str, nargs=2, required=True, help='Source directory and language code.')
    parser.add_argument('-t', '--tgt', type=str, nargs=2, required=True, help='Target directory and language code.')
    parser.add_argument('-a', '--alignment', type=str, required=True, help='Automatic alignment directory.')
    # Restricting the choices makes a mistyped format fail at parse time
    # instead of silently producing an empty output directory.
    parser.add_argument('-f', '--format', type=str, required=True,
                        choices=('intertext', 'tsv', 'tmx'), help='Output format.')
    args = parser.parse_args()

    src_dir, src_lang = args.src
    tgt_dir, tgt_lang = args.tgt

    # Check the language codes before the output directory is wiped, so an
    # unsupported code does not destroy earlier results and then crash.
    if args.format == 'tmx':
        for lang in (src_lang, tgt_lang):
            if lang not in LANG.TMX:
                parser.error('No TMX language code is defined for "{}".'.format(lang))

    out_dir = os.path.join(args.alignment, args.format)
    make_dir(out_dir)

    for align_name in os.listdir(args.alignment):
        if not align_name.endswith('.align'):
            continue
        file_id = align_name.split('.')[0]

        src_lines = read_lines(os.path.join(src_dir, file_id))
        tgt_lines = read_lines(os.path.join(tgt_dir, file_id))
        links = read_alignments(os.path.join(args.alignment, align_name))

        if args.format == 'intertext':
            prj_name = "{}{}{}".format(args.prj, src_lang, tgt_lang)
            src_name = "{}_{}".format(file_id, src_lang)
            tgt_name = "{}_{}".format(file_id, tgt_lang)

            toDoc = '.'.join([prj_name, tgt_name, 'xml'])
            fromDoc = '.'.join([prj_name, src_name, 'xml'])
            linkDoc = '.'.join([prj_name, src_name, tgt_name, 'xml'])

            write_sent_xml(src_lines, out_dir, fromDoc)
            write_sent_xml(tgt_lines, out_dir, toDoc)
            write_link_xml(links, out_dir, toDoc, fromDoc, linkDoc)
        elif args.format == 'tsv':
            tsvDoc = os.path.join(out_dir, args.prj + '.' + align_name + '.txt')
            write_tsv(src_lines, tgt_lines, links, tsvDoc)
        elif args.format == 'tmx':
            tmxDoc = os.path.join(out_dir, args.prj + '.' + align_name + '.tmx')
            write_tmx(src_lines, src_lang, tgt_lines, tgt_lang, links, tmxDoc)
+
def write_tsv(src_lines, tgt_lines, links, tsvDoc):
    """Write the aligned sentence pairs as a two-column, tab-separated file.

    Args:
        src_lines: list of source sentences.
        tgt_lines: list of target sentences.
        links: iterable of beads; ``bead[0]`` holds source sentence indices
            and ``bead[1]`` target sentence indices.
        tsvDoc: path of the file to write (UTF-8, no trailing newline).
    """
    rows = []
    for bead in links:
        # A tab inside a sentence would add a spurious column, so it is
        # replaced by a space before the cells are joined.
        src_cell = get_line(bead[0], src_lines).replace("\t", " ")
        tgt_cell = get_line(bead[1], tgt_lines).replace("\t", " ")
        rows.append(src_cell + "\t" + tgt_cell)

    with open(tsvDoc, 'wt', encoding="utf-8") as f:
        f.write("\n".join(rows))
+
def write_link_xml(links, dir, toDoc, fromDoc, linkDoc):
    """Write the alignment links as an InterText link-group XML file.

    Args:
        links: iterable of beads; ``bead[0]`` holds source sentence indices
            and ``bead[1]`` target sentence indices.
        dir: output directory.
        toDoc: file name of the target sentence document.
        fromDoc: file name of the source sentence document.
        linkDoc: file name of the link file to create inside ``dir``.
    """
    # NOTE(review): the element markup is rebuilt from the arguments the
    # format calls receive (to/from document, target-source type, target;
    # source ids); the 'auto' status value is an assumption - confirm.
    xml_head = (
        "<?xml version='1.0' encoding='utf-8'?>\n"
        "<linkGrp toDoc='{}' fromDoc='{}'>\n"
    ).format(toDoc, fromDoc)
    xml_tail = "\n</linkGrp>\n"

    xml_body = []
    for bead in links:
        src_type, src_id = get_link_type_and_id(bead[0])
        tgt_type, tgt_id = get_link_type_and_id(bead[1])
        # The target side comes first in both the type and the xtargets
        # attribute, matching the toDoc/fromDoc order of the header.
        link = "<link type='{}-{}' xtargets='{};{}' status='auto'/>".format(
            tgt_type, src_type, tgt_id, src_id)
        xml_body.append(link)

    fp = os.path.join(dir, linkDoc)
    with open(fp, 'wt', encoding="utf-8") as f:
        f.write(xml_head + "\n".join(xml_body) + xml_tail)
+
def get_link_type_and_id(bead):
    """Return the size of one side of a bead and its sentence-id string.

    Args:
        bead: sequence of 0-based sentence indices (may be empty).

    Returns:
        A tuple ``(count, ids)``: ``count`` is the number of indices and
        ``ids`` the space-separated 1-based ids, each prefixed with ``1:``
        (empty string for an empty bead).
    """
    count = len(bead)
    # Joining an empty sequence already yields '', so no special case is needed.
    ids = ' '.join("1:{}".format(index + 1) for index in bead)
    return count, ids
+
def write_sent_xml(lines, dir, doc):
    """Write sentences as an InterText sentence document.

    All sentences are placed in a single paragraph with id ``1``; sentence
    ``n`` (1-based) gets the id ``1:n``, the id scheme that
    ``get_link_type_and_id`` uses when it refers to sentences.

    Args:
        lines: list of sentences.
        dir: output directory.
        doc: file name of the document to create inside ``dir``.
    """
    # NOTE(review): element names are rebuilt around the surviving
    # whitespace skeleton of the templates - confirm against InterText.
    xml_head = "<?xml version='1.0' encoding='utf-8'?>\n<text>\n <p id='1'>\n"
    xml_tail = "\n </p>\n</text>\n"

    xml_body = []
    for number, line in enumerate(lines, start=1):
        # Markup characters are replaced rather than escaped (lossy): the
        # ampersand becomes the word 'and', angle brackets become quotes.
        line = line.replace('&', 'and')
        line = re.sub('<|>', '\'', line)
        xml_body.append(" <s id='1:{}'>{}</s>".format(number, line))

    fp = os.path.join(dir, doc)
    with open(fp, 'wt', encoding="utf-8") as f:
        f.write(xml_head + "\n".join(xml_body) + xml_tail)
+
def write_tmx(src_lines, src_lang, tgt_lines, tgt_lang, links, tmxDoc):
    """Write the aligned sentence pairs as a TMX translation memory.

    Args:
        src_lines: list of source sentences.
        src_lang: source language code, a key of ``LANG.TMX``.
        tgt_lines: list of target sentences.
        tgt_lang: target language code, a key of ``LANG.TMX``.
        links: iterable of beads; ``bead[0]`` holds source sentence indices
            and ``bead[1]`` target sentence indices.
        tmxDoc: path of the TMX file to write (UTF-8).

    Raises:
        KeyError: if a language code is missing from ``LANG.TMX``.
    """
    src_code = LANG.TMX[src_lang]
    tgt_code = LANG.TMX[tgt_lang]

    # NOTE(review): the markup is rebuilt from the number of format
    # arguments (one for the header, four per translation unit); the header
    # attribute values other than srclang are assumptions - confirm.
    tmx_head = """<?xml version="1.0" encoding="UTF-8"?>
<tmx version="1.4">
<header creationtool="convert_format.py" creationtoolversion="1.0" datatype="plaintext" segtype="sentence" adminlang="en-US" srclang="{}" o-tmf="txt"/>
<body>""".format(src_code)
    tmx_tail = """</body>
</tmx>"""

    tmx_body = []
    for bead in links:
        # Segment text must be escaped so it cannot break the XML.
        src_line = convert_line(get_line(bead[0], src_lines))
        tgt_line = convert_line(get_line(bead[1], tgt_lines))
        tu = """<tu>
<tuv xml:lang="{}"><seg>{}</seg></tuv>
<tuv xml:lang="{}"><seg>{}</seg></tuv>
</tu>""".format(src_code, src_line, tgt_code, tgt_line)
        tmx_body.append(tu)

    with open(tmxDoc, 'wt', encoding="utf-8") as f:
        f.write(tmx_head + "\n" + '\n'.join(tmx_body) + "\n" + tmx_tail)
+
def convert_line(line):
    """Escape the XML special characters of ``line``.

    The ampersand, less-than and greater-than characters are replaced by
    their predefined XML entities so the text can be embedded in an element.

    Args:
        line: text to escape.

    Returns:
        The escaped text.
    """
    # Imported locally so this function is self-contained; escape() handles
    # the ampersand first, so the entities it produces are not re-escaped.
    from xml.sax.saxutils import escape

    return escape(line)
+
def get_line(bead, lines):
    """Return the text covered by one side of a bead.

    Args:
        bead: sequence of sentence indices; only the first and last index
            are used, every line in between is included.
        lines: list of sentences.

    Returns:
        The lines from ``bead[0]`` through ``bead[-1]`` joined by a single
        space, or an empty string for an empty bead.
    """
    if not bead:
        return ''
    return ' '.join(lines[bead[0]:bead[-1] + 1])
+
def read_lines(path):
    """Read a UTF-8 text file into a list of stripped lines.

    Args:
        path: path of the file to read.

    Returns:
        One string per line of the file with leading and trailing whitespace
        removed; empty lines are kept as empty strings, so list positions
        match line positions in the file.
    """
    with open(path, 'rt', encoding="utf-8") as f:
        return [line.strip() for line in f]
+
def read_alignments(path):
    """Read an alignment file into a list of ``(src, tgt)`` beads.

    Each non-blank line holds ``:``-separated fields; the first two are
    Python literals (e.g. ``[0, 1]``) and are parsed with ``literal_eval``.
    Any further field is ignored.

    Args:
        path: path of the alignment file (UTF-8).

    Returns:
        List of ``(src, tgt)`` tuples in file order.

    Raises:
        ValueError: if a non-blank line has fewer than two non-empty fields
            or one of its first two fields is not a valid Python literal.
    """
    alignments = []
    with open(path, 'rt', encoding="utf-8") as infile:
        for line in infile:
            text = line.strip()
            # A blank line (typically at the end of the file) carries no
            # alignment and must not abort the whole file.
            if not text:
                continue
            fields = [x.strip() for x in text.split(':') if x.strip()]
            if len(fields) < 2:
                raise ValueError('Got line "%s", which does not have at least two ":" separated fields' % text)
            try:
                src = literal_eval(fields[0])
                tgt = literal_eval(fields[1])
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as err:
                # Only the errors literal_eval raises for malformed input are
                # caught, so KeyboardInterrupt/SystemExit still propagate.
                raise ValueError('Failed to parse line "%s"' % text) from err
            alignments.append((src, tgt))

    return alignments
+
+# Map ISO 639-1 to TMX language code.
class LANG(object):
    """Lookup table from two-letter language codes to TMX language tags."""

    # Each value is a language-region tag used for the TMX language
    # attributes; the region part is an ISO 3166-1 alpha-2 country code.
    TMX = {
        'zh': 'zh-CN',
        'en': 'en-US',
        'ar': 'ar-AE',  # United Arab Emirates: the two-letter region code is AE
        'de': 'de-DE',
        'fr': 'fr-FR',
        'nl': 'nl-NL',
        'it': 'it-IT',
        'ja': 'ja-JP',
        'ru': 'ru-RU',
        'pl': 'pl-PL',
        'es': 'es-ES',
    }
+
def make_dir(converted_alignment_path):
    """
    Make an empty directory for saving converted alignments.

    An existing directory at that path is deleted with all its contents
    before it is created again.
    """
    if os.path.isdir(converted_alignment_path):
        shutil.rmtree(converted_alignment_path)
    # Runs in both cases: the directory is either new or was just removed.
    os.makedirs(converted_alignment_path, exist_ok=True)
+
# Run the conversion only when the file is executed as a script.
if __name__ == '__main__':
    main()