# Files
# mac/mark_disagreement.py
# 2022-05-31 22:27:29 +08:00
#
# 68 lines
# 2.0 KiB
# Python

# bfsujason@163.com
import os
import argparse
import xml.etree.ElementTree as ET
def main():
    """Command-line entry point.

    Reads the two annotators' alignment files named on the command line,
    works out which links each annotator has that the other does not, and
    rewrites both files with those links marked.
    """
    cli = argparse.ArgumentParser(
        description='Mark up disagreements between annotators',
    )
    cli.add_argument(
        '-a1', '--anno_1',
        type=str,
        required=True,
        help='Intertext alignment file for annotator 1.',
    )
    cli.add_argument(
        '-a2', '--anno_2',
        type=str,
        required=True,
        help='Intertext alignment file for annotator 2.',
    )
    opts = cli.parse_args()

    first = get_alignments(opts.anno_1)
    second = get_alignments(opts.anno_2)

    # Both difference lists are computed before either file is rewritten.
    only_in_first = find_diff(first, second)
    only_in_second = find_diff(second, first)

    for positions, path in (
        (only_in_first, opts.anno_1),
        (only_in_second, opts.anno_2),
    ):
        mark_diff(positions, path)
def mark_diff(diff, xml_file):
    """Flag the listed links in an alignment file and rewrite it in place.

    Every ``link`` element yielded by ``iterfind('link')`` first has any
    existing ``mark`` attribute removed, so marks from an earlier run do not
    linger. Links whose zero-based position appears in ``diff`` then receive
    ``mark="1"``.

    Args:
        diff: Collection of zero-based link positions to mark.
        xml_file: Path of the XML file to read and overwrite.
    """
    doc = ET.parse(xml_file)
    print(diff)
    # A set gives constant-time membership tests instead of rescanning the
    # list once per link.
    marked = set(diff)
    for position, link in enumerate(doc.iterfind('link')):
        link.attrib.pop('mark', None)
        if position in marked:
            link.set('mark', '1')
    doc.write(xml_file)
def find_diff(alignments_1, alignments_2):
    """Return the positions of links in ``alignments_1`` missing from ``alignments_2``.

    Args:
        alignments_1: Sequence of ``(src_bead, tgt_bead)`` pairs, where each
            bead is a sequence of hashable items.
        alignments_2: Sequence of pairs in the same form to compare against.

    Returns:
        Zero-based positions in ``alignments_1``, in ascending order, of every
        link that has no equal counterpart in ``alignments_2``. A link that
        occurs more than once in ``alignments_1`` is reported at each of its
        positions.
    """
    # Beads are converted to tuples so that links can be stored in a set.
    other = {(tuple(src), tuple(tgt)) for src, tgt in alignments_2}
    # One ordered pass yields ascending indices directly, with no
    # set -> list -> list.index() round trip.
    return [
        idx
        for idx, (src, tgt) in enumerate(alignments_1)
        if (tuple(src), tuple(tgt)) not in other
    ]
def find_idxs(links, alignments):
    """Look up where each of ``links`` first occurs in ``alignments``.

    Args:
        links: Iterable of links to locate.
        alignments: List that is searched with ``list.index``.

    Returns:
        The found positions as a list in ascending order.

    Raises:
        ValueError: If a link is not present in ``alignments``.
    """
    return sorted(alignments.index(wanted) for wanted in links)
def get_alignments(xml_file):
    """Load the links of an alignment XML file as ``(src_bead, tgt_bead)`` pairs.

    The ``xtargets`` attribute of each ``link`` element is split on ``;``.
    The part before the semicolon is treated as the target side and the part
    after it as the source side; both are decoded with ``parse_link``.

    Args:
        xml_file: Path of the XML file to parse.

    Returns:
        A list with one ``(src_bead, tgt_bead)`` tuple per link, in document
        order.
    """
    tree = ET.parse(xml_file)
    halves = (elem.get('xtargets').split(';') for elem in tree.iterfind('link'))
    # Note the swap: the attribute stores target first, the result source first.
    return [
        (parse_link(src_ids), parse_link(tgt_ids))
        for tgt_ids, src_ids in halves
    ]
def parse_link(link):
    """Decode one side of an ``xtargets`` value into zero-based integers.

    The side is a whitespace-separated list of colon-delimited ids. For each
    id the second colon-separated field is read as an integer and reduced by
    one.

    Args:
        link: The id list as a string; may be empty or contain only
            whitespace.

    Returns:
        A list of integers, one per id; empty when ``link`` holds no ids.
    """
    # split() without a separator ignores leading, trailing and repeated
    # whitespace, so stray spaces cannot produce empty items that would
    # break the ':' lookup.
    return [int(item.split(':')[1]) - 1 for item in link.split()]
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()