Chinese-English Parallel Corpus
This commit is contained in:
67
mark_disagreement.py
Normal file
67
mark_disagreement.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# bfsujason@163.com
|
||||
|
||||
import os
|
||||
import argparse
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def main():
    """Mark the links on which two annotators disagree.

    Reads the two alignment files named on the command line, determines
    which links of each file have no identical counterpart in the other,
    and rewrites both files with those links flagged.
    """
    arg_parser = argparse.ArgumentParser(description='Mark up disagreements between annotators')
    arg_parser.add_argument('-a1', '--anno_1', type=str, required=True, help='Intertext alignment file for annotator 1.')
    arg_parser.add_argument('-a2', '--anno_2', type=str, required=True, help='Intertext alignment file for annotator 2.')
    options = arg_parser.parse_args()

    first_path, second_path = options.anno_1, options.anno_2
    first_links = get_alignments(first_path)
    second_links = get_alignments(second_path)

    # Both diffs are computed before either file is rewritten.
    pending = [
        (find_diff(first_links, second_links), first_path),
        (find_diff(second_links, first_links), second_path),
    ]
    for mismatched, path in pending:
        mark_diff(mismatched, path)
|
||||
|
||||
def mark_diff(diff, xml_file):
    """Flag the disagreeing links of an alignment file, in place.

    Every ``link`` child of the document root first loses any existing
    ``mark`` attribute; links whose zero-based position is listed in
    *diff* then receive ``mark="1"``. The file is overwritten with the
    result, and *diff* is printed to standard output.

    Args:
        diff: Zero-based positions of the links to flag.
        xml_file: Path of the XML alignment file to read and rewrite.
    """
    doc = ET.parse(xml_file)
    print(diff)
    # Set membership keeps the pass linear in the number of links
    # instead of scanning the whole list once per link.
    flagged = set(diff)
    for position, link in enumerate(doc.iterfind('link')):
        # Remove any previous mark so only the current diff is reflected.
        link.attrib.pop('mark', None)
        if position in flagged:
            link.set('mark', '1')
    doc.write(xml_file)
|
||||
|
||||
def find_diff(alignments_1, alignments_2):
    """Return the positions of links in *alignments_1* missing from *alignments_2*.

    Two links are considered the same when both of their beads hold the
    same indices in the same order.

    Args:
        alignments_1: Sequence of two-item links, each item a list of indices.
        alignments_2: Sequence of links in the same form to compare against.

    Returns:
        Ascending zero-based positions in *alignments_1* of every link
        that has no equal link in *alignments_2*. A link that occurs
        several times in *alignments_1* contributes every one of its
        positions, not just the first.
    """
    def as_key(link):
        # Beads are lists, which are unhashable; tuples can go in a set.
        first_bead, second_bead = link
        return tuple(first_bead), tuple(second_bead)

    known = {as_key(link) for link in alignments_2}
    # Walking alignments_1 directly yields the positions already sorted
    # and avoids a list.index scan per differing link.
    return [
        position
        for position, link in enumerate(alignments_1)
        if as_key(link) not in known
    ]
|
||||
|
||||
def find_idxs(links, alignments):
    """Return, in ascending order, where each of *links* sits in *alignments*.

    For every link the position of the first equal entry of *alignments*
    is used; ``list.index`` raises ``ValueError`` if a link is absent.
    """
    positions = [alignments.index(wanted) for wanted in links]
    positions.sort()
    return positions
|
||||
|
||||
def get_alignments(xml_file):
    """Read the links of an XML alignment file as (source, target) bead pairs.

    The ``xtargets`` attribute of each ``link`` child of the root is split
    at the semicolon: the part before it is treated as the target side and
    the part after it as the source side. Each side is converted with
    ``parse_link``.

    Returns:
        A list of ``(src_bead, tgt_bead)`` tuples in document order.
    """
    beads = []
    for element in ET.parse(xml_file).iterfind('link'):
        target_part, source_part = element.get('xtargets').split(';')
        beads.append((parse_link(source_part), parse_link(target_part)))
    return beads
|
||||
|
||||
def parse_link(link):
    """Convert one side of an ``xtargets`` value into zero-based indices.

    Args:
        link: Whitespace-separated items of the form ``prefix:number``,
            for example ``"1:1 1:2"``. May be empty.

    Returns:
        A list with, for each item, the integer that follows its first
        colon, minus one. Empty or whitespace-only input gives ``[]``.
    """
    # split() with no argument ignores leading, trailing and repeated
    # whitespace, so stray spaces cannot produce empty items that would
    # fail on the ':' lookup; it also returns [] for an empty string.
    return [int(item.split(':')[1]) - 1 for item in link.split()]
|
||||
|
||||
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user