51 lines
2.0 KiB
Python
51 lines
2.0 KiB
Python
import os
|
|
import argparse
|
|
|
|
import xml.etree.ElementTree as ET
|
|
#from nltk.metrics import masi_distance, jaccard_distance
|
|
|
|
def main():
    """Print the pooled Jaccard index between two annotators' alignments.

    Command-line arguments:
        -a1 / --anno_1: directory holding annotator 1's alignment files.
        -a2 / --anno_2: directory holding annotator 2's alignment files.

    In each directory only the entries whose name splits into exactly four
    dot-separated parts are used. The two sorted lists are paired by
    position, and for every pair the sizes of the intersection and of the
    union of the alignment sets returned by ``get_alignments`` are added to
    running totals. The printed index is total intersection / total union.

    A warning is written to stderr when the directories contain a different
    number of alignment files (the surplus files are not compared). When the
    total union is empty the script exits with an error message instead of
    raising ``ZeroDivisionError``.
    """
    import sys  # local import: only needed for the diagnostics below

    parser = argparse.ArgumentParser(description='Inter-coder Agreement Calculator')
    parser.add_argument('-a1', '--anno_1', type=str, required=True, help='Intertext alignment directory for annotator 1.')
    parser.add_argument('-a2', '--anno_2', type=str, required=True, help='Intertext alignment directory for annotator 2.')
    args = parser.parse_args()

    anno_1_files = [file for file in sorted(os.listdir(args.anno_1)) if len(file.split('.')) == 4]
    anno_2_files = [file for file in sorted(os.listdir(args.anno_2)) if len(file.split('.')) == 4]

    # zip() stops at the shorter list, so make the truncation visible rather
    # than silently ignoring the surplus files.
    if len(anno_1_files) != len(anno_2_files):
        sys.stderr.write(
            "Warning: {} alignment files in '{}' but {} in '{}'; "
            "files are paired by sorted position and the surplus is ignored.\n".format(
                len(anno_1_files), args.anno_1, len(anno_2_files), args.anno_2
            )
        )

    intersections = 0
    unions = 0
    for anno_1_file, anno_2_file in zip(anno_1_files, anno_2_files):
        alignments_1 = get_alignments(os.path.join(args.anno_1, anno_1_file))
        alignments_2 = get_alignments(os.path.join(args.anno_2, anno_2_file))
        intersections += len(alignments_1.intersection(alignments_2))
        unions += len(alignments_1.union(alignments_2))

    # With no files or no links at all the ratio is undefined.
    if unions == 0:
        sys.exit('Error: no alignment links found; the Jaccard index is undefined.')

    jac = intersections / unions
    print("Jaccard Index: {:.3f}".format(jac))
|
|
|
|
def get_alignments(xml_file):
    """Read one alignment XML file and return its links as a set.

    Each ``link`` element yielded by ``iterfind('link')`` contributes one
    entry. Its ``xtargets`` attribute is split on ``;`` into a target part
    followed by a source part; both parts are converted with ``parse_link``
    and stored as ``(source_bead, target_bead)``, each bead being a tuple so
    that the pair is hashable.
    """
    tree = ET.parse(xml_file)
    beads = set()
    for element in tree.iterfind('link'):
        target_part, source_part = element.get('xtargets').split(';')
        source_ids = tuple(parse_link(source_part))
        target_ids = tuple(parse_link(target_part))
        beads.add((source_ids, target_ids))
    return beads
|
|
|
|
def parse_link(link):
    """Convert one side of an ``xtargets`` value into a list of zero-based ids.

    ``link`` is a string of whitespace-separated items, each containing a
    colon. The integer after the first colon of every item is taken and
    decremented by one.

    Splitting on arbitrary whitespace (rather than on a single space) means
    leading, trailing or repeated blanks no longer produce empty items that
    would fail with ``IndexError``.

    Returns an empty list for an empty or whitespace-only string.
    """
    return [int(item.split(':')[1]) - 1 for item in link.split()]
|
|
|
|
# Run the calculator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|