Chinese-English Parallel Corpus
This commit is contained in:
50
compute_ica.py
Normal file
50
compute_ica.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import os
|
||||
import argparse
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
#from nltk.metrics import masi_distance, jaccard_distance
|
||||
|
||||
def main():
    """Print the pooled Jaccard index between two annotators' alignments.

    Reads two directories given on the command line (``--anno_1`` and
    ``--anno_2``), keeps the file names that consist of exactly four
    dot-separated parts, and pairs the two sorted listings position by
    position. For every pair the alignment sets are loaded with
    ``get_alignments``; the sizes of their intersection and union are summed
    over all pairs, and the ratio of the two sums is printed with three
    decimals.

    Raises:
        SystemExit: If the summed union is empty (no matching files, or no
            links in any of them), because the index is undefined then.
    """
    parser = argparse.ArgumentParser(description='Inter-coder Agreement Calculator')
    parser.add_argument('-a1', '--anno_1', type=str, required=True, help='Intertext alignment directory for annotator 1.')
    parser.add_argument('-a2', '--anno_2', type=str, required=True, help='Intertext alignment directory for annotator 2.')
    args = parser.parse_args()

    # Only names with exactly four dot-separated parts are treated as
    # alignment files; everything else in the directories is ignored.
    anno_1_files = [name for name in sorted(os.listdir(args.anno_1)) if len(name.split('.')) == 4]
    anno_2_files = [name for name in sorted(os.listdir(args.anno_2)) if len(name.split('.')) == 4]

    intersections = 0
    unions = 0
    # NOTE(review): files are paired purely by sorted position, and zip()
    # stops at the shorter listing, so both directories are assumed to hold
    # the same set of file names -- confirm against how the data is laid out.
    for anno_1_file, anno_2_file in zip(anno_1_files, anno_2_files):
        alignments_1 = get_alignments(os.path.join(args.anno_1, anno_1_file))
        alignments_2 = get_alignments(os.path.join(args.anno_2, anno_2_file))
        intersections += len(alignments_1 & alignments_2)
        unions += len(alignments_1 | alignments_2)

    # Guard the division: with nothing to compare the original code died
    # with a bare ZeroDivisionError traceback.
    if unions == 0:
        raise SystemExit('No alignment links found; Jaccard index is undefined.')

    jac = intersections / unions
    print("Jaccard Index: {:.3f}".format(jac))
|
||||
|
||||
def get_alignments(xml_file):
    """Load the alignment links of one XML file as a set of hashable pairs.

    Every ``link`` element that is a direct child of the document root is
    read. Its ``xtargets`` attribute is split on ``;`` into two parts: the
    first is treated as the target side and the second as the source side.
    Each side is converted to indices with ``parse_link``.

    Args:
        xml_file: Path of the XML alignment file to parse.

    Returns:
        A set of ``(source_indices, target_indices)`` tuples, where each
        element is itself a tuple of ints. Duplicate links collapse into one.

    Raises:
        AttributeError: If a ``link`` element has no ``xtargets`` attribute.
        ValueError: If ``xtargets`` does not contain exactly one ``;``.
    """
    doc = ET.parse(xml_file)
    alignments = set()
    for link in doc.iterfind('link'):
        # The attribute lists the target side first, then the source side.
        tgt_link, src_link = link.get('xtargets').split(';')
        # Tuples (not lists) so the pair can be stored in a set.
        alignments.add((tuple(parse_link(src_link)), tuple(parse_link(tgt_link))))
    return alignments
|
||||
|
||||
def parse_link(link):
    """Convert one side of an ``xtargets`` value into zero-based indices.

    The string is split on whitespace. For every item, the text between the
    first and second ``:`` (or up to the end of the item) is read as an
    integer and decremented by one. Whatever precedes the first ``:`` is
    discarded.

    Args:
        link: Whitespace-separated items such as ``"1:2 1:3"``; may be empty.

    Returns:
        A list of ints, one per item, in input order. An empty or
        whitespace-only string yields an empty list.

    Raises:
        IndexError: If an item contains no ``:``.
        ValueError: If the part after the ``:`` is not an integer.
    """
    # split() without an argument ignores leading, trailing and repeated
    # whitespace; split(' ') produced empty items there, which then failed
    # with IndexError on the [1] lookup.
    return [int(item.split(':')[1]) - 1 for item in link.split()]
|
||||
|
||||
# Run the calculator only when the file is executed as a script, so that
# importing the module does not trigger argument parsing.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user