first commit
This commit is contained in:
159
utils/eval_mac.pl
Normal file
159
utils/eval_mac.pl
Normal file
@@ -0,0 +1,159 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
# bfsujason@163.com
|
||||
# 2021.02.02
|
||||
|
||||
# This script evaluates the performance of sentence alignment algorithms.
|
||||
|
||||
# Usage:
|
||||
# perl eval.pl --meta ../corpus/test/meta_data.tsv --gold ../corpus/test/gold --auto ../corpus/test/auto --by book|chapter|align
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use 5.010;
|
||||
use utf8;
|
||||
|
||||
use File::Spec;
|
||||
use Getopt::Long;
|
||||
use List::Util qw(first);
|
||||
|
||||
binmode(STDOUT, ":utf8");
|
||||
|
||||
sub main {
|
||||
GetOptions( \ my %opts,
|
||||
'meta=s',
|
||||
'gold=s',
|
||||
'auto=s',
|
||||
'by=s'
|
||||
);
|
||||
my $meta = read_meta($opts{meta});
|
||||
my $gold = [];
|
||||
my $auto = [];
|
||||
foreach my $record ( @{$meta} ) {
|
||||
my $text_id = $record->[0];
|
||||
my $book_id = $record->[1];
|
||||
my $cur_gold = read_align(File::Spec->catfile($opts{gold}, $text_id . '.align'), $text_id, $book_id);
|
||||
my $cur_auto = read_align(File::Spec->catfile($opts{auto}, $text_id . '.align'), $text_id, $book_id);
|
||||
push @{$gold}, @{$cur_gold};
|
||||
push @{$auto}, @{$cur_auto};
|
||||
}
|
||||
|
||||
my ($p, $r, $f1) = _eval($gold, $auto);
|
||||
say "\nOveral performance:";
|
||||
say join "\t", ("P: $p", "R: $r", "F1: $f1");
|
||||
|
||||
my $gold_by_group = _group_align($gold, $opts{by});
|
||||
my $auto_by_group = _group_align($auto, $opts{by});
|
||||
|
||||
say "\nPerformance by $opts{by}:";
|
||||
foreach my $k ( sort {$a <=> $b} keys %{$gold_by_group} ) {
|
||||
my ($p, $r, $f1) = _eval($gold_by_group->{$k}, $auto_by_group->{$k});
|
||||
say join "\t", ($k, "P: $p", "R: $r", "F1: $f1");
|
||||
}
|
||||
}
|
||||
|
||||
sub _group_align {
|
||||
my ($align, $by) = @_;
|
||||
my $align_by_group = {};
|
||||
my $group_id;
|
||||
if ( $by eq 'book' ) {
|
||||
$group_id = 0;
|
||||
} elsif ( $by eq 'chapter' ) {
|
||||
$group_id = 1;
|
||||
} elsif ( $by eq 'align' ) {
|
||||
$group_id = 3;
|
||||
}
|
||||
|
||||
foreach my $item ( @{$align} ) {
|
||||
my @records = split /\|\|/, $item;
|
||||
push @{$align_by_group->{$records[$group_id]}}, $item;
|
||||
}
|
||||
return $align_by_group;
|
||||
}
|
||||
|
||||
sub _group_align_old {
|
||||
my $align = shift;
|
||||
my $align_by_book = {};
|
||||
my $align_by_type = {};
|
||||
foreach my $item ( @{$align} ) {
|
||||
my ($book_id, $text_id, $bead, $type) = split /\|\|/, $item;
|
||||
push @{$align_by_book->{$book_id}}, $item;
|
||||
push @{$align_by_type->{$type}}, $item;
|
||||
}
|
||||
return ($align_by_book, $align_by_type);
|
||||
}
|
||||
|
||||
sub _eval {
|
||||
my ($gold, $auto) = @_;
|
||||
my $intersect = find_intersect($gold, $auto);
|
||||
my $gold_num = scalar @{$gold};
|
||||
my $auto_num = scalar @{$auto};
|
||||
my ($p, $r, $f1) = (0, 0, 0);
|
||||
$p = sprintf("%.3f", $intersect / $auto_num);
|
||||
$r = sprintf("%.3f", $intersect / $gold_num);
|
||||
if ( $p + $r > 0 ) {
|
||||
$f1 = sprintf("%.3f", (2 * $p * $r) / ($p + $r));
|
||||
}
|
||||
return ($p, $r, $f1);
|
||||
}
|
||||
|
||||
sub find_intersect {
|
||||
my ($gold_align, $auto_align) = @_;
|
||||
my $intersect = 0;
|
||||
foreach my $bead ( @{$gold_align} ) {
|
||||
my $match = first {$_ eq $bead} @{$auto_align};
|
||||
$intersect++ if $match;
|
||||
}
|
||||
return $intersect;
|
||||
}
|
||||
|
||||
# parse align file
|
||||
sub read_align {
|
||||
my ($auto_align_fn, $text_id, $book_id) = @_;
|
||||
my $auto_align = [];
|
||||
open my $IN, '<:utf8', $auto_align_fn;
|
||||
while ( defined(my $bead = <$IN>) ) {
|
||||
chomp $bead;
|
||||
$bead =~ s/\s+//g;
|
||||
my ($src, $tgt) = split /:/, $bead;
|
||||
my $src_type = get_seg_type($src);
|
||||
my $tgt_type = get_seg_type($tgt);
|
||||
#my $seg_type = join "<=>", ($src_type, $tgt_type);
|
||||
my $seg_type = $src_type + $tgt_type;
|
||||
$bead = join "||", ($book_id, $text_id, $bead, $seg_type);
|
||||
push @{$auto_align}, $bead;
|
||||
}
|
||||
return $auto_align;
|
||||
}
|
||||
|
||||
sub get_seg_type {
|
||||
my $seg = shift;
|
||||
my $type = 0;
|
||||
if ( $seg ne '[]' ) {
|
||||
my @idx = split /\,/,$seg;
|
||||
$type = scalar @idx;
|
||||
}
|
||||
return $type;
|
||||
}
|
||||
|
||||
# parse metadata file
|
||||
sub read_meta {
|
||||
my $meta_fn = shift;
|
||||
my $meta = [];
|
||||
open my $IN, '<:utf8', $meta_fn;
|
||||
while ( defined(my $line = <$IN>) ) {
|
||||
next if $. == 1;
|
||||
next if $line =~ /^#/;
|
||||
chomp $line;
|
||||
my @records = split /\t/, $line;
|
||||
push @{$meta}, [$records[0], $records[1]];
|
||||
}
|
||||
return $meta;
|
||||
}
|
||||
|
||||
unless ( caller ) {
|
||||
main();
|
||||
}
|
||||
|
||||
__END__
|
||||
Reference in New Issue
Block a user