first commit
This commit is contained in:
83
utils/hunalign2id.pl
Normal file
83
utils/hunalign2id.pl
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use 5.010;
|
||||
use utf8;
|
||||
|
||||
use Getopt::Long;
|
||||
use File::Spec;
|
||||
|
||||
# Script entry point: parse the command line and convert every alignment
# file in --in_dir into a sentence-ID file of the same name in --out_dir.
#
# Options (both required):
#   --in_dir DIR    directory containing the aligner's text output
#   --out_dir DIR   existing directory that receives the ID files
#
# Dies with a usage message when option parsing fails, when a required
# option is missing, or when either path is not an existing directory.
sub _main {
    my $usage = "Usage: $0 --in_dir DIR --out_dir DIR\n";

    my %opts;
    GetOptions( \%opts,
        'in_dir=s',
        'out_dir=s',
    ) or die $usage;

    # Fail early: without these checks undefined or bogus paths are passed
    # on and the run quietly produces nothing.
    for my $name (qw(in_dir out_dir)) {
        my $path = $opts{$name};
        die "Missing required option --$name\n$usage"
            if !defined $path or $path eq '';
        die "--$name '$path' is not a directory\n$usage"
            if !-d $path;
    }

    # convert text output to sentence IDs
    _para2id_batch($opts{in_dir}, $opts{out_dir});
    return;
}
|
||||
|
||||
# Convert every file name returned by _get_align_fns($in_dir) into a
# sentence-ID file with the same name under $out_dir.
#
# Parameters:
#   $in_dir  - directory holding the input files
#   $out_dir - directory that receives one output file per input file
#
# Each output file contains the strings returned by _para2id, one per line,
# written through the :utf8 layer.
#
# Dies if an output file cannot be opened, written, or closed.
sub _para2id_batch {
    my ($in_dir, $out_dir) = @_;

    my $fns = _get_align_fns($in_dir);
    foreach my $fn ( @{$fns} ) {
        my $in_path  = File::Spec->catfile($in_dir,  $fn);
        my $out_path = File::Spec->catfile($out_dir, $fn);
        my $ids      = _para2id($in_path);

        open my $OUT, '>:utf8', $out_path
            or die "Cannot open '$out_path' for writing: $!\n";
        say {$OUT} join "\n", @{$ids}
            or die "Cannot write to '$out_path': $!\n";
        # Buffered write errors only surface at close, so check it.
        close $OUT
            or die "Cannot close '$out_path': $!\n";
    }
    return;
}
|
||||
|
||||
# Read one tab-separated alignment file and describe each aligned row by the
# running sentence IDs of its source and target sides.
#
# Parameter:
#   $text - path of the file to read (opened through the :utf8 layer)
#
# Each row is split on tabs; the first field is the source side, the second
# the target side, and any further fields (e.g. a score) are ignored.
# _seg2id reports how many segments each side holds, and the per-side
# counters (starting so that the first ID is 0) advance by that amount.
#
# Returns: an array reference of strings "SRC_IDS:TGT_IDS", where each part
# is the bracketed ID list produced by _seg2id (e.g. "[0,1]:[0]").
#
# Dies if the file cannot be opened or closed.
sub _para2id {
    my $text = shift;

    my @para_id;
    open my $IN, '<:utf8', $text
        or die "Cannot open '$text' for reading: $!\n";

    my $src_id = -1;
    my $tgt_id = -1;
    while ( defined( my $line = <$IN> ) ) {
        chomp $line;

        my ($src, $tgt) = split /\t/, $line, 3;
        # A row without a tab has no target column; treat it as empty
        # instead of handing undef to _seg2id.
        $src //= '';
        $tgt //= '';

        # Skip rows where both sides are empty.  Compare against the empty
        # string rather than testing truthiness, otherwise a segment whose
        # text is literally "0" would be dropped and the IDs would drift.
        next if $src eq '' and $tgt eq '';

        my ($src_len, $src_seg_id) = _seg2id($src, $src_id);
        my ($tgt_len, $tgt_seg_id) = _seg2id($tgt, $tgt_id);
        $src_id += $src_len;
        $tgt_id += $tgt_len;

        push @para_id, join ':', ($src_seg_id, $tgt_seg_id);
    }
    close $IN
        or die "Cannot close '$text': $!\n";

    return \@para_id;
}
|
||||
|
||||
# Turn one side of an aligned row into a bracketed list of sentence IDs.
#
# Parameters:
#   $text - the side's text; segments are separated by "~~~" surrounded by
#           whitespace
#   $id   - the last ID already handed out on this side; the first segment
#           here receives $id + 1
#
# Returns the list (segment count, ID string), e.g. (2, '[0,1]').  A side
# with no segments yields (0, '[]').
sub _seg2id {
    my ($text, $id) = @_;

    my @pieces = split /\s+~~~\s+/, $text;
    my $count  = scalar @pieces;

    # Nothing on this side: no IDs are consumed.
    return ($count, '[]') if $count <= 0;

    my $id_list = join ',', map { $id + $_ } 1 .. $count;
    return ($count, '[' . $id_list . ']');
}
|
||||
|
||||
# List the entries of a directory whose names do not start with a dot.
#
# Parameter:
#   $dir - path of the directory to scan
#
# Returns: an array reference of bare entry names (no directory part),
# sorted so that processing order is deterministic.
#
# Dies if $dir is undefined or the directory cannot be opened.
sub _get_align_fns {
    my $dir = shift;

    die "No directory given\n" if !defined $dir;
    opendir( my $DH, $dir )
        or die "Cannot open directory '$dir': $!\n";

    my $fns = [];
    # Test definedness explicitly: an entry named "0" is false, and perls
    # older than 5.12 do not add the implicit defined() for readdir.
    while ( defined( my $fn = readdir $DH ) ) {
        next if $fn =~ /^\./;    # skips ".", ".." and hidden entries
        push @{$fns}, $fn;
    }
    closedir $DH;

    return [ sort @{$fns} ];
}
|
||||
|
||||
# Run _main only when this file is executed directly; when it is loaded by
# other code, caller() is true and nothing is run.
_main() unless caller;
|
||||
|
||||
__END__
|
||||
Reference in New Issue
Block a user