first commit

nlpfun
2021-05-18 00:03:45 +08:00
parent ca6ffedb45
commit 6d284528b4
430 changed files with 1467034 additions and 0 deletions

utils/bleualign2id.pl Normal file

@@ -0,0 +1,65 @@
#!/usr/bin/perl
use strict;
use warnings;
use 5.010;
use utf8;
use Getopt::Long;
use File::Spec;
sub _main {
  GetOptions( \ my %opts,
    'in_dir=s',
    'out_dir=s',
  );
  my $src_fns = _get_src_fns($opts{in_dir});
  foreach my $src_fn ( @{$src_fns} ) {
    # the base name is everything before the trailing '-s' / '-t' marker
    my ($base_fn) = $src_fn =~ /(.*)\-/;
    my $src_path = File::Spec->catfile($opts{in_dir}, $src_fn);
    my $tgt_path = File::Spec->catfile($opts{in_dir}, $base_fn . '-t');
    my $out_path = File::Spec->catfile($opts{out_dir}, $base_fn);
    #say $src_path;
    #say $tgt_path;
    #say $out_path;
    my $src_seg = _read_align($src_path);
    my $tgt_seg = _read_align($tgt_path);
    my @bi_seg = map {
      '[' . $src_seg->[$_] . ']' .
      ':' .
      '[' . $tgt_seg->[$_] . ']'
    } ( 0 .. scalar @{$src_seg} - 1 );
    open my $OUT, '>:utf8', $out_path or die "Cannot open $out_path: $!";
    say $OUT join "\n", @bi_seg;
  }
}

sub _read_align {
  my $file = shift;
  my $align = [];
  open my $IN, '<:utf8', $file or die "Cannot open $file: $!";
  while ( defined(my $line = <$IN>) ) {
    chomp $line;
    push @{$align}, $line;
  }
  return $align;
}

sub _get_src_fns {
  my $dir = shift;
  my $fns = [];
  opendir(my $DH, $dir) or die "Cannot open directory $dir: $!";
  while ( my $fn = readdir($DH) ) {
    next if $fn =~ /^\./;
    # source-side ID files are named like '<base>-s'
    push @{$fns}, $fn if $fn =~ /\-s/;
  }
  return $fns;
}

unless ( caller ) {
  _main();
}
__END__
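
A note on the data this script expects, inferred from the code above (file names and IDs below are made up): the input directory holds one ID-file pair per text, e.g. 001-s and 001-t, whose n-th lines carry the source and target sentence IDs of the n-th bead; bleualign2id.pl zips them line by line into 001 using the bracketed format the evaluation scripts read:

001-s    001-t    001 (output)
0        0,1      [0]:[0,1]
1,2      2        [1,2]:[2]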

utils/create_job.pl Normal file

@@ -0,0 +1,67 @@
#!/usr/bin/perl
use strict;
use warnings;
use 5.010;
use utf8;
use Getopt::Long;
use File::Spec;
sub _main {
  GetOptions( \ my %opts,
    'i=s',      # input directory with the split source/target text files
    'o=s',      # directory where the aligner will write .align files
    'j=s',      # path of the job file to create
    's=s',      # source-language file extension
    't=s',      # target-language file extension
    'trans',    # prepend the <id>.trans machine-translation files as an extra column
  );
  _create_job_file(
    File::Spec->rel2abs($opts{i}),
    File::Spec->rel2abs($opts{o}),
    File::Spec->rel2abs($opts{j}),
    $opts{s},
    $opts{t},
    $opts{trans}
  );
}

sub _create_job_file {
  my ($data_dir, $auto_dir, $job_fn, $src, $tgt, $trans) = @_;
  my ($src_fns, $tgt_fns) = _get_src_tgt_fns($data_dir, $src, $tgt);
  my @align_fns = map { my ($id) = $_ =~ /(\d+)\./; $id . '.align'; } @{$src_fns};
  my @table = map { join "\t", File::Spec->catfile($data_dir, $src_fns->[$_]),
                               File::Spec->catfile($data_dir, $tgt_fns->[$_]),
                               File::Spec->catfile($auto_dir, $align_fns[$_]) } ( 0 .. scalar @{$src_fns} - 1 );
  if ( $trans ) {
    my @trans_fns = map { my ($id) = $_ =~ /(\d+)\./; $id . '.trans'; } @{$src_fns};
    @table = map { join "\t", (File::Spec->catfile($data_dir, $trans_fns[$_]), $table[$_]) } ( 0 .. scalar @table - 1 );
  }
  #my $job_fn = File::Spec->catfile($job_dir, $aligner . '.job');
  # open in raw mode so no CRLF(\r\n) is written on Windows, while keeping UTF-8 output
  open my $OUT, '>:raw:encoding(UTF-8)', $job_fn or die "Cannot open $job_fn: $!";
  print $OUT join "\n", @table;
}

sub _get_src_tgt_fns {
  my ($dir, $src, $tgt) = @_;
  my ($src_fns, $tgt_fns) = ([], []);
  opendir(my $DH, $dir) or die "Cannot open directory $dir: $!";
  # sorting pairs up source and target files that share the same ID
  foreach my $fn ( sort readdir $DH ) {
    next if $fn =~ /^\./;
    push @{$src_fns}, $fn if $fn =~ /\.$src\z/;
    push @{$tgt_fns}, $fn if $fn =~ /\.$tgt\z/;
  }
  return ($src_fns, $tgt_fns);
}

unless ( caller ) {
  _main();
}
__END__
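
For reference, each line of the job file written above holds tab-separated absolute paths: the source text, the target text, and the .align file the aligner should produce; with --trans, the matching <id>.trans file is prepended as a first column. A hypothetical line (paths and extensions invented for illustration):

/path/to/data/001.src    /path/to/data/001.tgt    /path/to/auto/001.align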

utils/embed.py Normal file

@@ -0,0 +1,32 @@
import os
import argparse
import time

import numpy as np
from sentence_transformers import SentenceTransformer


def _main():
    parser = argparse.ArgumentParser(description='Convert sentences into vectors.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input', type=str, help='input file.')
    parser.add_argument('-o', '--output', type=str, help='output file.')
    args = parser.parse_args()
    model = SentenceTransformer('LaBSE')
    embed_texts(model, args.input, args.output)


def get_sents(TEXT):
    sents = []
    with open(TEXT, 'r', encoding="utf-8") as f:
        for line in f:
            sents.append(line.strip())
    return sents


def embed_texts(model, fin, fout):
    t_0 = time.time()
    print("Embedding text {} ...".format(fin))
    txt = get_sents(fin)
    embed = model.encode(txt)  # numpy array of shape (num_sentences, embedding_dim)
    embed.tofile(fout)         # raw float32 bytes, no shape or dtype header
    print("It takes {} seconds".format(time.time() - t_0))


if __name__ == '__main__':
    _main()
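
Because embed.tofile() dumps a raw float32 buffer with no shape or dtype header, downstream code has to know the embedding dimension to rebuild the matrix. A minimal read-back sketch, assuming the LaBSE model used above (768-dimensional vectors) and a made-up file name:

import numpy as np

# 'overlaps.src.emb' is a hypothetical file produced by embed.py; LaBSE vectors are 768-dim float32
vecs = np.fromfile('overlaps.src.emb', dtype=np.float32).reshape(-1, 768)
print(vecs.shape)  # (num_sentences, 768)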

utils/eval_bible.pl Normal file

@@ -0,0 +1,234 @@
#!/usr/bin/perl
# bfsujason@163.com
# 2021.02.15
use strict;
use warnings;
use 5.010;
use utf8;
use File::Spec;
use Getopt::Long;
use List::Util qw(first uniq);
binmode(STDOUT, ":utf8");
sub main {
  GetOptions( \ my %opts,
    'meta=s',
    'gold=s',
    'auto=s',
    'src_verse=s',
    'tgt_verse=s',
  );
  my $meta = read_meta($opts{meta});
  my $src_sent_verse = read_verse($opts{src_verse});
  my $tgt_sent_verse = read_verse($opts{tgt_verse});
  #foreach my $k ( sort {$a <=> $b} keys %{$src_sent_verse} ) {
  #  say $k, '=>', $src_sent_verse->{$k};
  #}
  foreach my $id ( @{$meta} ) {
    my $auto_align = read_align(
      File::Spec->catfile($opts{auto}, $id . '.align')
    );
    my $gold_align = read_align(
      File::Spec->catfile($opts{gold}, $id . '.align')
    );
    # merge sentence-level beads into verse-level beads before scoring
    my $merged_auto_align = merge_align(
      $auto_align, $src_sent_verse, $tgt_sent_verse
    );
    #open my $OUT, '>:utf8', 'merged_align';
    #write_align($OUT, $merged_auto_align);
    my ($p, $r, $f1) = _eval($gold_align, $merged_auto_align);
    say join "\t", ($id, $p, $r, $f1);
  }
}

sub _eval {
  my ($gold, $auto) = @_;
  my $intersect = find_intersect($gold, $auto);
  my $gold_num = scalar @{$gold};
  my $auto_num = scalar @{$auto};
  my ($p, $r, $f1) = (0, 0, 0);
  $p = sprintf("%.3f", $intersect / $auto_num);
  $r = sprintf("%.3f", $intersect / $gold_num);
  if ( $p + $r > 0 ) {
    $f1 = sprintf("%.3f", (2 * $p * $r) / ($p + $r));
  }
  return ($p, $r, $f1);
}

sub find_intersect {
  my ($gold, $auto) = @_;
  my $gold_align = flatten_align($gold);
  my $auto_align = flatten_align($auto);
  my $intersect = 0;
  foreach my $bead ( @{$gold_align} ) {
    my $match = first {$_ eq $bead} @{$auto_align};
    $intersect++ if defined $match;
  }
  return $intersect;
}

sub flatten_align {
  my $align = shift;
  my @flattened_align = map {
    my $src = join ',', @{$_->[0]};
    my $tgt = join ',', @{$_->[1]};
    join '<=>', ($src, $tgt)
  } @{$align};
  return \@flattened_align;
}

sub write_align {
  my ($fh, $align) = @_;
  foreach my $bead ( @{$align} ) {
    my $src = join ",", @{$bead->[0]};
    my $tgt = join ",", @{$bead->[1]};
    say $fh '[' . $src . ']:[' . $tgt . ']';
  }
}

# Collapse consecutive beads whose sentences fall in the same verse into one bead.
sub merge_align {
  my ($align, $src_sent_verse, $tgt_sent_verse) = @_;
  my $merged_align = [];
  my $last_bead_type = '';
  foreach my $bead ( @{$align} ) {
    my $bead_type = find_bead_type(
      $bead, $src_sent_verse, $tgt_sent_verse
    );
    if ( not $last_bead_type ) {
      push @{$merged_align}, $bead;
    } else {
      if ( $bead_type eq $last_bead_type ) {
        push @{$merged_align->[-1]->[0]}, @{$bead->[0]};
        push @{$merged_align->[-1]->[1]}, @{$bead->[1]};
      } else {
        push @{$merged_align}, $bead;
      }
    }
    $last_bead_type = $bead_type;
  }
  return $merged_align;
}

sub find_seg_type {
  my ($seg, $sent2verse) = @_;
  my $seg_len = scalar @{$seg};
  if ( $seg_len == 0 ) {
    return ['NULL'];
  } else {
    my @uniq_seg = uniq map { $sent2verse->{$_} } @{$seg};
    return \@uniq_seg;
  }
}

sub find_bead_type {
  my ($bead, $src_verse, $tgt_verse) = @_;
  my $bead_type = '';
  my $src_seg = $bead->[0];
  my $tgt_seg = $bead->[1];
  my $src_seg_type = find_seg_type($src_seg, $src_verse);
  my $tgt_seg_type = find_seg_type($tgt_seg, $tgt_verse);
  my $src_seg_len = scalar @{$src_seg_type};
  my $tgt_seg_len = scalar @{$tgt_seg_type};
  if ( $src_seg_len != 1 or $tgt_seg_len != 1 ) {
    return $bead_type;
  } else {
    my $src_verse_id = $src_seg_type->[0];
    my $tgt_verse_id = $tgt_seg_type->[0];
    if ( $src_verse_id ne $tgt_verse_id ) {
      if ( $src_verse_id eq 'NULL' ) {
        return $tgt_verse_id;
      } elsif ( $tgt_verse_id eq 'NULL' ) {
        return $src_verse_id;
      } else {
        return $bead_type;
      }
    } else {
      return $src_verse_id;
    }
  }
}

# Earlier variant of find_bead_type; not called anywhere in this script.
sub _find_bead_type {
  my ($bead, $src_verse, $tgt_verse) = @_;
  my $bead_type = '';
  my $src_seg = $bead->[0];
  my $tgt_seg = $bead->[1];
  my $src_seg_len = scalar @{$src_seg};
  my $tgt_seg_len = scalar @{$tgt_seg};
  if ( $src_seg_len == 0 or $tgt_seg_len == 0 ) { # addition OR omission
    return $bead_type;
  } else {
    my @src_seg_verse = uniq map { $src_verse->{$_} } @{$src_seg};
    my @tgt_seg_verse = uniq map { $tgt_verse->{$_} } @{$tgt_seg};
    my $src_seg_verse_len = scalar @src_seg_verse;
    my $tgt_seg_verse_len = scalar @tgt_seg_verse;
    if ( $src_seg_verse_len != 1 or $tgt_seg_verse_len != 1 ) {
      return $bead_type;
    } else {
      if ( $src_seg_verse[0] ne $tgt_seg_verse[0] ) {
        return $bead_type;
      } else {
        $bead_type = $src_seg_verse[0];
        return $bead_type;
      }
    }
  }
}

sub read_align {
  my $file = shift;
  my $align = [];
  open my $IN, "<:utf8", $file or die "Cannot open $file: $!";
  while ( defined(my $line = <$IN>) ) {
    chomp $line;
    $line =~ s/\s+//g;
    my ($src, $tgt) = split /:/, $line;
    $src =~ s/\[|\]//g;
    $tgt =~ s/\[|\]//g;
    my @src = split /,/, $src;
    my @tgt = split /,/, $tgt;
    push @{$align}, [\@src, \@tgt];
  }
  return $align;
}

sub read_verse {
  my $file = shift;
  my $sent2verse = {};
  open my $IN, '<:utf8', $file or die "Cannot open $file: $!";
  while ( defined(my $line = <$IN>) ) {
    chomp $line;
    $sent2verse->{$. - 1} = $line; # 0-based sentence ID => verse ID
  }
  return $sent2verse;
}

sub read_meta {
  my $file = shift;
  my $meta = [];
  open my $IN, '<:utf8', $file or die "Cannot open $file: $!";
  while ( defined(my $line = <$IN>) ) {
    next if $. == 1;       # skip the header line
    next if $line =~ /^#/;
    chomp $line;
    my @records = split /\t/, $line;
    push @{$meta}, $records[0];
  }
  return $meta;
}

unless ( caller ) {
  main();
}
__END__
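
A worked example of the merging step, with made-up verse IDs: if source sentences 0 and 1 and target sentences 0 and 1 all map to verse GEN.1.1, then the consecutive sentence-level beads [0]:[0] and [1]:[1] both receive bead type GEN.1.1, and merge_align collapses them into the single verse-level bead [0,1]:[0,1] before the precision/recall comparison.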

utils/eval_mac.pl Normal file

@@ -0,0 +1,159 @@
#!/usr/bin/perl
# bfsujason@163.com
# 2021.02.02
# This script evaluates the performance of sentence alignment algorithms.
# Usage:
# perl eval_mac.pl --meta ../corpus/test/meta_data.tsv --gold ../corpus/test/gold --auto ../corpus/test/auto --by book|chapter|align
use strict;
use warnings;
use 5.010;
use utf8;
use File::Spec;
use Getopt::Long;
use List::Util qw(first);
binmode(STDOUT, ":utf8");
sub main {
  GetOptions( \ my %opts,
    'meta=s',
    'gold=s',
    'auto=s',
    'by=s'
  );
  my $meta = read_meta($opts{meta});
  my $gold = [];
  my $auto = [];
  foreach my $record ( @{$meta} ) {
    my $text_id = $record->[0];
    my $book_id = $record->[1];
    my $cur_gold = read_align(File::Spec->catfile($opts{gold}, $text_id . '.align'), $text_id, $book_id);
    my $cur_auto = read_align(File::Spec->catfile($opts{auto}, $text_id . '.align'), $text_id, $book_id);
    push @{$gold}, @{$cur_gold};
    push @{$auto}, @{$cur_auto};
  }
  my ($p, $r, $f1) = _eval($gold, $auto);
  say "\nOverall performance:";
  say join "\t", ("P: $p", "R: $r", "F1: $f1");
  my $gold_by_group = _group_align($gold, $opts{by});
  my $auto_by_group = _group_align($auto, $opts{by});
  say "\nPerformance by $opts{by}:";
  foreach my $k ( sort {$a <=> $b} keys %{$gold_by_group} ) {
    my ($p, $r, $f1) = _eval($gold_by_group->{$k}, $auto_by_group->{$k});
    say join "\t", ($k, "P: $p", "R: $r", "F1: $f1");
  }
}

sub _group_align {
  my ($align, $by) = @_;
  my $align_by_group = {};
  my $group_id;
  if ( $by eq 'book' ) {
    $group_id = 0;
  } elsif ( $by eq 'chapter' ) {
    $group_id = 1;
  } elsif ( $by eq 'align' ) {
    $group_id = 3;
  } else {
    die "Unknown --by value: $by (expected book, chapter or align)\n";
  }
  foreach my $item ( @{$align} ) {
    my @records = split /\|\|/, $item;
    push @{$align_by_group->{$records[$group_id]}}, $item;
  }
  return $align_by_group;
}

# Earlier grouping routine; not called anywhere in this script.
sub _group_align_old {
  my $align = shift;
  my $align_by_book = {};
  my $align_by_type = {};
  foreach my $item ( @{$align} ) {
    my ($book_id, $text_id, $bead, $type) = split /\|\|/, $item;
    push @{$align_by_book->{$book_id}}, $item;
    push @{$align_by_type->{$type}}, $item;
  }
  return ($align_by_book, $align_by_type);
}

sub _eval {
  my ($gold, $auto) = @_;
  my $intersect = find_intersect($gold, $auto);
  my $gold_num = scalar @{$gold};
  my $auto_num = scalar @{$auto};
  my ($p, $r, $f1) = (0, 0, 0);
  $p = sprintf("%.3f", $intersect / $auto_num);
  $r = sprintf("%.3f", $intersect / $gold_num);
  if ( $p + $r > 0 ) {
    $f1 = sprintf("%.3f", (2 * $p * $r) / ($p + $r));
  }
  return ($p, $r, $f1);
}

sub find_intersect {
  my ($gold_align, $auto_align) = @_;
  my $intersect = 0;
  foreach my $bead ( @{$gold_align} ) {
    my $match = first {$_ eq $bead} @{$auto_align};
    $intersect++ if defined $match;
  }
  return $intersect;
}

# parse align file
sub read_align {
  my ($auto_align_fn, $text_id, $book_id) = @_;
  my $auto_align = [];
  open my $IN, '<:utf8', $auto_align_fn or die "Cannot open $auto_align_fn: $!";
  while ( defined(my $bead = <$IN>) ) {
    chomp $bead;
    $bead =~ s/\s+//g;
    my ($src, $tgt) = split /:/, $bead;
    my $src_type = get_seg_type($src);
    my $tgt_type = get_seg_type($tgt);
    #my $seg_type = join "<=>", ($src_type, $tgt_type);
    my $seg_type = $src_type + $tgt_type; # total number of sentences in the bead
    $bead = join "||", ($book_id, $text_id, $bead, $seg_type);
    push @{$auto_align}, $bead;
  }
  return $auto_align;
}

sub get_seg_type {
  my $seg = shift;
  my $type = 0;
  if ( $seg ne '[]' ) {
    my @idx = split /\,/, $seg;
    $type = scalar @idx;
  }
  return $type;
}

# parse metadata file
sub read_meta {
  my $meta_fn = shift;
  my $meta = [];
  open my $IN, '<:utf8', $meta_fn or die "Cannot open $meta_fn: $!";
  while ( defined(my $line = <$IN>) ) {
    next if $. == 1;       # skip the header line
    next if $line =~ /^#/;
    chomp $line;
    my @records = split /\t/, $line;
    push @{$meta}, [$records[0], $records[1]];
  }
  return $meta;
}

unless ( caller ) {
  main();
}
__END__
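
Both evaluation scripts use the same intersection-based metric: a gold bead counts as found only when an identical bead occurs in the automatic output, then P = hits / |auto|, R = hits / |gold| and F1 = 2PR / (P + R). A minimal Python sketch on made-up beads (the real scripts also attach book/text IDs to each bead before comparing):

# Toy illustration of the metric implemented by _eval() and find_intersect() above.
gold = ['[0]:[0]', '[1,2]:[1]', '[3]:[2]']
auto = ['[0]:[0]', '[1]:[1]', '[2]:[]', '[3]:[2]']

hits = sum(1 for bead in gold if bead in auto)   # 2
p = hits / len(auto)                             # 0.5
r = hits / len(gold)                             # 0.667
f1 = 2 * p * r / (p + r) if p + r > 0 else 0.0   # 0.571
print(round(p, 3), round(r, 3), round(f1, 3))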

utils/hunalign2id.pl Normal file

@@ -0,0 +1,83 @@
#!/usr/bin/perl
use strict;
use warnings;
use 5.010;
use utf8;
use Getopt::Long;
use File::Spec;
sub _main {
  GetOptions( \ my %opts,
    'in_dir=s',
    'out_dir=s',
  );
  # convert text output to sentence IDs
  _para2id_batch($opts{in_dir}, $opts{out_dir});
}

sub _para2id_batch {
  my ($in_dir, $out_dir) = @_;
  my $fns = _get_align_fns($in_dir);
  foreach my $fn ( @{$fns} ) {
    my $in_path = File::Spec->catfile($in_dir, $fn);
    my $out_path = File::Spec->catfile($out_dir, $fn);
    my $ids = _para2id($in_path);
    open my $OUT, '>:utf8', $out_path or die "Cannot open $out_path: $!";
    say $OUT join "\n", @{$ids};
  }
}

sub _para2id {
  my $text = shift;
  my @para_id;
  open my $IN, '<:utf8', $text or die "Cannot open $text: $!";
  # running sentence counters; IDs in the output are 0-based
  my $src_id = -1;
  my $tgt_id = -1;
  while ( defined(my $line = <$IN>) ) {
    chomp $line;
    #say $line;
    my ($src, $tgt, $score) = split /\t/, $line;
    next if not $src and not $tgt; # skip empty line
    my ($src_len, $src_seg_id) = _seg2id($src, $src_id);
    my ($tgt_len, $tgt_seg_id) = _seg2id($tgt, $tgt_id);
    $src_id += $src_len;
    $tgt_id += $tgt_len;
    push @para_id, join ':', ($src_seg_id, $tgt_seg_id);
  }
  return \@para_id;
}

# Turn one side of a hunalign bead (sentences joined by ' ~~~ ')
# into a bracketed list of consecutive sentence IDs.
sub _seg2id {
  my ($text, $id) = @_;
  my @seg = split /\s+~~~\s+/, $text;
  my $len = scalar @seg;
  if ( $len > 0 ) {
    my @seg_id = map { $id + $_ } ( 1 .. $len );
    my $_seg = join ',', @seg_id;
    $_seg = '[' . $_seg . ']';
    return $len, $_seg;
  } else {
    return $len, '[]';
  }
}

sub _get_align_fns {
  my $dir = shift;
  my $fns = [];
  opendir(my $DH, $dir) or die "Cannot open directory $dir: $!";
  while ( my $fn = readdir $DH ) {
    next if $fn =~ /^\./;
    #say $fn;
    push @{$fns}, $fn;
  }
  return $fns;
}

unless ( caller ) {
  _main();
}
__END__
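
A worked example of the conversion (the input line is an invented sample of hunalign's text-mode output, in which the sentences on one side are joined with ' ~~~ '): with the running counters at src_id = -1 and tgt_id = -1, the line

First sentence. ~~~ Second sentence.<TAB>Target sentence.<TAB>0.54

becomes the ID bead [0,1]:[0]; the counters then advance to 1 and 0, so the next line continues the numbering.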

utils/overlap.py Normal file

@@ -0,0 +1,68 @@
#!/usr/bin/env python3

import os
import argparse


def go(output_file, input_dir, num_overlaps, lang):
    output = set()
    for fin in os.listdir(input_dir):
        if fin.endswith('.' + lang):
            fpath = os.path.join(input_dir, fin)
            with open(fpath, 'rt', encoding="utf-8") as f:
                lines = f.readlines()
            for out_line in yield_overlaps(lines, num_overlaps):
                output.add(out_line)

    # for reproducibility
    output = list(output)
    output.sort()

    with open(output_file, 'wt', encoding="utf-8") as fout:
        for line in output:
            fout.write(line + '\n')


def yield_overlaps(lines, num_overlaps):
    lines = [preprocess_line(line) for line in lines]
    for overlap in range(1, num_overlaps + 1):
        for out_line in layer(lines, overlap):
            # check must be here so all outputs are unique
            out_line2 = out_line[:10000]  # limit line so we don't encode arbitrarily long sentences
            yield out_line2


def layer(lines, num_overlaps, comb=' '):
    """
    make front-padded overlapping sentences
    """
    if num_overlaps < 1:
        raise Exception('num_overlaps must be >= 1')
    out = ['PAD', ] * min(num_overlaps - 1, len(lines))
    for ii in range(len(lines) - num_overlaps + 1):
        out.append(comb.join(lines[ii:ii + num_overlaps]))
    return out


def preprocess_line(line):
    line = line.strip()
    if len(line) == 0:
        line = 'BLANK_LINE'
    return line


def _main():
    parser = argparse.ArgumentParser(description='Create text file containing overlapping sentences.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i', '--input', type=str,
                        help='input directory.')
    parser.add_argument('-o', '--output', type=str,
                        help='output text file containing overlapping sentences')
    parser.add_argument('-l', '--language', type=str,
                        help='language code')
    parser.add_argument('-n', '--num_overlaps', type=int, default=4,
                        help='Maximum number of allowed overlaps.')
    args = parser.parse_args()
    go(output_file=args.output,
       input_dir=args.input,
       num_overlaps=args.num_overlaps,
       lang=args.language)


if __name__ == '__main__':
    _main()
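
The core of this script is layer(): for a given overlap size it maps the N input sentences to N candidate lines, front-padding with 'PAD' so that position i holds the window ending at sentence i (or 'PAD' when there are not enough preceding sentences); yield_overlaps() then emits these layers for every size from 1 to num_overlaps, and go() deduplicates and sorts them. A toy sketch of one layer, re-implemented standalone for illustration:

# Front-padded windows for overlap size 2 over three toy sentences,
# mirroring what layer(lines, 2) returns.
sents = ['A.', 'B.', 'C.']
n = 2
out = ['PAD'] * min(n - 1, len(sents))
out += [' '.join(sents[i:i + n]) for i in range(len(sents) - n + 1)]
print(out)  # ['PAD', 'A. B.', 'B. C.']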