Files
bertalign/bin/bleualign/bleualign/align.py
2021-05-17 23:44:19 +08:00

1184 lines
46 KiB
Python

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright © 2010 University of Zürich
# Author: Rico Sennrich <sennrich@cl.uzh.ch>
# For licensing information, see LICENSE
from __future__ import division,print_function,unicode_literals
import sys
import time
import math
from operator import itemgetter
from bleualign.gale_church import align_texts
import bleualign.score as bleu
from bleualign.utils import evaluate, finalevaluation
import io
import platform
# Multiprocessing requires Python >= 2.6 and is disabled on Windows
# (the worker model used below is not supported there).
multiprocessing_enabled = 0
if platform.system() != "Windows" and sys.version_info >= (2, 6):
    import multiprocessing
    multiprocessing_enabled = 1
def collect_article(src, srctotarget, target, targettosrc, options):
    """Yield one article at a time from the parallel input streams.

    Articles are delimited by options['end_of_article_marker'] lines.
    Each yielded item is (sourcelist, targetlist, translist1, translist2),
    where translist1/translist2 hold one sentence list per translation
    stream.  In factored mode, source/target sentences are stored as
    (surface_form, full_line) tuples.
    """
    finished = False
    marker = options['end_of_article_marker']
    factored = options['factored']
    while not finished:
        texts = []
        trans_per_side = []
        for stream, trans_streams in ((src, srctotarget), (target, targettosrc)):
            sentences = []
            trans_sentences = [[] for _ in trans_streams]
            for line in stream:
                if line.rstrip() == marker:
                    # consume the marker line in each translation stream too
                    for trans in trans_streams:
                        trans.readline()
                    break
                for idx, trans in enumerate(trans_streams):
                    trans_sentences[idx].append(trans.readline().rstrip())
                if factored:
                    surface = ' '.join(word.split('|')[0] for word in line.split())
                    sentences.append((surface, line.rstrip()))
                else:
                    sentences.append(line.rstrip())
            else:
                # stream exhausted without a marker: stop after this round
                finished = True
            texts.append(sentences)
            trans_per_side.append(trans_sentences)
        sourcelist, targetlist = texts
        translist1, translist2 = trans_per_side
        yield sourcelist, targetlist, translist1, translist2
def tasks_producer(tasks, num_tasks, data, num_processes):
    """Put every article to be aligned into the shared *tasks* queue.

    Best run in its own process: the queue size is bounded for memory
    reasons, so put() may block.  *num_tasks* is a shared counter the
    consumer side uses to know how many results to expect.
    """
    for article_id, job in enumerate(collect_article(*data)):
        num_tasks.value += 1
        tasks.put((article_id, job), True)
    # one poison pill per consumer process so every worker terminates
    for _ in range(num_processes):
        tasks.put((None, None))
    # only decremented when production is complete: consumers finish
    # once all announced tasks are done
    num_tasks.value -= 1
class Aligner:
# Default configuration; user-supplied options are merged over this in __init__.
default_options = {
    #source and target files needed by Aligner
    #they can be filenames, arrays of strings or io objects.
    'srcfile':None, 'targetfile': None,
    #the format of srcfile and targetfile:
    #False for normal text, True for 'text | other information', separated by '|'
    'factored': False,
    #translations of srcfile and targetfile, not influenced by 'factored'
    #they can be filenames, arrays of strings or io objects, too.
    'srctotarget': [], 'targettosrc': [],
    #run aligner without srctotarget and targettosrc
    'no_translation_override':False,
    #only consider target sentences for bleu-based alignment that are among top N alternatives for a given source sentence
    'maxalternatives':3,
    #bleu scoring algorithm works with 4-grams by default. We got better results when using 2-grams (since there are fewer 0 scores then)
    'bleu_ngrams' : 2,
    #BLEU is word-based by default, but character-level BLEU is more suitable for some languages, e.g. continuous-script languages without spaces.
    #it is a good idea to also increase bleu_ngrams when switching to character-level BLEU
    'bleu_charlevel' : False,
    #consider N to 1 (and 1 to N) alignment in gapfilling (complexity is size_of_gap*value^2, so don't turn this unnecessarily high)
    #also, there are potential precision issues.
    #set to 1 to disable bleu-based 1 to N alignments and let gale & church fill the gaps
    'Nto1' : 2,
    #do only gale-church, no bleualign
    'galechurch': None,
    #gapfillheuristics: what to do with sentences that aren't aligned one-to-one by the first BLEU pass, nor have a 1 to N alignment validated by BLEU?
    #possible members are: bleu1to1, galechurch
    #what they do is commented in the source code
    'gapfillheuristics' : ["bleu1to1","galechurch"],
    #defines the string that identifies hard boundaries (articles, chapters etc.)
    #string needs to be on a line of its own (see examples in eval directory)
    #must be reliable (article i in the source text needs to correspond to article i in the target text)
    'end_of_article_marker' : ".EOA",
    #filtering out bad alignments by bleuscore
    #'filter' is either 'sentences' or 'articles' (or None for no filtering)
    #filterthreshold means keeping the best X% of alignments (according to BLEU)
    #bleuthreshold requires a sentence pair to achieve a certain BLEU score to be included in the output
    #set filterlang True when you want to filter out alignments whose source is more similar to the target than to the translation
    'filter': None, 'filterthreshold': 90, 'bleuthreshold': 0, 'filterlang': None,
    #also print unaligned pairs (zero-to-one or one-to-zero pairs)
    'printempty': False,
    #set 'output' as a basename for the four output files (suffixes are added automatically),
    #or pass filenames / io objects for each of them individually.
    #if nothing is passed (or None), a StringIO is used to hold the results.
    'output': None,
    'output-src': None, 'output-target': None,
    'output-src-bad': None, 'output-target-bad': None,
    #the gold alignment of the corpus, for evaluation
    'eval': None,
    #defines amount of debugging output.
    'verbosity': 1, 'log_to':sys.stdout,
    #number of parallel processes
    'num_processes': 1
    }
def __init__(self,options):
    """Set up input/output streams from *options* (merged over default_options).

    Raises ValueError if source/target files are missing, or if no
    translation is given and 'no_translation_override' is not set.
    The close_* flags record which streams were opened here (and must
    be closed by this object) versus supplied by the caller.
    """
    # streams and bookkeeping, populated below / during alignment
    self.src, self.target = None,None
    self.srctotarget, self.targettosrc= [],[]
    self.out1, self.out2, self.out_bad1, self.out_bad2 = None,None,None,None
    self.sources_out,self.targets_out = [],[]
    self.finalbleu = []
    self.bleualign = []
    self.close_src, self.close_target = False, False
    self.close_srctotarget, self.close_targettosrc = [], []
    self.close_out1, self.close_out2 = False, False
    self.close_out_bad1, self.close_out_bad2 = False, False
    # user options override the class-level defaults
    self.options = self.default_options.copy()
    self.options.update(options)
    if not self.options['srcfile']:
        raise ValueError('Source file not specified.')
    if not self.options['targetfile']:
        raise ValueError('Target file not specified.')
    if not self.options['srctotarget'] and not self.options['targettosrc']\
            and not self.options['no_translation_override']:
        raise ValueError("ERROR: no translation available: BLEU scores can be computed between the source and target text, but this is not the intended usage of Bleualign and may result in poor performance! If you're *really* sure that this is what you want, set 'galechurch' for the options.")
    # each parameter may be a filename, an io object, or a string array
    self.src, self.close_src = \
        self._inputObjectFromParameter(self.options['srcfile'])
    self.target, self.close_target = \
        self._inputObjectFromParameter(self.options['targetfile'])
    for f in self.options['srctotarget']:
        obj, close_obj = \
            self._inputObjectFromParameter(f)
        self.srctotarget.append(obj)
        self.close_srctotarget.append(close_obj)
    for f in self.options['targettosrc']:
        obj, close_obj = \
            self._inputObjectFromParameter(f)
        self.targettosrc.append(obj)
        self.close_targettosrc.append(close_obj)
    # outputs: explicit file/object, or derived from 'output' + suffix,
    # or an in-memory StringIO
    self.out1,self.close_out1=self._outputObjectFromParameter(
        self.options['output-src'], self.options['output'], '-s')
    self.out2,self.close_out2=self._outputObjectFromParameter(
        self.options['output-target'], self.options['output'], '-t')
    if self.options['filter']:
        self.out_bad1,self.close_out_bad1=self._outputObjectFromParameter(
            self.options['output-src-bad'], self.options['output'], '-bad-s')
        self.out_bad2,self.close_out_bad2=self._outputObjectFromParameter(
            self.options['output-target-bad'], self.options['output'], '-bad-t')
# for passing by string array
def _stringArray2stringIo(self, stringArray):
return io.StringIO('\n'.join([line.rstrip() for line in stringArray]))
# parameter may be filename, IO object or string array
def _inputObjectFromParameter(self, parameter):
try:
inputObject = io.open(parameter, 'r', encoding='UTF-8')
close_object = True
except:
if isinstance(parameter, io.TextIOBase):
inputObject = parameter
else:
inputObject = self._stringArray2stringIo(parameter)
close_object = False
return inputObject, close_object
# parameter may be filename, IO object or string array
def _outputObjectFromParameter(self, parameter, filename, suffix):
close_object = False
if parameter:
try:
outputObject = io.open(parameter, 'w', encoding='UTF-8')
close_object = True
except:
outputObject = parameter
elif filename:
outputObject = io.open(filename + suffix, 'w', encoding='UTF-8')
else:
outputObject = io.StringIO()
return outputObject, close_object
#takes care of multiprocessing; calls process() function for each article
def mainloop(self):
    """Align all articles, write results, and return the two output objects.

    With multiprocessing enabled and num_processes > 1, articles are
    produced into a bounded queue and consumed by worker processes
    (AlignMultiprocessed, defined later in this file); otherwise each
    article is processed sequentially in this process.
    """
    results = {}
    if multiprocessing_enabled and self.options['num_processes'] > 1:
        tasks = multiprocessing.Queue(self.options['num_processes']+1)
        manager = multiprocessing.Manager()
        scores = manager.dict()
        # starts at 1 so the consumer loop below cannot exit before the
        # producer has finished announcing tasks (producer decrements by
        # 1 when done)
        num_tasks = manager.Value('i',1)
        scorers = [AlignMultiprocessed(tasks,self.options,scores,self.log) for i in range(self.options['num_processes'])]
        for p in scorers:
            p.start()
        #this function produces the alignment tasks for the consumers in scorers
        producer = multiprocessing.Process(target=tasks_producer,args=(tasks,num_tasks,(self.src,self.srctotarget,self.target,self.targettosrc,self.options),self.options['num_processes']))
        producer.start()
        i = 0
        #get results from processed articles and call printout function
        while i < num_tasks.value:
            #wait till result #i is populated
            while True:
                try:
                    data,multialign,bleualign,scoredict = scores[i]
                    break
                except:
                    # result not ready yet (KeyError on the shared dict);
                    # poll, but abort everything if a worker died
                    time.sleep(0.1)
                    for p in scorers:
                        if p.exitcode == 1:
                            for p in scorers:
                                p.terminate()
                            producer.terminate()
                            raise RuntimeError("Multiprocessing error")
                    continue
            (sourcelist,targetlist,translist1,translist2) = data
            self.scoredict = scoredict
            self.multialign = multialign
            self.bleualign = bleualign
            #normal case: translation from source to target exists
            if translist1:
                translist = translist1[0]
            #no translation provided. we copy source sentences for further processing
            else:
                if self.options['factored']:
                    translist = [item[0] for item in sourcelist]
                else:
                    translist = sourcelist
            self.printout(sourcelist, translist, targetlist)
            if self.options['eval']:
                self.log('evaluation ' + str(i))
                results[i] = evaluate(self.options,self.multialign,self.options['eval'][i],self.log)
            # free the shared-dict slot once consumed
            del(scores[i])
            i += 1
    else:
        # sequential path: read, process and print one article at a time
        for i,(sourcelist,targetlist,translist1,translist2) in enumerate(collect_article(self.src,self.srctotarget,self.target,self.targettosrc,self.options)):
            self.log('reading in article ' + str(i) + ': ',1)
            self.multialign = self.process(sourcelist,targetlist,translist1,translist2)
            if translist1:
                translist = translist1[0]
            else:
                if self.options['factored']:
                    translist = [item[0] for item in sourcelist]
                else:
                    translist = sourcelist
            self.printout(sourcelist, translist, targetlist)
            if self.options['eval']:
                self.log('evaluation ' + str(i))
                results[i] = evaluate(self.options, self.multialign,self.options['eval'][i],self.log)
    if self.out1:
        self.out1.flush()
    if self.out2:
        self.out2.flush()
    if self.options['eval']:
        finalevaluation(results, self.log)
    if self.options['filter']:
        self.write_filtered()
    self.close_file_streams()
    return self.out1,self.out2
#results of alignment or good aligment if filtering
def results(self):
return self.out1,self.out2
#bad aligment for filtering. Otherwise, None
def results_bad(self):
return self.out_bad1,self.out_bad2
#Start different alignment runs depending on which and how many translations are sent to program; intersect results.
def process(self,sourcelist,targetlist,translist1,translist2):
    """Align one article and return the resulting alignment list.

    Runs one BLEU alignment pass per available translation (both
    directions), intersects all passes, and returns a list of
    ((src_ids, target_ids), type_label) pairs.  Returns [] for an
    empty article.
    """
    multialign = []
    phase1 = []
    phase2 = []
    #do nothing if last line in file is .EOA or file is empty.
    if not targetlist or not sourcelist:
        self.log('WARNING: article is empty. Skipping.',0)
        return []
    self.log('processing',1)
    # in factored mode, align on the surface form (first factor) only
    if self.options['factored']:
        raw_sourcelist = [item[0] for item in sourcelist]
        raw_targetlist = [item[0] for item in targetlist]
    else:
        raw_sourcelist = sourcelist
        raw_targetlist = targetlist
    for i,translist in enumerate(translist1):
        self.log("computing alignment between srctotarget (file " + str(i) + ") and target text",1)
        phase1.append(self.align(translist, raw_targetlist))
    for i,translist in enumerate(translist2):
        self.log("computing alignment between targettosrc (file " + str(i) + ") and source text",1)
        phase2.append(self.align(translist, raw_sourcelist))
    # no translation at all: only allowed with an explicit override or
    # pure Gale & Church mode; then source is compared to target directly
    if not (translist1 or translist2):
        if self.options['no_translation_override'] or self.options['galechurch']:
            phase1 = [self.align(raw_sourcelist, raw_targetlist)]
        else:
            self.log("ERROR: no translation available", 1)
            # in a worker process we can only exit; otherwise raise
            if multiprocessing_enabled and self.options['num_processes'] > 1:
                sys.exit(1)
            else:
                raise RuntimeError("ERROR: no translation available")
    # intersect multiple runs per direction (keep only pairs found by all)
    if len(phase1) > 1:
        self.log("intersecting all srctotarget alignments",1)
        phase1 = sorted(set(phase1[0]).intersection(*[set(x) for x in phase1[1:]]))
    elif phase1:
        phase1 = phase1[0]
    if len(phase2) > 1:
        self.log("intersecting all targettosrc alignments",1)
        phase2 = sorted(set(phase2[0]).intersection(*[set(x) for x in phase2[1:]]))
    elif phase2:
        phase2 = phase2[0]
    # combine the two directions; phase2 pairs are mirrored (target,source)
    if phase1 and phase2:
        self.log("intersecting both directions",1)
        phase3 = []
        phase2mirror = [(j,k) for ((k,j),t) in phase2]
        for pair,t in phase1:
            if pair in phase2mirror:
                phase3.append((pair,'INTERSECT: ' + t + ' - ' + phase2[phase2mirror.index(pair)][1]))
        multialign = phase3
    elif phase1:
        multialign = phase1
    elif phase2:
        multialign = [((j,k),t) for ((k,j),t) in phase2]
    return multialign
#Compute alignment for one article and one automatic translation.
def align(self, translist, targetlist):
    """Align one translated article against the target article.

    In 'galechurch' mode, uses pure length-based alignment; otherwise
    runs BLEU scoring, the 1-to-1 dynamic-programming path search, and
    gap filling.  Sets self.multialign / self.bleualign / self.scoredict
    as side effects and returns self.multialign.
    """
    if self.options["galechurch"]:
        self.multialign,self.bleualign,self.scoredict = [],[],{}
        # gale_church expects (ID, sentence) tuples
        translist = [item for item in enumerate(translist)]
        targetlist = [item for item in enumerate(targetlist)]
        churchaligns = self.gale_church(translist,targetlist)
        for src,target in churchaligns:
            self.addtoAlignments((src,target),'GALECHURCH')
        return self.multialign
    else:
        self.log('Evaluating sentences with bleu',1)
        self.scoredict = self.eval_sents(translist,targetlist)
        self.log('finished',1)
        self.log('searching for longest path of good alignments',1)
        self.pathfinder(translist, targetlist)
        self.log('finished',1)
        self.log(time.asctime(),2)
        self.log('filling gaps',1)
        self.gapfinder(translist, targetlist)
        self.log('finished',1)
        self.log(time.asctime(),2)
        return self.multialign
#use this if you want to implement your own similarity score
def eval_sents_dummy(self,translist,targetlist):
scoredict = {}
for testID,testSent in enumerate(translist):
scores = []
for refID,refSent in enumerate(targetlist):
score = 100-abs(len(testSent)-len(refSent)) #replace this with your own similarity score
if score > 0:
scores.append((score,refID,score))
scoredict[testID] = sorted(scores,key=itemgetter(0),reverse=True)[:self.options['maxalternatives']]
return scoredict
# given list of test sentences and list of reference sentences, calculate bleu scores
#if you want to replace bleu with your own similarity measure, use eval_sents_dummy
def eval_sents(self,translist,targetlist):
    """Score every translated sentence against every target sentence with BLEU.

    Returns testID -> list of up to 'maxalternatives'
    (score, refID, correct_counts) tuples, sorted by descending score,
    where score is the harmonic mean of the two directed BLEU scores.
    """
    scoredict = {}
    cooked_test = {}
    cooked_test2 = {}
    ngrams = self.options['bleu_ngrams']
    charlevel = self.options['bleu_charlevel']
    # pre-"cook" each reference once; identical sentences share the
    # cooked representation via this cache
    cooktarget_cache = {}
    cooktarget = []
    for idx, item in enumerate(targetlist):
        if charlevel:
            # character-level BLEU: treat the sentence as a char sequence
            item = tuple(item)
        if item in cooktarget_cache:
            cooktarget.append((idx, cooktarget_cache[item]))
        else:
            cooked = (idx, bleu.cook_ref_set(item, ngrams))
            cooktarget.append(cooked)
            cooktarget_cache[item] = cooked[1]
    for testID,testSent in enumerate(translist):
        if charlevel:
            testSent = tuple(testSent)
        #copied over from bleu.py to minimize redundancy
        test_normalized = bleu.normalize(testSent)
        cooked_test["testlen"] = len(test_normalized)
        cooked_test["guess"] = [max(len(test_normalized)-k+1,0) for k in range(1,self.options['bleu_ngrams']+1)]
        counts = bleu.count_ngrams(test_normalized, self.options['bleu_ngrams'])
        #separate by n-gram length. if we have no matching bigrams, we don't have to compare unigrams
        ngrams_sorted = dict([(x,set()) for x in range(self.options['bleu_ngrams'])])
        for ngram in counts:
            ngrams_sorted[len(ngram)-1].add(ngram)
        scorelist = []
        # per-test cache of results, keyed by the reference's n-gram set.
        # NOTE(review): references that share an n-gram set but differ in
        # length would share a cache entry — presumably rare; verify.
        scorelist_cache = {}
        for (refID,(reflen, refmaxcounts, refset)) in cooktarget:
            if refset in scorelist_cache:
                if scorelist_cache[refset] is not None:
                    m, c = scorelist_cache[refset]
                    scorelist.append((m, refID, c))
                continue
            # only references sharing at least one top-order n-gram with
            # the test sentence can score > 0
            ngrams_filtered = ngrams_sorted[self.options['bleu_ngrams']-1].intersection(refset)
            if ngrams_filtered:
                cooked_test["reflen"] = reflen
                cooked_test['correct'] = [0]*self.options['bleu_ngrams']
                for ngram in ngrams_filtered:
                    cooked_test["correct"][self.options['bleu_ngrams']-1] += min(refmaxcounts[ngram], counts[ngram])
                for order in range(self.options['bleu_ngrams']-1):
                    for ngram in ngrams_sorted[order].intersection(refset):
                        cooked_test["correct"][order] += min(refmaxcounts[ngram], counts[ngram])
                #copied over from bleu.py to minimize redundancy
                logbleu = 0.0
                for k in range(self.options['bleu_ngrams']):
                    logbleu += math.log(cooked_test['correct'][k])-math.log(cooked_test['guess'][k])
                logbleu /= self.options['bleu_ngrams']
                # brevity penalty
                logbleu += min(0,1-float(cooked_test['reflen'])/cooked_test['testlen'])
                score = math.exp(logbleu)
                if score > 0:
                    #calculate bleu score in reverse direction
                    cooked_test2["guess"] = [max(cooked_test['reflen']-k+1,0) for k in range(1,self.options['bleu_ngrams']+1)]
                    logbleu = 0.0
                    for k in range(self.options['bleu_ngrams']):
                        logbleu += math.log(cooked_test['correct'][k])-math.log(cooked_test2['guess'][k])
                    logbleu /= self.options['bleu_ngrams']
                    logbleu += min(0,1-float(cooked_test['testlen'])/cooked_test['reflen'])
                    score2 = math.exp(logbleu)
                    # harmonic mean of the two directed scores
                    meanscore = (2*score*score2)/(score+score2)
                    scorelist.append((meanscore,refID,cooked_test['correct']))
                    scorelist_cache[refset] = (meanscore, cooked_test['correct'])
                else:
                    scorelist_cache[refset] = None
            else:
                scorelist_cache[refset] = None
        scoredict[testID] = sorted(scorelist,key=itemgetter(0),reverse=True)[:self.options['maxalternatives']]
    return scoredict
def extract_best_path(self,pointers):
    """Follow the backpointers of the DP matrix to extract the best
    path of 1-to-1 alignments, returned as (i, j) pairs in ascending
    order.  Only 'match' cells contribute to the path."""
    path = []
    i = len(pointers) - 1
    j = len(pointers[0]) - 1
    while i >= 0 and j >= 0:
        direction = pointers[i][j]
        if direction == 'match':
            path.append((i, j))
            i, j = i - 1, j - 1
        elif direction == '^':
            i -= 1
        elif direction == '<':
            j -= 1
    return path[::-1]
#dynamic programming search for best path of alignments (maximal score)
def pathfinder(self, translist, targetlist):
    """Find the maximal-score monotone path of 1-to-1 alignments.

    Uses self.scoredict (from eval_sents) as the match scores and
    stores the resulting list of (src_id, target_id) pairs in
    self.bleualign.
    """
    # add an extra row/column to the matrix and start filling it from 1,1 (to avoid exceptions for first row/column)
    matrix = [[0 for column in range(len(targetlist)+1)] for row in range(len(translist)+1)]
    pointers = [['' for column in range(len(targetlist))] for row in range(len(translist))]
    for i in range(len(translist)):
        # candidate targets (and their scores) for source sentence i
        alignments = dict([(target, score) for (score, target, correct) in self.scoredict[i]])
        for j in range(len(targetlist)):
            # default: skip source sentence i
            best_score = matrix[i][j+1]
            best_pointer = '^'
            # skip target sentence j
            score = matrix[i+1][j]
            if score > best_score:
                best_score = score
                best_pointer = '<'
            # align i and j if that beats both skips
            if j in alignments:
                score = alignments[j] + matrix[i][j]
                if score > best_score:
                    best_score = score
                    best_pointer = 'match'
            matrix[i+1][j+1] = best_score
            pointers[i][j] = best_pointer
    self.bleualign = self.extract_best_path(pointers)
#find unaligned sentences and create work packets for gapfiller()
#gapfiller() takes two sentence pairs and all unaligned sentences in between as arguments; gapfinder() extracts these.
def gapfinder(self, translist, targetlist):
    """Walk the 1-to-1 BLEU alignment and dispatch every gap to gapfiller().

    Rebuilds self.multialign from scratch: aligned pairs are added
    directly, stretches of unaligned sentences (on either side) are
    handed to gapfiller() together with their surrounding anchor pairs.
    """
    self.multialign = []
    #find gaps: lastpair is considered pre-gap, pair is post-gap
    lastpair = ((),())
    src, target = None, None
    for src,target in self.bleualign:
        oldsrc, oldtarget = lastpair
        #in first iteration, gap will start at 0
        if not oldsrc:
            oldsrc = (-1,)
        if not oldtarget:
            oldtarget = (-1,)
        #identify gap sizes
        sourcegap = list(range(oldsrc[-1]+1,src))
        targetgap = list(range(oldtarget[-1]+1,target))
        if targetgap or sourcegap:
            # gapfiller adds lastpair itself and returns the (possibly
            # grown) post-gap pair as the new anchor
            lastpair = self.gapfiller(sourcegap, targetgap, lastpair, ((src,),(target,)), translist, targetlist)
        else:
            self.addtoAlignments(lastpair)
            lastpair = ((src,),(target,))
    #if self.bleualign is empty, gap will start at 0
    if src is None:
        src = -1
    if target is None:
        target = -1
    #search for gap after last alignment pair
    sourcegap = list(range(src+1, len(translist)))
    targetgap = list(range(target+1, len(targetlist)))
    if targetgap or sourcegap:
        lastpair = self.gapfiller(sourcegap, targetgap, lastpair, ((),()), translist, targetlist)
    self.addtoAlignments(lastpair)
#apply heuristics to align all sentences that remain unaligned after finding best path of 1-to-1 alignments
#heuristics include bleu-based 1-to-n alignment and length-based alignment
def gapfiller(self, sourcegap, targetgap, pregap, postgap, translist, targetlist):
    """Align the unaligned sentences between two anchor pairs.

    sourcegap/targetgap are lists of unaligned sentence indices between
    the anchors; pregap/postgap are ((src_ids),(target_ids)) anchor
    pairs (empty tuples at article boundaries).  Tries to grow the
    anchors into 1-to-N groups when BLEU supports it, then applies the
    configured gapfill heuristics ('bleu1to1', 'galechurch') to what
    remains.  Adds results via addtoAlignments() and returns the
    (possibly grown) postgap pair as the next anchor.
    """
    evalsrc = []
    evaltarget = []
    #compile list of sentences in gap that will be considered for BLEU comparison
    if self.options['Nto1'] > 1 or "bleu1to1" in self.options['gapfillheuristics']:
        #concatenate all sentences in pregap alignment pair
        tmpstr = ' '.join([translist[i] for i in pregap[0]])
        evalsrc.append((pregap[0],tmpstr))
        #concatenate all sentences in pregap alignment pair
        tmpstr = ' '.join([targetlist[i] for i in pregap[1]])
        evaltarget.append((pregap[1],tmpstr))
        #search will be pruned to this window
        if "bleu1to1" in self.options['gapfillheuristics']:
            window = 10 + self.options['Nto1']
        else:
            window = self.options['Nto1']
        # only the first and last *window* sentences of each gap are scored
        for src in [j for i,j in enumerate(sourcegap) if (i < window or len(sourcegap)-i <= window)]:
            Sent = translist[src]
            evalsrc.append(((src,),Sent))
        for target in [j for i,j in enumerate(targetgap) if (i < window or len(targetgap)-i <= window)]:
            Sent = targetlist[target]
            evaltarget.append(((target,),Sent))
        #concatenate all sentences in postgap alignment pair
        tmpstr = ' '.join([translist[i] for i in postgap[0]])
        evalsrc.append((postgap[0],tmpstr))
        #concatenate all sentences in postgap alignment pair
        tmpstr = ' '.join([targetlist[i] for i in postgap[1]])
        evaltarget.append((postgap[1],tmpstr))
        # additionally score all N-sentence concatenations up to Nto1
        nSrc = {}
        for n in range(2,self.options['Nto1']+1):
            nSrc[n] = self.createNSents(evalsrc,n)
        for n in range(2,self.options['Nto1']+1):
            evalsrc += nSrc[n]
        nTar = {}
        for n in range(2,self.options['Nto1']+1):
            nTar[n] = self.createNSents(evaltarget,n)
        for n in range(2,self.options['Nto1']+1):
            evaltarget += nTar[n]
        evalsrc_raw = [item[1] for item in evalsrc]
        evaltarget_raw = [item[1] for item in evaltarget]
        scoredict_raw = self.eval_sents(evalsrc_raw,evaltarget_raw)
        # re-key the score dict from positional indices to ID tuples
        scoredict = {}
        for src,value in list(scoredict_raw.items()):
            src = evalsrc[src][0]
            if value:
                newlist = []
                for item in value:
                    score,target,score2 = item
                    target = evaltarget[target][0]
                    newlist.append((score,target,score2))
                scoredict[src] = newlist
            else:
                scoredict[src] = []
    # shrink the gap step by step; every branch either makes progress
    # (continue) or gives up (break)
    while sourcegap or targetgap:
        pregapsrc,pregaptarget = pregap
        postgapsrc,postgaptarget = postgap
        if sourcegap and self.options['Nto1'] > 1:
            #try if concatenating source sentences together improves bleu score (beginning of gap)
            if pregapsrc:
                oldscore,oldtarget,oldcorrect = scoredict[pregapsrc][0]
                combinedID = tuple(list(pregapsrc)+[sourcegap[0]])
                if combinedID in scoredict:
                    newscore,newtarget,newcorrect = scoredict[combinedID][0]
                    if newscore > oldscore and newcorrect > oldcorrect and newtarget == pregaptarget:
                        #print('\nsource side: ' + str(combinedID) + ' better than ' + str(pregapsrc))
                        pregap = (combinedID,pregaptarget)
                        sourcegap.pop(0)
                        continue
            #try if concatenating source sentences together improves bleu score (end of gap)
            if postgapsrc:
                oldscore,oldtarget,oldcorrect = scoredict[postgapsrc][0]
                combinedID = tuple([sourcegap[-1]] + list(postgapsrc))
                if combinedID in scoredict:
                    newscore,newtarget, newcorrect = scoredict[combinedID][0]
                    if newscore > oldscore and newcorrect > oldcorrect and newtarget == postgaptarget:
                        #print('\nsource side: ' + str(combinedID) + ' better than ' + str(postgapsrc))
                        postgap = (combinedID,postgaptarget)
                        sourcegap.pop()
                        continue
        if targetgap and self.options['Nto1'] > 1:
            #try if concatenating target sentences together improves bleu score (beginning of gap)
            if pregapsrc:
                newscore,newtarget,newcorrect = scoredict[pregapsrc][0]
                if newtarget != pregaptarget and newtarget != postgaptarget:
                    #print('\ntarget side: ' + str(newtarget) + ' better than ' + str(pregaptarget))
                    pregap = (pregapsrc,newtarget)
                    for i in newtarget:
                        if i in targetgap:
                            del(targetgap[targetgap.index(i)])
                    continue
            #try if concatenating target sentences together improves bleu score (end of gap)
            if postgapsrc:
                newscore,newtarget,newcorrect = scoredict[postgapsrc][0]
                if newtarget != postgaptarget and newtarget != pregaptarget:
                    #print('\ntarget side: ' + str(newtarget) + ' better than ' + str(postgaptarget))
                    postgap = (postgapsrc,newtarget)
                    for i in newtarget:
                        if i in targetgap:
                            del(targetgap[targetgap.index(i)])
                    continue
        #concatenation didn't help, and we still have possible one-to-one alignments
        if sourcegap and targetgap:
            #align first two sentences if BLEU validates this
            if "bleu1to1" in self.options['gapfillheuristics']:
                try:
                    besttarget = scoredict[(sourcegap[0],)][0][1]
                except:
                    # no score available for this sentence
                    besttarget = 0
                if besttarget == (targetgap[0],):
                    self.addtoAlignments(pregap)
                    #print('\none-to-one: ' + str((sourcegap[0],)) + ' to' + str((targetgap[0],)))
                    pregap = ((sourcegap[0],),besttarget)
                    del(sourcegap[0])
                    del(targetgap[0])
                    continue
            #Alternative approach: use Gale & Church.
            # only for small or reasonably balanced gaps
            if "galechurch" in self.options['gapfillheuristics'] and (max(len(targetgap),len(sourcegap))<4 or max(len(targetgap),len(sourcegap))/min(len(targetgap),len(sourcegap)) < 2):
                tempsrcgap = []
                for src in sourcegap:
                    tempsrcgap.append((src,translist[src]))
                temptargetgap = []
                for target in targetgap:
                    temptargetgap.append((target,targetlist[target]))
                churchaligns = self.gale_church(tempsrcgap,temptargetgap)
                for src,target in churchaligns:
                    self.addtoAlignments((src,target),'GALECHURCH')
                break
            #no valid gapfiller left. break loop and ignore remaining gap
            break
        # gap only on one side and no N-to-1 improvement possible: give up
        break
    if not pregap in [i[0] for i in self.multialign]:
        self.addtoAlignments(pregap)
    return postgap
#Take list of (ID,Sentence) tuples for two language pairs and calculate Church & Gale alignment
#Then transform it into this program's alignment format
def gale_church(self,tempsrcgap,temptargetgap):
    """Length-based (Gale & Church) alignment of two (ID, sentence) lists.

    Returns a list of ((src_ids), (target_ids)) tuples, with 1-to-N
    alignments collapsed into single pairs of ID tuples.
    """
    #get sentence lengths in characters
    srclengths = [[len(i[1].strip()) for i in tempsrcgap]]
    targetlengths = [[len(i[1].strip()) for i in temptargetgap]]
    #call gale & church algorithm
    pairs = sorted(list((align_texts(srclengths, targetlengths)[0])), key=itemgetter(0))
    # idict/jdict record which positional indices are already aligned
    idict = {}
    jdict = {}
    newpairs = []
    #store 1-to-n alignments in single pairs of tuples (instead of using multiple pairs of ints)
    for i,j in pairs:
        if i in idict and j in jdict:
            # both sides already seen: merge the two groups they belong to
            done = 0
            for iold1, jold1 in newpairs:
                if done:
                    break
                if i in iold1:
                    for iold2, jold2 in newpairs:
                        if done:
                            break
                        if j in jold2:
                            if not (iold1,jold1) == (iold2,jold2):
                                del(newpairs[newpairs.index((iold1,jold1))])
                                del(newpairs[newpairs.index((iold2,jold2))])
                                inew = tuple(sorted(list(iold1)+list(iold2)))
                                jnew = tuple(sorted(list(jold1)+list(jold2)))
                                newpairs.append((inew,jnew))
                            done = 1
                            break
        elif i in idict:
            # source index already aligned: extend its target side with j
            for iold, jold in newpairs:
                if i in iold:
                    jnew = tuple(sorted(list(jold)+[j]))
                    newpairs[newpairs.index((iold,jold))] = (iold,jnew)
                    jdict[j] = 0
                    break
        elif j in jdict:
            # target index already aligned: extend its source side with i
            for iold, jold in newpairs:
                if j in jold:
                    inew = tuple(sorted(list(iold)+[i]))
                    newpairs[newpairs.index((iold,jold))] = (inew,jold)
                    idict[i] = 0
                    break
        else:
            # fresh 1-to-1 pair
            idict[i] = 0
            jdict[j] = 0
            newpairs.append(((i,),(j,)))
    #Go from Church & Gale's numbering to our IDs
    outpairs = []
    for i,j in newpairs:
        srcID = []
        targetID = []
        for src in i:
            srcID.append(tempsrcgap[src][0])
        for target in j:
            targetID.append(temptargetgap[target][0])
        #print('\nChurch & Gale: ' + str(tuple(srcID)) + ' to ' + str(tuple(targetID)))
        outpairs.append((tuple(srcID),tuple(targetID)))
    return outpairs
def createNSents(self,l,n=2):
    """Slide a window of size *n* over a list of (IDs, sentence) tuples,
    merging each window into one (combined_IDs, joined_sentence) tuple."""
    merged = []
    for start in range(len(l) - n + 1):
        window = l[start:start + n]
        ids = tuple(sent_id for entry in window for sent_id in entry[0])
        text = " ".join(entry[1] for entry in window)
        merged.append((ids, text))
    return merged
def addtoAlignments(self,pair,aligntype=None):
if not (pair[0] and pair[1]):
return
if aligntype:
self.multialign.append((pair,aligntype))
else:
src,target = pair
if len(src) == 1 and len(target) == 1 and (src[0],target[0]) in self.bleualign:
self.multialign.append((pair,"BLEU"))
else:
self.multialign.append((pair,"GAPFILLER"))
def print_alignment_statistics(self, source_len, target_len):
    """Log per-article statistics about the 1-to-1 and gap-filled alignments.

    At verbosity >= 2, prints each source sentence's status with ANSI
    colors (green = aligned by BLEU, red = unaligned).
    """
    multialignsrccount = sum([len(i[0][0]) for i in self.multialign])
    multialigntargetcount = sum([len(i[0][1]) for i in self.multialign])
    self.log("Results of BLEU 1-to-1 alignment",2)
    if self.options['verbosity'] >= 2:
        bleualignsrc = list(map(itemgetter(0),self.bleualign))
        for sourceid in range(source_len):
            if sourceid in bleualignsrc:
                self.log('\033[92m' + str(sourceid) + ": "
                    + str(self.bleualign[bleualignsrc.index(sourceid)][1]) + '\033[1;m')
            else:
                bestcand = self.scoredict.get(sourceid,[])
                if bestcand:
                    bestcand = bestcand[0][1]
                self.log('\033[1;31m'+str(sourceid) + ": unaligned. best cand "
                    + str(bestcand)+'\033[1;m')
    # guard against division by zero on empty articles
    if source_len and target_len:
        self.log("\n" + str(len(self.bleualign)) + ' out of ' + str(source_len) + ' source sentences aligned by BLEU ' + str(100*len(self.bleualign)/float(source_len)) + '%',2)
        self.log("after gap filling, " + str(multialignsrccount) + ' out of '+ str(source_len) + ' source sentences aligned ' + str(100*multialignsrccount/float(source_len)) + '%',2)
        self.log("after gap filling, " + str(multialigntargetcount) + ' out of '+ str(target_len) + ' target sentences aligned ' + str(100*multialigntargetcount/float(target_len)) + '%',2)
#print out some debugging info, and print output to file
def printout(self, sourcelist, translist, targetlist):
    """Materialize the alignment of one article and write/queue the output.

    NOTE(review): in the non-factored branch this (local modification)
    writes the comma-joined sentence *indices* instead of the sentences
    themselves — the original sentence output is kept commented out below.
    """
    self.print_alignment_statistics(len(sourcelist), len(targetlist))
    sources = []
    translations = []
    targets = []
    sources_factored = []
    targets_factored = []
    # sources_output/targets_output alias the lists that will actually
    # be emitted (factored or plain)
    if self.options['factored']:
        sources_output = sources_factored
        targets_output = targets_factored
    else:
        sources_output = sources
        targets_output = targets
    self.multialign = sorted(self.multialign,key=itemgetter(0))
    sentscores = {}
    lastsrc,lasttarget = 0,0
    for j,(src,target) in enumerate([i[0] for i in self.multialign]):
        if self.options['printempty']:
            # also emit unaligned sentences, paired with empty lines
            if src[0] != lastsrc + 1:
                sources.extend([sourcelist[ID] for ID in range(lastsrc+1,src[0])])
                targets.extend(['' for ID in range(lastsrc+1,src[0])])
                translations.extend(['' for ID in range(lastsrc+1,src[0])])
            if target[0] != lasttarget + 1:
                sources.extend(['' for ID in range(lasttarget+1,target[0])])
                targets.extend([targetlist[ID] for ID in range(lasttarget+1,target[0])])
                translations.extend(['' for ID in range(lasttarget+1,target[0])])
            lastsrc = src[-1]
            lasttarget = target[-1]
        # 1-to-N groups are concatenated into a single output line
        translations.append(' '.join([translist[ID] for ID in src]))
        if self.options['factored']:
            sources.append(' '.join([sourcelist[ID][0] for ID in src]))
            targets.append(' '.join([targetlist[ID][0] for ID in target]))
            sources_factored.append(' '.join([sourcelist[ID][1] for ID in src]))
            targets_factored.append(' '.join([targetlist[ID][1] for ID in target]))
        else:
            #sources.append(' '.join([sourcelist[ID] for ID in src]))
            #targets.append(' '.join([targetlist[ID] for ID in target]))
            sources.append(','.join(["{}".format(ID) for ID in src]))
            targets.append(','.join(["{}".format(ID) for ID in target]))
        if self.options['filter'] == 'sentences':
            self.check_sentence_pair(j, sources[-1], translations[-1], targets[-1], sources_output[-1], targets_output[-1], sentscores)
    # with filtering enabled, output is queued for write_filtered()
    # instead of being written here
    if self.options['filter'] == 'sentences':
        self.filter_sentence_pairs(sentscores, sources_output, targets_output)
    if self.options['filter'] == 'articles':
        self.filter_article_pairs(sources, translations, targets, sources_output, targets_output)
    self.log("\nfinished with article",1)
    self.log("\n====================\n",1)
    if self.out1 and self.out2 and not self.options['filter']:
        if self.options['factored']:
            self.out1.write('\n'.join(sources_factored) + '\n')
            self.out2.write('\n'.join(targets_factored) + '\n')
        else:
            self.out1.write('\n'.join(sources) + '\n')
            self.out2.write('\n'.join(targets) + '\n')
def check_sentence_pair(self, j, src, trans, target, source_out, target_out, sentscores):
    """Score one aligned sentence pair with BLEU for later filtering.

    If 'filterlang' is set and the raw source scores higher against the
    target than the translation does, the pair is presumed untranslated
    and written to the bad-alignment outputs.  Otherwise the harmonic
    mean of forward and backward BLEU is stored in sentscores[j].
    """
    forward = self.score_article([trans], [target])
    src_vs_target = self.score_article([src], [target])
    if src_vs_target > forward and self.options['filterlang']:
        self.out_bad1.write(source_out + '\n')
        self.out_bad2.write(target_out + '\n')
        return
    if forward > 0:
        backward = self.score_article([target], [trans])
        sentscores[j] = (2 * forward * backward) / (forward + backward)
    else:
        sentscores[j] = 0
def score_article(self,test,ref):
    """Corpus-level BLEU of *test* against *ref* (parallel sentence lists)."""
    order = self.options['bleu_ngrams']
    cooked_refs = [bleu.cook_refs([ref_sent], order) for ref_sent in ref]
    cooked_tests = [bleu.cook_test(sent, cooked_refs[idx], order)
                    for idx, sent in enumerate(test)]
    return bleu.score_cooked(cooked_tests, order)
# store BLEU score for each sentence pair (used for filtering at the very end)
def filter_sentence_pairs(self, sentscores, sources_output, targets_output):
before = len(self.sources_out)
for j,(src,target) in enumerate([i[0] for i in self.multialign]):
if j in sentscores: # false if sentence pair has been filtered out by language filter
confidence = sentscores[j]
self.finalbleu.append((confidence,sentscores.get(j),before,before+1))
before += 1
self.sources_out.append(sources_output[j])
self.targets_out.append(targets_output[j])
# store BLEU score for each article pair (used for filtering at the very end)
def filter_article_pairs(self, sources, translations, targets, sources_output, targets_output):
articlescore = self.score_article(translations,targets)
articlescore2 = self.score_article(sources,targets)
self.log('\nBLEU score for article: ' + str(articlescore) + ' / ' + str(articlescore2),1)
if articlescore2 > articlescore and self.options['filterlang']:
if self.options['factored']:
sources,targets = sources_factored,targets_factored
for i,line in enumerate(sources):
self.out_bad1.write(line + '\n')
self.out_bad2.write(targets[i] + '\n')
else:
articlescorex = self.score_article(targets,translations)
if articlescore > 0:
articlescore = (articlescore*articlescorex*2)/(articlescore+articlescorex)
before = len(self.sources_out)
after = before + len(self.multialign)
self.finalbleu.append((articlescore,articlescore2,before,after))
self.sources_out += sources_output
self.targets_out += targets_output
#filter bad sentence pairs / article pairs
    def write_filtered(self):
        """Apply the relative and absolute BLEU filters, then write all pairs.

        self.finalbleu holds one (score, score2, start, end) entry per filtered
        unit (sentence pair or article pair), where start/end index into
        self.sources_out / self.targets_out. Two filters mark units as bad:

        1. relative: keep only the best options['filterthreshold'] percent of
           output lines (by BLEU, best-first);
        2. absolute: discard any unit whose BLEU score is below
           options['bleuthreshold'].

        Surviving lines go to out1/out2, discarded ones to out_bad1/out_bad2.
        """
        # sort best-first so a prefix of the list is the "good" portion
        self.finalbleu = sorted(self.finalbleu,key=itemgetter(0),reverse=True)
        self.log(self.finalbleu,2)
        # total number of output lines and length-weighted score sum,
        # used for the average-BLEU report and the percentile cutoff
        totallength=0
        totalscore=0
        for (articlescore,articlescore2,before,after) in self.finalbleu:
            length = after-before
            totallength += length
            totalscore += articlescore*length
        if totallength != 0:
            averagescore = totalscore/totallength
            self.log("The average BLEU score is: " + str(averagescore),1)
        # relative filter: walk the best-first list until filterthreshold
        # percent of all lines are covered; everything after that point is bad
        goodlength = totallength*self.options['filterthreshold']/float(100)
        totallength = 0
        bad_percentiles = []
        for i,(articlescore,articlescore2,before,after) in enumerate(self.finalbleu):
            length = after-before
            totallength += length
            if totallength > goodlength:
                bad_percentiles = self.finalbleu[i+1:]
                self.log("\nDiscarding the following " + self.options['filter'] + " based on relative BLEU\n",2)
                self.log(bad_percentiles,2)
                if self.options['verbosity'] >= 3:
                    # at high verbosity, dump every discarded line with its score
                    for score,score2,start,end in bad_percentiles:
                        for i in range(start,end):
                            self.log(score,3)
                            self.log(self.sources_out[i],3)
                            self.log(self.targets_out[i],3)
                            self.log('-----------------',3)
                break
        # output-line indices where a bad region starts (stop writing to the
        # good streams) and ends (resume writing to them)
        stopwrite = set([i[2] for i in bad_percentiles])
        resumewrite = set([i[3] for i in bad_percentiles])
        stopped = 0
        #absolute BLEU threshold
        if self.options['bleuthreshold']:
            bad_sentences = []
            for i,(articlescore,articlescore2,before,after) in enumerate(self.finalbleu):
                if articlescore < self.options['bleuthreshold']:
                    bad_sentences.append((articlescore,articlescore2,before,after))
                    stopwrite.add(before)
                    resumewrite.add(after)
            self.log("\nDiscarding the following " + self.options['filter'] + " based on absolute BLEU\n",2)
            self.log(bad_sentences,2)
            if self.options['verbosity'] >= 3:
                for score,score2,start,end in bad_sentences:
                    for i in range(start,end):
                        self.log(score,3)
                        self.log(self.sources_out[i],3)
                        self.log(self.targets_out[i],3)
                        self.log('-----------------',3)
        # single pass over all output lines, toggling between the good and bad
        # streams at the recorded boundaries; resume is checked before stop so
        # an index that ends one bad region and starts another stays stopped
        if self.out1 and self.out2 and self.out_bad1 and self.out_bad2:
            for i,line in enumerate(self.sources_out):
                if i in resumewrite:
                    stopped = 0
                if i in stopwrite:
                    stopped = 1
                if stopped:
                    self.out_bad1.write(line + '\n')
                    self.out_bad2.write(self.targets_out[i] + '\n')
                else:
                    self.out1.write(line + '\n')
                    self.out2.write(self.targets_out[i] + '\n')
#close all files opened by __init__
def close_file_streams(self):
if self.close_src:
self.src.close()
if self.close_target:
self.target.close()
if self.close_out1:
self.out1.close()
if self.close_out2:
self.out2.close()
if self.close_out_bad1:
self.out_bad1.close()
if self.close_out_bad2:
self.out_bad2.close()
for should_be_closed,output_stream\
in zip(self.close_srctotarget,self.srctotarget):
if should_be_closed:
output_stream.close()
for should_be_closed,output_stream\
in zip(self.close_targettosrc,self.targettosrc):
if should_be_closed:
output_stream.close()
def log(self, msg, level = 1, end='\n'):
if level <= self.options['verbosity']:
print(msg, end=end, file = self.options['log_to'])
#Allows parallelizing of alignment
if multiprocessing_enabled:
class AlignMultiprocessed(multiprocessing.Process,Aligner):
def __init__(self,tasks,options,scores,log):
multiprocessing.Process.__init__(self)
self.options = options
self.tasks = tasks
self.scores = scores
self.log = log
self.bleualign = []
self.scoredict = None
def run(self):
i,data = self.tasks.get()
while i != None:
self.log('reading in article ' + str(i) + ': ',1)
sourcelist,targetlist,translist1,translist2 = data
self.multialign = self.process(sourcelist,targetlist,translist1,translist2)
self.scores[i] = (data,self.multialign,self.bleualign,self.scoredict)
i,data = self.tasks.get()