| |
| |
| |
| |
| |
|
|
package Corpus;

# Make the modules bundled one directory up loadable before the `use`
# statements below are compiled.
BEGIN
{
	push @INC, "../perllib";
}
use Error;

# Factor names known to this module (presumably the per-token annotation
# layers -- confirm against callers).
# This assignment has to sit ABOVE the file-level `return`: statements that
# follow the `return` are compiled (so the subs below still exist) but are
# never executed, which previously left @FACTORNAMES empty.
our @FACTORNAMES = ('surf', 'pos', 'lemma', 'stem', 'morph');

# True value for require/use; everything after this point is sub definitions.
return 1;
|
|
| |
| |
# Constructor.
# Named arguments:
#   -name          corpus name; files are looked up as "<name>.<ext>"
#   -descriptions  hash ref, filename => description
#   -info_line     "factor names : input LMs : output LMs"; factor names are
#                  whitespace-separated, each LM list is comma-separated
#                  "<factor> <LM filename>" pairs
# Builds the object, locates the corpus files, loads the on-disk cache and
# dumps the object's details to STDERR.
sub new
{
	my ($class, %args) = @_;
	my $corpusName = $args{'-name'};
	my ($factorList, $inputLingmodels, $outputLingmodels) = split(/\s*:\s*/, $args{'-info_line'});

	# factor name -> position of that factor inside a token
	my @factors = split(/\s+/, $factorList);
	my %factorIndices;
	@factorIndices{@factors} = (0 .. $#factors);

	# "<factor> <filename>, <factor> <filename>, ..." -> { factor => filename }
	my $parseLMList = sub
	{
		my %lmFor;
		foreach my $lmInfo (split(/\s*,\s*/, $_[0]))
		{
			my ($lmFactor, $lmFile) = split(/\s+/, $lmInfo);
			$lmFor{$lmFactor} = $lmFile;
		}
		return \%lmFor;
	};

	my $self = bless
	{
		'corpusName'           => $corpusName,
		'truth'                => [],
		'input'                => [],
		'tokenCount'           => {},
		'truthFilename'        => "",
		'inputFilename'        => "",
		'sysoutFilenames'      => {},
		'phraseTableFilenames' => {},
		'fileCtimes'           => {},
		'factorIndices'        => \%factorIndices,
		'inputLMs'             => $parseLMList->($inputLingmodels),
		'outputLMs'            => $parseLMList->($outputLingmodels),
		'phraseTables'         => {},
		'unknownCount'         => {},
		'sysoutWER'            => {},
		'sysoutPWER'           => {},
		'nnAdjWERPWER'         => {},
		'perplexity'           => {},
		'fileDescriptions'     => {},
		'bleuScores'           => {},
		'bleuConfidence'       => {},
		'subsetBLEUstats'      => {},
		'comparisonStats'      => {},
		'cacheFilename'        => "cache/$corpusName.cache",
	}, $class;

	$self->locateFiles($args{'-descriptions'});
	$self->loadCacheFile();
	print STDERR "on load:\n";
	$self->printDetails();
	return $self;
}
|
|
| |
| |
| |
# Return the description registered for $filename.
# Throws Error::Simple when no description is stored under that name.
sub getFileDescription
{
	my $self = shift;
	my $filename = shift;
	my $description = $self->{'fileDescriptions'}->{$filename};
	Error::Simple->throw(-text => "Corpus::getFileDescription(): invalid filename '$filename'\n")
		unless defined $description;
	return $description;
}
|
|
| |
| |
# Return the names of all systems for which an output file was found
# (the keys of the sysoutFilenames table), in no particular order.
sub getSystemNames
{
	my ($self) = @_;
	return keys %{ $self->{'sysoutFilenames'} };
}
|
|
| |
| |
| |
| |
# Count input tokens whose $factorName value has no entry in that factor's
# phrase table.
# Returns (unknown token count, total input token count); both are cached.
sub calcUnknownTokens
{
	my $self = shift;
	my $factorName = shift;

	# cached?
	if(exists $self->{'unknownCount'}->{$factorName} && exists $self->{'tokenCount'}->{'input'})
	{
		return ($self->{'unknownCount'}->{$factorName}, $self->{'tokenCount'}->{'input'});
	}
	warn "calcing unknown tokens\n";

	$self->ensureFilenameDefined('input');
	$self->ensurePhraseTableDefined($factorName);
	$self->ensureFactorPosDefined($factorName);
	$self->loadSentences('input', $self->{'inputFilename'});
	$self->loadPhraseTable($factorName);

	my $factorIndex = $self->{'factorIndices'}->{$factorName};
	my $totalTokens = 0;
	my $unknownTokens = 0;
	foreach my $sentence (@{$self->{'input'}})
	{
		$totalTokens += scalar(@$sentence);
		# grep in scalar context yields the number of unseen words
		$unknownTokens += grep { !defined($self->{'phraseTables'}->{$factorName}->{$_->[$factorIndex]}) } @$sentence;
	}

	$self->{'unknownCount'}->{$factorName} = $unknownTokens;
	$self->{'tokenCount'}->{'input'} = $totalTokens;
	return ($unknownTokens, $totalTokens);
}
|
|
| |
| |
| |
# WER and PWER of system $sysname restricted to the words whose POS tag is
# in the 'nounAndAdj' tag list.
# Returns (WER, PWER), each divided by the total number of truth tokens;
# the pair is cached per system.
sub calcNounAdjWER_PWERDiff
{
	my $self = shift;
	my $sysname = shift;

	# cached?
	return @{$self->{'nnAdjWERPWER'}->{$sysname}} if exists $self->{'nnAdjWERPWER'}->{$sysname};
	warn "calcing NN/JJ PWER/WER\n";

	$self->ensureFilenameDefined('truth');
	$self->ensureFilenameDefined($sysname);
	$self->ensureFactorPosDefined('surf');
	$self->ensureFactorPosDefined('pos');
	$self->loadSentences('truth', $self->{'truthFilename'});
	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});

	my $nnNadjTags = $self->getPOSTagList('nounAndAdj');
	my $werTotal = 0;
	my $pwerTotal = 0;
	foreach my $i (0 .. $#{$self->{'truth'}})
	{
		my @truthWords = $self->filterFactors($self->{'truth'}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
		my @sysoutWords = $self->filterFactors($self->{$sysname}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
		# only the first returned value (the sentence score) is used
		my ($sentenceWer) = $self->sentenceWER(\@sysoutWords, \@truthWords, $self->{'factorIndices'}->{'surf'});
		$werTotal += $sentenceWer;
		my ($sentencePwer) = $self->sentencePWER(\@sysoutWords, \@truthWords, $self->{'factorIndices'}->{'surf'});
		$pwerTotal += $sentencePwer;
	}

	$self->releaseSentences('truth');
	$self->releaseSentences($sysname);
	my $truthTokens = $self->{'tokenCount'}->{'truth'};
	$self->{'nnAdjWERPWER'}->{$sysname} = [$werTotal / $truthTokens, $pwerTotal / $truthTokens];
	return @{$self->{'nnAdjWERPWER'}->{$sysname}};
}
|
|
| |
| |
| |
| |
# Corpus-level WER of system $sysname on factor $factorName (default 'surf').
# Returns the total error count divided by the number of truth tokens.
# The raw total, per-sentence scores and error indices are cached in
# $self->{'sysoutWER'}; the returned value is normalized on every call.
# (Previously a cache hit returned the un-normalized total, so the first and
# later calls disagreed.)
# Returns 0 when the truth corpus contains no tokens.
sub calcOverallWER
{
	my ($self, $sysname, $factorName) = @_;
	$factorName = 'surf' if @_ < 3;

	if(!exists $self->{'sysoutWER'}->{$sysname}->{$factorName})
	{
		warn "calcing WER\n";

		$self->ensureFilenameDefined('truth');
		$self->ensureFilenameDefined($sysname);
		$self->ensureFactorPosDefined($factorName);
		$self->loadSentences('truth', $self->{'truthFilename'});
		$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});

		my ($wer, $swers, $indices) = $self->corpusWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});
		$self->{'sysoutWER'}->{$sysname}->{$factorName} = [$wer, $swers, $indices];

		$self->releaseSentences('truth');
		$self->releaseSentences($sysname);
	}

	# A score restored from the cache file comes without a token count;
	# reading the truth corpus is what fills it in.
	if(!$self->{'tokenCount'}->{'truth'})
	{
		$self->ensureFilenameDefined('truth');
		$self->loadSentences('truth', $self->{'truthFilename'});
	}
	my $truthTokens = $self->{'tokenCount'}->{'truth'};
	return 0 if !$truthTokens;
	return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0] / $truthTokens;
}
|
|
| |
| |
| |
| |
# Corpus-level PWER of system $sysname on factor $factorName (default 'surf').
# Returns the total error count divided by the number of truth tokens.
# The raw total, per-sentence scores and error indices are cached in
# $self->{'sysoutPWER'}; the returned value is normalized on every call.
# (Previously a cache hit returned the un-normalized total, so the first and
# later calls disagreed.)
# Returns 0 when the truth corpus contains no tokens.
sub calcOverallPWER
{
	my ($self, $sysname, $factorName) = @_;
	$factorName = 'surf' if @_ < 3;

	if(!exists $self->{'sysoutPWER'}->{$sysname}->{$factorName})
	{
		warn "calcing PWER\n";

		$self->ensureFilenameDefined('truth');
		$self->ensureFilenameDefined($sysname);
		$self->ensureFactorPosDefined($factorName);
		$self->loadSentences('truth', $self->{'truthFilename'});
		$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});

		my ($pwer, $spwers, $indices) = $self->corpusPWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});
		$self->{'sysoutPWER'}->{$sysname}->{$factorName} = [$pwer, $spwers, $indices];

		$self->releaseSentences('truth');
		$self->releaseSentences($sysname);
	}

	# A score restored from the cache file comes without a token count;
	# reading the truth corpus is what fills it in.
	if(!$self->{'tokenCount'}->{'truth'})
	{
		$self->ensureFilenameDefined('truth');
		$self->loadSentences('truth', $self->{'truthFilename'});
	}
	my $truthTokens = $self->{'tokenCount'}->{'truth'};
	return 0 if !$truthTokens;
	return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0] / $truthTokens;
}
|
|
| |
| |
# BLEU of system $sysname against the truth corpus on factor $factorName
# (default 'surf').
# Returns the list (BLEU, 100*p1, 100*p2, 100*p3, 100*p4, brevity penalty),
# where pN is the N-gram precision (-1, i.e. -100 after scaling, when no
# N-grams of that order were counted).
# $self->{'bleuScores'}->{sys}->{factor} holds [summary list, per-sentence
# stats]; the same summary list is returned whether or not it was cached.
# (Previously a cache hit returned the whole two-element array ref, and an
# empty placeholder entry was treated as a valid cached score.)
sub calcBLEU
{
	my ($self, $sysname, $factorName) = @_;
	$factorName = 'surf' if @_ < 3;

	# cached? only trust an entry that actually carries a summary
	if(exists $self->{'bleuScores'}->{$sysname} && exists $self->{'bleuScores'}->{$sysname}->{$factorName})
	{
		my $cachedSummary = $self->{'bleuScores'}->{$sysname}->{$factorName}->[0];
		return @$cachedSummary if ref($cachedSummary) eq 'ARRAY' && scalar(@$cachedSummary) > 0;
	}
	warn "calcing BLEU\n";

	$self->ensureFilenameDefined('truth');
	$self->ensureFilenameDefined($sysname);
	$self->ensureFactorPosDefined($factorName);
	$self->loadSentences('truth', $self->{'truthFilename'});
	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});

	# start from an empty entry so per-sentence stats are never appended twice
	$self->{'bleuScores'}->{$sysname} = {} if !exists $self->{'bleuScores'}->{$sysname};
	$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];
	my $perSentence = $self->{'bleuScores'}->{$sysname}->{$factorName}->[1];

	# layout of each stats list:
	# (good1, total1, good2, total2, good3, total3, good4, total4, candidate length, reference length)
	my @totals = (0) x 10;
	my $factorIndex = $self->{'factorIndices'}->{$factorName};
	foreach my $i (0 .. $#{$self->{'truth'}})
	{
		my @stats = $self->sentenceBLEU($self->{'truth'}->[$i], $self->{$sysname}->[$i], $factorIndex, 0);
		push @$perSentence, [@stats[0 .. 9]];
		$totals[$_] += $stats[$_] foreach (0 .. 9);
	}
	my ($totCLength, $totRLength) = @totals[8, 9];
	my $brevity = ($totCLength > $totRLength || $totCLength == 0) ? 1 : exp(1 - $totRLength / $totCLength);

	# geometric mean over the N-gram orders that were actually observed
	my @precisions;
	my ($logsum, $logcount) = (0, 0);
	foreach my $n (0 .. 3)
	{
		my ($good, $total) = @totals[2 * $n, 2 * $n + 1];
		if($total > 0)
		{
			push @precisions, $good / $total;
			$logsum += my_log($good / $total);
			$logcount++;
		}
		else
		{
			push @precisions, -1;
		}
	}
	# no N-grams at all (e.g. empty corpus): score 0 rather than dividing by zero
	my $bleu = ($logcount > 0) ? $brevity * exp($logsum / $logcount) : 0;
	$self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = [$bleu, (map { 100 * $_ } @precisions), $brevity];

	$self->releaseSentences('truth');
	$self->releaseSentences($sysname);
	return @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]};
}
|
|
| |
| |
| |
| |
| |
# Consistency test of the N-gram precisions of system $sysname on factor
# $factorName (default 'surf').
# The per-sentence BLEU statistics are dealt round-robin into k = 30 subsets;
# for each N-gram order the subset precisions give a mean and a standard
# deviation, from which a t value for the full-corpus precision and an
# interval mean +/- 2.045 * dev / sqrt(k) are derived.
# Returns (and caches) [ [p-value bound per order], [[low, high] per order] ].
# Side effect: fills $self->{'subsetBLEUstats'}->{sys}->{factor}.
# A subset with no N-grams of some order contributes precision 0, and a zero
# deviation no longer causes a division by zero.
sub statisticallyTestBLEUResults
{
	my ($self, $sysname, $factorName) = @_;
	$factorName = 'surf' if @_ < 3;

	# cached?
	if(exists $self->{'bleuConfidence'}->{$sysname} && exists $self->{'bleuConfidence'}->{$sysname}->{$factorName})
	{
		return $self->{'bleuConfidence'}->{$sysname}->{$factorName};
	}
	warn "performing consistency tests\n";

	my $k = 30;                  # number of subsets
	my $criticalTStat = 2.045;   # presumably the two-sided 95% t value for k - 1 = 29 d.o.f. -- confirm
	$self->ensureFilenameDefined('truth');
	$self->ensureFilenameDefined($sysname);
	$self->ensureFactorPosDefined($factorName);

	# Per-sentence statistics are required. An entry without them (the cache
	# loader can leave an empty placeholder) is flushed and recomputed.
	my $haveSentenceStats = exists $self->{'bleuScores'}->{$sysname}
		&& exists $self->{'bleuScores'}->{$sysname}->{$factorName}
		&& ref($self->{'bleuScores'}->{$sysname}->{$factorName}->[1]) eq 'ARRAY'
		&& scalar(@{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}) > 0;
	if(!$haveSentenceStats)
	{
		$self->flushCache('bleu', $sysname, $factorName);
		$self->calcBLEU($sysname, $factorName);
	}

	# subset i collects sentences i, i + k, i + 2k, ...; always rebuilt from scratch
	my @sentenceStats = @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]};
	my @subsetStats;
	foreach my $i (0 .. $k - 1)
	{
		my @sums = (0) x 10;
		for(my $j = $i; $j < scalar(@sentenceStats); $j += $k)
		{
			$sums[$_] += $sentenceStats[$j]->[$_] foreach (0 .. 9);
		}
		push @subsetStats, \@sums;
	}
	$self->{'subsetBLEUstats'}->{$sysname}->{$factorName} = \@subsetStats;

	my $fullCorpusBLEU = $self->{'bleuScores'}->{$sysname}->{$factorName}->[0];
	my @pValues;
	my @intervals;
	foreach my $n (0 .. 3)
	{
		# precision of order n+1 in each subset; 0 when nothing was counted
		my @precisions = map { $_->[2 * $n + 1] ? $_->[2 * $n] / $_->[2 * $n + 1] : 0 } @subsetStats;

		my $mean = 0;
		$mean += $_ foreach @precisions;
		$mean /= $k;
		my $dev = 0;
		$dev += ($_ - $mean) ** 2 foreach @precisions;
		$dev = sqrt($dev / ($k - 1));

		my $diff = $fullCorpusBLEU->[$n + 1] / 100 - $mean;
		# identical subsets: t is 0 for a matching corpus precision, unbounded otherwise
		my $t = ($dev > 0) ? $diff / $dev : ($diff == 0 ? 0 : 9**9**9);
		push @pValues, getLowerBoundPValue($t);
		push @intervals, [$mean - $criticalTStat * $dev / sqrt($k), $mean + $criticalTStat * $dev / sqrt($k)];
	}

	$self->{'bleuConfidence'}->{$sysname} = {} if !exists $self->{'bleuConfidence'}->{$sysname};
	$self->{'bleuConfidence'}->{$sysname}->{$factorName} = [\@pValues, \@intervals];
	return $self->{'bleuConfidence'}->{$sysname}->{$factorName};
}
|
|
| |
| |
# Perplexity of the $factorName factor of corpus $sysname ('truth', 'input'
# or a system name) under the matching language model: the input LM for
# 'input', the output LM for everything else.
# The factor is extracted with ./extract-factors.pl into a temporary file in
# the current directory, which is then scored with ./ngram; the value after
# "ppl1=" on the second line of ngram's output is cached and returned.
# Returns undef (and caches nothing) when that value cannot be found;
# previously a stale capture from an earlier regex match could be cached.
# NOTE(review): file names are interpolated into shell command lines unquoted;
# names containing shell metacharacters would break or subvert the commands.
sub calcPerplexity
{
	my ($self, $sysname, $factorName) = @_;
	print STDERR "ppl $sysname $factorName\n";

	# cached?
	if(exists $self->{'perplexity'}->{$sysname} && exists $self->{'perplexity'}->{$sysname}->{$factorName})
	{
		return $self->{'perplexity'}->{$sysname}->{$factorName};
	}
	warn "calcing perplexity\n";

	$self->ensureFilenameDefined($sysname);
	my $sysoutFilename = ($sysname eq 'truth' || $sysname eq 'input')
		? $self->{"${sysname}Filename"}
		: $self->{'sysoutFilenames'}->{$sysname};
	my $lmFilename = ($sysname eq 'input')
		? $self->{'inputLMs'}->{$factorName}
		: $self->{'outputLMs'}->{$factorName};

	my $tmpfile = ".tmp" . time;
	my $cmd = "perl ./extract-factors.pl $sysoutFilename " . $self->{'factorIndices'}->{$factorName} . " > $tmpfile";
	`$cmd`;
	my @output = `./ngram -lm $lmFilename -ppl $tmpfile`;
	unlink($tmpfile);

	# list-context match: $perplexity stays undef unless the pattern matched
	my ($perplexity) = defined($output[1]) ? ($output[1] =~ /ppl1=\s*([0-9\.]+)/) : ();
	if(!defined($perplexity))
	{
		warn "Corpus::calcPerplexity(): no perplexity found in ngram output for '$sysname', factor '$factorName'\n";
		return undef;
	}
	$self->{'perplexity'}->{$sysname}->{$factorName} = $perplexity;
	return $self->{'perplexity'}->{$sysname}->{$factorName};
}
|
|
| |
| |
| |
| |
| |
# Compare the N-gram precisions of two systems on factor $factorName using
# the per-subset statistics produced by statisticallyTestBLEUResults().
# For each N-gram order 1..4 it runs
#   - a paired t-test on the per-subset precision differences, and
#   - a sign test on how often system 1 beats system 2 (ties count for
#     system 2).
# Returns (and caches) [t-test p-value bounds, t-test winner indices,
# sign-test values, sign-test winner indices]; a winner index of 0 means
# $sysname1, 1 means $sysname2.
# Missing subset statistics are (re)computed first, subsets without N-grams
# count as precision 0, and a zero deviation no longer divides by zero.
sub statisticallyCompareSystemResults
{
	my ($self, $sysname1, $sysname2, $factorName) = @_;

	# cached?
	if(exists $self->{'comparisonStats'}->{$sysname1} && exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}
		&& exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName})
	{
		return $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName};
	}
	warn "comparing sysoutputs\n";

	$self->ensureFilenameDefined($sysname1);
	$self->ensureFilenameDefined($sysname2);
	$self->ensureFactorPosDefined($factorName);

	foreach my $sysname ($sysname1, $sysname2)
	{
		my $haveSubsets = exists $self->{'subsetBLEUstats'}->{$sysname}
			&& exists $self->{'subsetBLEUstats'}->{$sysname}->{$factorName}
			&& scalar(@{$self->{'subsetBLEUstats'}->{$sysname}->{$factorName}}) > 0;
		next if $haveSubsets;
		# Confidence data restored from the cache file comes without subset
		# statistics; drop it so that the call below really recomputes them.
		if(exists $self->{'bleuConfidence'}->{$sysname})
		{
			delete $self->{'bleuConfidence'}->{$sysname}->{$factorName};
		}
		$self->statisticallyTestBLEUResults($sysname, $factorName);
	}
	my $subsets1 = $self->{'subsetBLEUstats'}->{$sysname1}->{$factorName};
	my $subsets2 = $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName};
	my $subsetCount = scalar(@$subsets1);

	# precision of order $n + 1 in one subset; 0 when nothing was counted
	my $precisionOf = sub
	{
		my ($stats, $n) = @_;
		return $stats->[2 * $n + 1] ? $stats->[2 * $n] / $stats->[2 * $n + 1] : 0;
	};

	my ($tConfidences, $tWinningIndices, $signConfidences, $signWinningIndices) = ([], [], [], []);
	for(my $i = 0; $i < 4; $i++)
	{
		my @diffs;
		my ($nPlus, $nMinus) = (0, 0);
		foreach my $j (0 .. $subsetCount - 1)
		{
			my ($prec1, $prec2) = ($precisionOf->($subsets1->[$j], $i), $precisionOf->($subsets2->[$j], $i));
			push @diffs, $prec1 - $prec2;
			if($prec1 > $prec2) {$nPlus++;} else {$nMinus++;}
		}

		# paired t-test: mean difference over its standard error
		my ($mean, $dev) = (0, 0);
		$mean += $_ foreach @diffs;
		$mean /= $subsetCount if $subsetCount > 0;
		$dev += ($_ - $mean) ** 2 foreach @diffs;
		$dev = ($subsetCount > 1) ? sqrt($dev / (($subsetCount - 1) * $subsetCount)) : 0;
		# no spread at all: t is 0 for identical systems, unbounded otherwise
		my $t = ($dev > 0) ? $mean / $dev : ($mean == 0 ? 0 : ($mean > 0 ? 1 : -1) * 9**9**9);
		my $cc = getUpperBoundPValue($t);
		print STDERR "comparing at n=$i: mu $mean, sigma $dev, t $t -> conf >= " . (1 - $cc) . "\n";
		push @$tConfidences, $cc;
		push @$tWinningIndices, ($mean > 0) ? 0 : 1;

		# sign test: add up the binomial probabilities of all outcomes that
		# are more likely than the observed number of wins for system 1
		my $trials = $nPlus + $nMinus;
		my @probabilities = map { binCoeff($trials, $_) } (0 .. $trials);
		my $sumCoeffs = 0;
		foreach my $probability (@probabilities)
		{
			if($probability > $probabilities[$nPlus]) {$sumCoeffs += $probability;}
		}
		push @$signConfidences, $sumCoeffs;
		push @$signWinningIndices, ($nPlus > $nMinus) ? 0 : 1;
	}

	$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [$tConfidences, $tWinningIndices, $signConfidences, $signWinningIndices];
	return $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName};
}
|
|
| |
| |
| |
| |
| |
# Write a sentence-by-sentence comparison of the selected corpus files to
# the handle $fh.
# $filter chooses among the extensions 'e', 'f' and every system name:
#   - a code ref is called for each extension (passed as the argument and
#     also available in $_); the extension is kept when it returns true
#   - a compiled regex (qr//) keeps the extensions it matches
#   - any other value keeps everything when true and nothing when false,
#     which is all the previous `grep($filter, ...)` could ever do
# NOTE(review): the intended filter type is not visible here -- confirm
# against the callers.
sub writeComparisonPage
{
	my ($self, $fh, $filter) = @_;
	my @extensions = ('e', 'f', keys %{$self->{'sysoutFilenames'}});
	my @filteredExtensions;
	if(ref($filter) eq 'CODE')
	{
		@filteredExtensions = grep { $filter->($_) } @extensions;
	}
	elsif(ref($filter) eq 'Regexp')
	{
		@filteredExtensions = grep { $_ =~ $filter } @extensions;
	}
	else
	{
		@filteredExtensions = grep($filter, @extensions);
	}

	my %openedFiles = $self->openFiles(@filteredExtensions);
	my $id = 1;
	# an empty hash from readLineFromFiles() ends the loop
	while(my %lines = $self->readLineFromFiles(%openedFiles))
	{
		$self->printSingleSentenceComparison($fh, $id, %lines);
		$id++;
	}
	$self->closeFiles(%openedFiles);
}
|
|
| |
| |
| |
|
|
| |
| |
| |
# Destructor: persist all cached results to the cache file.
sub DESTROY
{
	my $self = shift;
	# Writing the cache opens and stats files, which can overwrite the
	# error/status globals; localize them so that the exception or exit
	# status of the code that triggered the destruction is not disturbed.
	local ($., $@, $!, $^E, $?);
	$self->writeCacheFile();
}
|
|
| |
| |
| |
# Write every cached result to $self->{'cacheFilename'}.
# The file is a sequence of sections, each introduced by a header line and
# separated by blank lines: file change times, BLEU scores, BLEU statistics,
# statistical comparisons, unknown-token counts, WER scores, perplexity and
# NN/ADJ WER/PWER. Fields within a line are space-separated; sub-lists are
# separated by ';'.
# Only warns (and writes nothing) when the file cannot be opened.
# The NN/ADJ header and the final newline now go to the cache file; they
# used to be printed to STDOUT, so that section was written without a header.
sub writeCacheFile
{
	my $self = shift;
	my $cacheFh;
	if(!open($cacheFh, '>', $self->{'cacheFilename'}))
	{
		warn "Corpus::writeCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for write\n";
		return;
	}

	# store file changetimes to disk
	print $cacheFh "File changetimes\n";
	my $ensureCtimeIsOutput = sub
	{
		my $ext = shift;
		# keep the recorded time while the file is unchanged, otherwise stamp it with "now"
		my $stamp = (exists $self->{'fileCtimes'}->{$ext} && $self->cacheIsCurrentForFile($ext))
			? $self->{'fileCtimes'}->{$ext}
			: time;
		print $cacheFh "$ext $stamp\n";
	};
	if(exists $self->{'truthFilename'}) {$ensureCtimeIsOutput->('e');}
	if(exists $self->{'inputFilename'}) {$ensureCtimeIsOutput->('f');}
	foreach my $factorName (keys %{$self->{'phraseTableFilenames'}}) {$ensureCtimeIsOutput->("pt_$factorName");}
	foreach my $sysname (keys %{$self->{'sysoutFilenames'}}) {$ensureCtimeIsOutput->($sysname);}

	# BLEU: "<system> <factor> <summary values>;<sentence stats>;..."
	print $cacheFh "\nBLEU scores\n";
	foreach my $sysname (keys %{$self->{'bleuScores'}})
	{
		foreach my $factorName (keys %{$self->{'bleuScores'}->{$sysname}})
		{
			my ($summary, $sentenceStats) = @{$self->{'bleuScores'}->{$sysname}->{$factorName}};
			print $cacheFh "$sysname $factorName " . join(' ', @$summary);
			print $cacheFh ";" . join(' ', @$_) foreach @$sentenceStats;
			print $cacheFh "\n";
		}
	}

	# BLEU statistics: "<system> <factor> <p-values>;<interval>;..."
	print $cacheFh "\nBLEU statistics\n";
	foreach my $sysname (keys %{$self->{'bleuConfidence'}})
	{
		foreach my $factorName (keys %{$self->{'bleuConfidence'}->{$sysname}})
		{
			my ($pValues, $intervals) = @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}};
			print $cacheFh "$sysname $factorName " . join(' ', @$pValues);
			print $cacheFh ";" . join(' ', @$_) foreach @$intervals;
			print $cacheFh "\n";
		}
	}

	# comparisons: "<system1> <system2> <factor> <list>;<list>;..."
	print $cacheFh "\nStatistical comparisons\n";
	foreach my $sysname1 (keys %{$self->{'comparisonStats'}})
	{
		foreach my $sysname2 (keys %{$self->{'comparisonStats'}->{$sysname1}})
		{
			foreach my $factorName (keys %{$self->{'comparisonStats'}->{$sysname1}->{$sysname2}})
			{
				print $cacheFh "$sysname1 $sysname2 $factorName " . join(';', map {join(' ', @$_)} @{$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}}) . "\n";
			}
		}
	}

	# unknown tokens: "<factor> <phrase table file> <unknown count> <input token count>"
	print $cacheFh "\nUnknown-token counts\n";
	foreach my $factorName (keys %{$self->{'unknownCount'}})
	{
		print $cacheFh $factorName . " " . $self->{'phraseTableFilenames'}->{$factorName} . " " . $self->{'unknownCount'}->{$factorName} . " " . $self->{'tokenCount'}->{'input'} . "\n";
	}

	# WER/PWER: "<type> <system> <factor> <total> <sentence scores>;<error indices>;..."
	print $cacheFh "\nWER scores\n";
	foreach my $werType ('sysoutWER', 'sysoutPWER')
	{
		foreach my $sysname (keys %{$self->{$werType}})
		{
			foreach my $factorName (keys %{$self->{$werType}->{$sysname}})
			{
				my ($totalWER, $sentenceWERs, $errorWords) = @{$self->{$werType}->{$sysname}->{$factorName}};
				print $cacheFh "$werType $sysname $factorName $totalWER " . join(' ', @$sentenceWERs);
				foreach my $indices (@$errorWords)
				{
					# tolerate an entry that is not a list of index lists
					print $cacheFh ";" . join(' ', (ref($indices) eq 'ARRAY') ? @$indices : ());
				}
				print $cacheFh "\n";
			}
		}
	}

	# perplexity: "<system> <factor> <perplexity>"
	print $cacheFh "\nPerplexity\n";
	foreach my $sysname (keys %{$self->{'perplexity'}})
	{
		foreach my $factorName (keys %{$self->{'perplexity'}->{$sysname}})
		{
			print $cacheFh "$sysname $factorName " . $self->{'perplexity'}->{$sysname}->{$factorName} . "\n";
		}
	}

	# noun/adjective scores: "<system> <WER> <PWER>"
	print $cacheFh "\nNN/ADJ WER/PWER\n";
	foreach my $sysname (keys %{$self->{'nnAdjWERPWER'}})
	{
		print $cacheFh "$sysname " . join(' ', @{$self->{'nnAdjWERPWER'}->{$sysname}}) . "\n";
	}
	print $cacheFh "\n";

	# buffered write errors only surface when the handle is closed
	close($cacheFh) or warn "Corpus::writeCacheFile(): error closing '" . $self->{'cacheFilename'} . "'\n";
}
|
|
| |
| |
| |
# Read the cache file written by writeCacheFile() back into the object.
# A header line selects the section; every other non-blank line is parsed
# according to the current section. Entries whose source files have changed
# since the cache was written (see cacheIsCurrentForFile) are skipped.
# Only warns when the cache file cannot be opened.
# Changes from the previous version:
#   - parsed BLEU scores are stored (an empty placeholder used to be left)
#   - WER error indices are collected as one list per sentence instead of
#     each sentence overwriting the previous one
#   - the cached input token count is stored under tokenCount->{'input'},
#     the key calcUnknownTokens() checks
#   - all parsing variables are lexical
sub loadCacheFile
{
	my $self = shift;
	my $cacheFh;
	if(!open($cacheFh, '<', $self->{'cacheFilename'}))
	{
		warn "Corpus::loadCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for read\n";
		return;
	}

	# section headers in the order they are tested; the first match wins
	my @sectionHeaders =
	(
		[qr/File changetimes/, 'ctime'],
		[qr/BLEU scores/, 'bleu'],
		[qr/BLEU statistics/, 'bstats'],
		[qr/Statistical comparisons/, 'cmp'],
		[qr/Unknown-token counts/, 'unk'],
		[qr/WER scores/, 'wer'],
		[qr/Perplexity/, 'ppl'],
		[qr/NN\/ADJ WER\/PWER/, 'nawp'],
	);
	# "a b;c d" -> ([a, b], [c, d])
	my $parseLists = sub
	{
		my $text = defined($_[0]) ? $_[0] : '';
		return map { [split(' ', $_)] } split(/;/, $text);
	};

	my $mode = 'none';
	LINE: while(my $line = <$cacheFh>)
	{
		next LINE if $line =~ /^[ \t\n\r\x0a]*$/;
		chomp $line;

		my ($newMode) = map { $_->[1] } grep { $line =~ $_->[0] } @sectionHeaders;
		if(defined $newMode)
		{
			$mode = $newMode;
			next LINE;
		}

		if($mode eq 'ctime')
		{
			my ($fileExtension, $ctime) = split(/\s+/, $line);
			$self->{'fileCtimes'}->{$fileExtension} = $ctime;
		}
		elsif($mode eq 'bleu')
		{
			my ($sysname, $factorName, $rest) = split(/\s+/, $line, 3);
			next LINE if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
			my @stats = $parseLists->($rest);
			my $summary = shift @stats;
			next LINE if !defined($summary) || !@$summary;   # nothing usable on this line
			print STDERR "bleu 1: " . join(', ', @$summary) . "\n";
			print STDERR "bleu 2: " . join(' ', map {"{" . join(', ', @$_) . "}"} @stats) . "\n";
			$self->{'bleuScores'}->{$sysname}->{$factorName} = [$summary, \@stats];
		}
		elsif($mode eq 'bstats')
		{
			my ($sysname, $factorName, $rest) = split(/\s+/, $line, 3);
			next LINE if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
			my @stats = $parseLists->($rest);
			my $pValues = shift @stats;
			$self->{'bleuConfidence'}->{$sysname}->{$factorName} = [defined($pValues) ? $pValues : [], \@stats];
		}
		elsif($mode eq 'cmp')
		{
			my ($sysname1, $sysname2, $factorName, $rest) = split(/\s+/, $line, 4);
			next LINE if !$self->cacheIsCurrentForFile($sysname1) || !$self->cacheIsCurrentForFile($sysname2) || !$self->cacheIsCurrentForFile('e');
			my @stats = $parseLists->($rest);
			$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = \@stats;
		}
		elsif($mode eq 'unk')
		{
			my ($factorName, $phraseTableFilename, $unknownCount, $totalCount) = split(' ', $line);
			next LINE if !$self->cacheIsCurrentForFile('f') || !$self->cacheIsCurrentForFile("pt_$factorName");
			if(defined($self->{'phraseTableFilenames'}->{$factorName}) && $self->{'phraseTableFilenames'}->{$factorName} eq $phraseTableFilename)
			{
				$self->{'unknownCount'}->{$factorName} = $unknownCount;
				$self->{'tokenCount'}->{'input'} = $totalCount;
				# kept for any code that still reads the old key
				$self->{'totalTokens'} = $totalCount;
			}
		}
		elsif($mode eq 'wer')
		{
			my ($werType, $sysname, $factorName, $totalWER, $details) = split(/\s+/, $line, 5);
			# the type doubles as a key of $self; accept only the two the writer emits
			next LINE if !defined($werType) || ($werType ne 'sysoutWER' && $werType ne 'sysoutPWER');
			next LINE if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
			# "<sentence scores>;<indices of sentence 1>;<indices of sentence 2>;..."
			my ($scoreText, $indexText) = split(/;/, defined($details) ? $details : '', 2);
			my @sentenceWERs = defined($scoreText) ? split(' ', $scoreText) : ();
			# limit -1 keeps the trailing empty lists of error-free sentences
			my @indexLists = defined($indexText) ? split(/;/, $indexText, -1) : ();
			my @errorIndices = map { [defined($indexLists[$_]) ? split(' ', $indexLists[$_]) : ()] } (0 .. $#sentenceWERs);
			$self->{$werType}->{$sysname}->{$factorName} = [$totalWER, \@sentenceWERs, \@errorIndices];
		}
		elsif($mode eq 'ppl')
		{
			my ($sysname, $factorName, $perplexity) = split(/\s+/, $line);
			next LINE if !$self->cacheIsCurrentForFile($sysname);
			$self->{'perplexity'}->{$sysname}->{$factorName} = $perplexity;
		}
		elsif($mode eq 'nawp')
		{
			my ($sysname, @scores) = split(/\s+/, $line);
			next LINE if !$self->cacheIsCurrentForFile($sysname);
			$self->{'nnAdjWERPWER'}->{$sysname} = \@scores;
		}
	}
	close($cacheFh);
}
|
|
| |
| |
# Drop one cached result so that it is recomputed on the next request.
# Only $cacheType 'bleu' is handled: it removes the BLEU entry of
# ($sysname, $factorName). Any other cache type is ignored.
sub flushCache
{
	my ($self, $cacheType, $sysname, $factorName) = @_;
	if($cacheType eq 'bleu'
		&& defined($self->{'bleuScores'}->{$sysname})
		&& defined($self->{'bleuScores'}->{$sysname}->{$factorName}))
	{
		delete $self->{'bleuScores'}->{$sysname}->{$factorName};
	}
}
|
|
| |
| |
# Tell whether cached results for the file "<corpusName>.<ext>" are still
# valid. Returns 1 when a timestamp is recorded for $ext and the file has not
# been modified after it, 0 otherwise.
# A file that cannot be stat'ed (e.g. it no longer exists) now counts as not
# current; the failed stat used to compare as "unchanged".
sub cacheIsCurrentForFile
{
	my ($self, $ext) = @_;
	return 0 if !exists $self->{'fileCtimes'}->{$ext};
	my @liveStats = stat($self->{'corpusName'} . ".$ext");
	return 0 if !@liveStats;
	# element 9 of stat() is the last-modification time
	return ($liveStats[9] <= $self->{'fileCtimes'}->{$ext}) ? 1 : 0;
}
|
|
| |
| |
# Numeric minimum of two values.
sub min
{
	my ($first, $second) = @_;
	return $second unless $first < $second;
	return $first;
}
| |
# Numeric maximum of two values.
sub max
{
	my ($first, $second) = @_;
	return $second unless $first > $second;
	return $first;
}
| |
# Natural logarithm that never fails on zero: a false argument (0, undef,
# empty string) yields the large negative stand-in -9999999999.
sub my_log
{
	my ($value) = @_;
	return $value ? log($value) : -9999999999;
}
| |
# Round to the nearest integer; halves are rounded up (2.5 -> 3, -2.5 -> -2).
# Negative non-integers used to be wrong (-1.7 gave -1) because int()
# truncates toward zero; the value is now floored first.
sub round
{
	my $x = shift;
	my $floor = int($x);
	$floor-- if $floor > $x;   # int() rounded a negative value up
	return ($x - $floor < .5) ? $floor : $floor + 1;
}
|
|
| |
| |
| |
| |
| |
| |
# Look up a p-value for a t statistic (sign ignored) in a fixed table.
# Returns the p-value of the smallest tabulated threshold that is >= |t|,
# or 0 when |t| exceeds every threshold.
# NOTE(review): the thresholds look like two-sided Student-t values for 29
# degrees of freedom -- confirm.
sub getLowerBoundPValue
{
	my ($tStatistic) = @_;
	my $t = abs($tStatistic);

	# [t threshold, p-value], thresholds in ascending order
	my @thresholds =
	(
		[0.0063, .995],
		[0.0126, .99],
		[0.0253, .98],
		[0.0380, .97],
		[0.0506, .96],
		[0.0633, .95],
		[0.0950, .925],
		[0.127, .9],
		[0.191, .85],
		[0.256, .8],
		[0.389, .7],
		[0.530, .6],
		[0.683, .5],
		[0.854, .4],
		[1.055, .3],
		[1.311, .2],
		[1.699, .1],
	);
	foreach my $entry (@thresholds)
	{
		my ($threshold, $pValue) = @$entry;
		return $pValue if $t <= $threshold;
	}
	return 0;
}
| |
| |
# Look up a p-value for a t statistic (sign ignored) in a fixed table.
# Returns the p-value of the largest tabulated threshold that is <= |t|,
# or 1 when |t| is below every threshold.
# NOTE(review): the thresholds look like two-sided Student-t values for 29
# degrees of freedom -- confirm.
sub getUpperBoundPValue
{
	my ($tStatistic) = @_;
	my $t = abs($tStatistic);

	# [t threshold, p-value], thresholds in descending order
	my @thresholds =
	(
		[4.506, .0001],
		[4.254, .0002],
		[3.918, .0005],
		[3.659, .001],
		[3.396, .002],
		[3.038, .005],
		[2.756, .01],
		[2.462, .02],
		[2.045, .05],
		[1.699, .1],
		[1.311, .2],
		[0.683, .5],
	);
	foreach my $entry (@thresholds)
	{
		my ($threshold, $pValue) = @$entry;
		return $pValue if $t >= $threshold;
	}
	return 1;
}
|
|
| |
| |
# Binomial coefficient C(n, r) multiplied by 0.5 ** n, i.e. the probability
# of exactly r successes in n fair coin flips.
sub binCoeff
{
	my ($n, $r) = @_;
	# builds n! / (r! * (n - r)!) one factor at a time: (r + 1) / 1, (r + 2) / 2, ...
	my $product = 1;
	my $factor = $r + 1;
	while($factor <= $n)
	{
		$product = $product * $factor / ($factor - $r);
		$factor++;
	}
	return $product * (.5 ** $n);
}
|
|
| |
| |
| |
# Throw Error::Simple unless a token position is known for $factorName.
sub ensureFactorPosDefined
{
	my $self = shift;
	my $factorName = shift;
	return if defined $self->{'factorIndices'}->{$factorName};
	Error::Simple->throw(-text => "Corpus: no index known for factor '$factorName'\n");
}
|
|
| |
| |
| |
# Throw Error::Simple unless a file is known for $sysname, which is 'truth',
# 'input' or the name of a system.
# An empty filename counts as missing: the constructor initializes the truth
# and input filenames to "", so a plain defined() test could never fail for
# them and the problem only surfaced later as a failed open.
sub ensureFilenameDefined
{
	my ($self, $sysname) = @_;
	my $isCorpusSide = ($sysname eq 'truth' || $sysname eq 'input');
	my $filename = $isCorpusSide ? $self->{"${sysname}Filename"} : $self->{'sysoutFilenames'}->{$sysname};
	return if defined($filename) && $filename ne '';

	if($isCorpusSide)
	{
		Error::Simple->throw(-text => "Corpus: no $sysname corpus defined\n");
	}
	Error::Simple->throw(-text => "Corpus: no system $sysname defined\n");
}
|
|
| |
| |
| |
# Throw Error::Simple unless a phrase table file is known for $factorName.
sub ensurePhraseTableDefined
{
	my $self = shift;
	my $factorName = shift;
	return if defined $self->{'phraseTableFilenames'}->{$factorName};
	Error::Simple->throw(-text => "Corpus: no phrase table defined for factor '$factorName'\n");
}
|
|
| |
| |
| |
# Scan the current directory for files named "<corpusName>.<ext>" and record
# them by extension:
#   e          -> truth (reference) file
#   f          -> input file
#   pt_<name>  -> phrase table for factor <name>
#   other      -> output of the system called <ext>
# A description from $refDescs (hash ref, filename => description) is copied
# for every file found.
# Dies when the directory cannot be read.
# The corpus name is now matched literally (regex metacharacters in it used
# to be interpreted), "pt_" must start the extension, and the directory is
# read directly instead of through `ls`.
sub locateFiles
{
	my ($self, $refDescs) = @_;
	opendir(my $dirHandle, '.') or die "Corpus::locateFiles(): couldn't list current directory\n";
	# hidden entries are skipped, as `ls` did; sorted for a stable order
	my @filenames = sort grep { !/^\./ } readdir($dirHandle);
	closedir($dirHandle);

	my $corpusName = $self->{'corpusName'};
	foreach my $filename (@filenames)
	{
		next unless $filename =~ /^\Q$corpusName\E\.(.*)$/;
		my $ext = $1;
		if($ext eq 'e') {$self->{'truthFilename'} = $filename;}
		elsif($ext eq 'f') {$self->{'inputFilename'} = $filename;}
		elsif($ext =~ /^pt_(.*)/) {$self->{'phraseTableFilenames'}->{$1} = $filename;}
		else {$self->{'sysoutFilenames'}->{$ext} = $filename;}
		if(defined($refDescs->{$filename}))
		{
			$self->{'fileDescriptions'}->{$filename} = $refDescs->{$filename};
		}
	}
}
|
|
| |
| |
| |
# Read the corpus file $filename into $self->{$sysname}, unless sentences
# are already loaded under that name.
# Every line becomes one sentence: a list of tokens, each token a list of
# its '|'-separated factors. The number of tokens read is stored in
# $self->{'tokenCount'}->{$sysname}.
# Dies when the file cannot be opened.
# Lines are split on runs of whitespace with leading whitespace ignored;
# an indented line used to yield an extra empty token.
sub loadSentences
{
	my ($self, $sysname, $filename) = @_;
	# already loaded?
	if(exists $self->{$sysname} && scalar(@{$self->{$sysname}}) > 0) {return;}

	$self->{$sysname} = [];
	$self->{'tokenCount'}->{$sysname} = 0;
	open(my $inFh, '<', $filename) or die "Corpus::load(): couldn't open '$filename' for read\n";
	while(my $line = <$inFh>)
	{
		my @words = split(' ', $line);
		$self->{'tokenCount'}->{$sysname} += scalar(@words);
		# a blank line still adds an (empty) sentence, keeping line numbers aligned across files
		push @{$self->{$sysname}}, [map { [split(/\|/, $_)] } @words];
	}
	close($inFh);
}
|
|
| |
| |
| |
# Counterpart of loadSentences(). Currently does nothing: the body is empty,
# so loaded sentences stay in memory.
sub releaseSentences
{
	return;
}
|
|
| |
| |
| |
# Load the phrase table of factor $factorName into
# $self->{'phraseTables'}->{$factorName}. Only the first '|||'-separated
# field of each line is kept, as a hash key with value 0.
# Throws Error::Simple when no phrase table file is known for the factor and
# dies when the file cannot be opened.
# The file is opened with three-argument open on a lexical handle, so its
# name is never interpreted as an open mode and no global handle is reused.
sub loadPhraseTable
{
	my ($self, $factorName) = @_;
	$self->ensurePhraseTableDefined($factorName);

	my $filename = $self->{'phraseTableFilenames'}->{$factorName};
	open(my $tableFh, '<', $filename) or die "couldn't open '$filename' for read\n";
	my %sourcePhrases;
	while(my $line = <$tableFh>)
	{
		my ($sourcePhrase) = split(/\s*\|\|\|\s*/, $line, 2);
		$sourcePhrases{$sourcePhrase} = 0;
	}
	close($tableFh);
	$self->{'phraseTables'}->{$factorName} = \%sourcePhrases;
}
|
|
| |
| |
# Discard the loaded phrase table of factor $factorName by replacing it with
# an empty hash.
sub releasePhraseTable
{
	my $self = shift;
	my ($factorName) = @_;
	return $self->{'phraseTables'}->{$factorName} = {};
}
|
|
| |
| |
# Return a reference to the list of POS tags registered under $listname.
# Only 'nounAndAdj' is known; any other name yields a false value.
sub getPOSTagList
{
	my ($self, $listname) = @_;
	return [qw(NN NNS NNP NNPS JJ JJR JJS)] if $listname eq 'nounAndAdj';
}
|
|
| |
| |
# Keep only the words whose factor at the given index is one of the given values.
# Matching is by exact string equality. The values are NOT treated as regular
# expressions, so values such as '.', '$' or '(' are safe and match only
# themselves, and an empty value list selects nothing.
# arguments: array ref of words (each an array ref of factors), factor index,
#   array ref of factor values to keep
# return: list (not a ref) of the words that passed, in their original order
sub filterFactors
{
    my ($self, $refFullList, $index, $refFactorValues) = @_;
    my %wanted = map {($_ => 1)} @$refFactorValues;
    my @filteredList = ();
    foreach my $factors (@$refFullList)
    {
        my $value = $factors->[$index];
        # a word that lacks this factor can never be selected
        next if(!defined($value));
        if(exists $wanted{$value})
        {
            push @filteredList, $factors;
        }
    }
    return @filteredList;
}
|
|
| |
| |
# Run sentenceWER() over every sentence pair and accumulate the results.
# Sentences are paired by position; the loop runs over the system output.
# arguments: array ref of system-output sentences, array ref of reference
#   sentences, index of the factor to compare
# return: (sum of per-sentence scores, array ref of per-sentence scores,
#   array ref of per-sentence error-index array refs)
sub corpusWER
{
    my ($self, $refSysOutput, $refTruth, $index) = @_;
    my $total = 0;
    my @perSentence = ();
    my @perSentenceErrors = ();
    my $numSentences = scalar(@$refSysOutput);
    foreach my $sentNum (0 .. $numSentences - 1)
    {
        my ($score, $errors) = $self->sentenceWER($refSysOutput->[$sentNum], $refTruth->[$sentNum], $index);
        $total += $score;
        push @perSentence, $score;
        push @perSentenceErrors, $errors;
    }
    return ($total, \@perSentence, \@perSentenceErrors);
}
|
|
| |
| |
# Word error count for one sentence pair, comparing a single factor with a
# case-sensitive string match.
# A table indexed by [sysout word][reference word] is filled with the best
# match count found so far and the direction it was reached from; the table is
# then walked back from the last cell to collect the system-output positions
# that were not matched.
# Each word on either side can be marked as matched only once while the table
# is filled.
# arguments: array ref of system-output words, array ref of reference words
#   (each word an array ref of factors), index of the factor to compare
# return: (number of system-output words minus the final match count,
#   array ref of error positions in the system output)
sub sentenceWER
{
    # directions a table cell can be reached from
    my ($DIR_NONE, $DIR_SKIPTRUTH, $DIR_SKIPOUT, $DIR_SKIPBOTH) = (-1, 0, 1, 2);
    my ($self, $refSysOutput, $refTruth, $index) = @_;
    my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth));
    # no system output: nothing can be wrong
    if($sLength == 0) {return (0, []);}
    # empty reference: nothing can match, so every output word is an error
    # (same answer the general formula below and sentencePWER() give)
    if($eLength == 0) {return ($sLength, [0 .. $sLength - 1]);}

    my @refWordsMatchIndices = (-1) x $eLength;
    my @sysoutWordsMatchIndices = (-1) x $sLength;
    my $table = [];
    # fill the table; each cell is [match count so far, direction it came from]
    for(my $i = 0; $i < $sLength; $i++)
    {
        push @$table, [];
        for(my $j = 0; $j < $eLength; $j++)
        {
            my ($maxPrev, $prevDir) = (0, $DIR_NONE);
            # later checks use >=, so on ties the diagonal wins over left, and left over up
            if($i > 0 && $table->[$i - 1]->[$j]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j]->[0]; $prevDir = $DIR_SKIPOUT;}
            if($j > 0 && $table->[$i]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i]->[$j - 1]->[0]; $prevDir = $DIR_SKIPTRUTH;}
            if($i > 0 && $j > 0 && $table->[$i - 1]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j - 1]->[0]; $prevDir = $DIR_SKIPBOTH;}
            my $match = ($refSysOutput->[$i]->[$index] eq $refTruth->[$j]->[$index] && $refWordsMatchIndices[$j] == -1 && $sysoutWordsMatchIndices[$i] == -1) ? 1 : 0;
            if($match == 1) {$refWordsMatchIndices[$j] = $i; $sysoutWordsMatchIndices[$i] = $j;}
            push @{$table->[$i]}, [($match ? $maxPrev + 1 : $maxPrev), $prevDir];
        }
    }

    # walk back from the last cell; a step that consumes an output word without
    # raising the match count marks that word as an error
    my @indices = ();
    my ($i, $j) = ($sLength - 1, $eLength - 1);
    while($i > 0)
    {
        my ($score, $dir) = @{$table->[$i]->[$j]};
        if($dir == $DIR_SKIPTRUTH)
        {
            $j--;
        }
        elsif($dir == $DIR_SKIPOUT)
        {
            if($table->[$i - 1]->[$j]->[0] == $score) {unshift @indices, $i;}
            $i--;
        }
        elsif($dir == $DIR_SKIPBOTH)
        {
            if($table->[$i - 1]->[$j - 1]->[0] == $score) {unshift @indices, $i;}
            $i--; $j--;
        }
    }
    # first output word: move left along row 0 until the reference word it was
    # matched to (if any); reaching column 0 unmatched makes word 0 an error
    while($j > 0 && $refWordsMatchIndices[$j] != 0) {$j--;}
    if($j == 0 && $refWordsMatchIndices[0] != 0) {unshift @indices, 0;}

    my $matchCount = $table->[$sLength - 1]->[$eLength - 1]->[0];
    return ($sLength - $matchCount, \@indices);
}
|
|
| |
| |
# Run sentencePWER() over every sentence pair and accumulate the results.
# Sentences are paired by position; the loop runs over the system output.
# arguments: array ref of system-output sentences, array ref of reference
#   sentences, index of the factor to compare
# return: (sum of per-sentence scores, array ref of per-sentence scores,
#   array ref of per-sentence error-index array refs)
sub corpusPWER
{
    my ($self, $refSysOutput, $refTruth, $index) = @_;
    my $total = 0;
    my @perSentence = ();
    my @perSentenceErrors = ();
    my $numSentences = scalar(@$refSysOutput);
    foreach my $sentNum (0 .. $numSentences - 1)
    {
        my ($score, $errors) = $self->sentencePWER($refSysOutput->[$sentNum], $refTruth->[$sentNum], $index);
        $total += $score;
        push @perSentence, $score;
        push @perSentenceErrors, $errors;
    }
    return ($total, \@perSentence, \@perSentenceErrors);
}
|
|
| |
| |
# Position-independent word error count for one sentence pair.
# Word order is ignored: each system-output word (compared case-insensitively
# on the chosen factor) may use up one not-yet-used reference word with the
# same value; output words that find none are errors.
# arguments: array ref of system-output words, array ref of reference words
#   (each word an array ref of factors), index of the factor to compare
# return: (number of unmatched output words, array ref of their positions)
sub sentencePWER
{
    my ($self, $refSysOutput, $refTruth, $index) = @_;
    # how many unused copies of each lowercased reference word remain; a
    # counting hash gives the same pairing as scanning the reference for the
    # first unused equal word, without the nested loop
    my %unusedTruth = ();
    my $eLength = scalar(@$refTruth);
    for(my $k = 0; $k < $eLength; $k++)
    {
        $unusedTruth{lc $refTruth->[$k]->[$index]}++;
    }

    my @indices = ();
    my $sLength = scalar(@$refSysOutput);
    for(my $j = 0; $j < $sLength; $j++)
    {
        my $word = lc $refSysOutput->[$j]->[$index];
        if($unusedTruth{$word})
        {
            $unusedTruth{$word}--;
        }
        else
        {
            push @indices, $j;
        }
    }
    return (scalar(@indices), \@indices);
}
|
|
| |
| |
| |
# Collect the n-gram statistics (orders 1 to 4) for one sentence pair on a
# single factor.
# Reference n-grams are counted first; each system-output n-gram found with a
# remaining count uses up one occurrence and is counted as correct.
# The possible-n-gram totals are derived from the output length and are never
# reported as less than 1.
# arguments: array ref of reference words, array ref of system-output words
#   (each word an array ref of factors), factor index, debug flag (unused here)
# return: (correct1, total1, correct2, total2, correct3, total3, correct4,
#   total4, system-output length, reference length)
sub sentenceBLEU
{
    my ($self, $refTruth, $refSysOutput, $factorIndex, $debug) = @_;
    my $maxOrder = 4;
    my $refLength = scalar(@$refTruth);
    my $transLength = scalar(@$refSysOutput);

    # count every reference n-gram ending at each position, shortest first
    my %refGramCount = ();
    foreach my $pos (0 .. $refLength - 1)
    {
        my $ngram = $refTruth->[$pos]->[$factorIndex];
        $refGramCount{$ngram}++;
        foreach my $back (1 .. $maxOrder - 1)
        {
            last if $pos < $back;
            $ngram = $refTruth->[$pos - $back]->[$factorIndex] . " " . $ngram;
            $refGramCount{$ngram}++;
        }
    }

    # match output n-grams in the same position-then-order sequence
    my @correct = (0) x $maxOrder;
    foreach my $pos (0 .. $transLength - 1)
    {
        my $ngram = $refSysOutput->[$pos]->[$factorIndex];
        foreach my $back (0 .. $maxOrder - 1)
        {
            last if $pos < $back;
            if($back > 0)
            {
                $ngram = $refSysOutput->[$pos - $back]->[$factorIndex] . " " . $ngram;
            }
            if(defined($refGramCount{$ngram}) && $refGramCount{$ngram} > 0)
            {
                $refGramCount{$ngram}--;
                $correct[$back]++;
            }
        }
    }

    # interleave correct/total per order; totals are floored at 1
    my @stats = ();
    foreach my $order (1 .. $maxOrder)
    {
        my $possible = $transLength - ($order - 1);
        push @stats, $correct[$order - 1], ($possible > 1 ? $possible : 1);
    }
    return (@stats, $transLength, $refLength);
}
|
|
| |
|
|
| |
| |
| |
# Open one file per extension for reading; the filename is the corpus name
# with the extension appended directly (no separator is added).
# Files that cannot be opened produce a warning and are left out of the result.
# arguments: list of filename extensions
# return: hash (as a list) mapping each successfully opened extension to its
#   own filehandle
sub openFiles
{
    my ($self, @extensions) = @_;
    my %openedFiles = ();
    foreach my $ext (@extensions)
    {
        my $filename = $self->{'corpusName'} . $ext;
        # a fresh lexical handle per file: reusing one bareword handle would
        # make every entry point at the same handle, and each open would
        # close the file opened before it
        if(open(my $fh, '<', $filename))
        {
            $openedFiles{$ext} = $fh;
        }
        else
        {
            warn "Corpus::openFiles(): couldn't open '" . $filename . "' for read\n";
        }
    }
    return %openedFiles;
}
|
|
| |
| |
| |
# Read the next line from each of the given filehandles and split it into
# words and factors.
# arguments: hash (as a list) mapping a type name to an open filehandle
# return: hash (as a list) mapping each type name to an array ref of words,
#   each word an array ref of its '|'-separated factors; a handle that is at
#   end of file yields an empty array ref
sub readLineFromFiles
{
    my ($self, %openedFiles) = @_;
    my %lines;
    foreach my $type (keys %openedFiles)
    {
        $lines{$type} = [];
        # readline() must be spelled out: <$openedFiles{$type}> is parsed as a
        # filename glob because the handle is a hash element, not a plain scalar
        my $sentence = readline($openedFiles{$type});
        # end of file: leave the empty word list in place
        next if(!defined($sentence));
        # split(' ') skips leading whitespace, so no empty first word is produced
        foreach my $word (split(' ', $sentence))
        {
            my @factors = split(/\|/, $word);
            push @{$lines{$type}}, \@factors;
        }
    }
    return %lines;
}
|
|
| |
| |
| |
# Close every filehandle in the given hash.
# arguments: hash (as a list) mapping a type name to an open filehandle
# return: nothing meaningful
sub closeFiles
{
    my $self = shift;
    my %openedFiles = @_;
    foreach my $handle (values %openedFiles)
    {
        close($handle);
    }
}
|
|
| |
|
|
| |
| |
| |
# Write the HTML skeleton for comparing the versions of one sentence: a
# <script> block defining reorder_<sentID>() (its body is commented out in the
# emitted JavaScript), then a <div> holding a table with one row per sentence
# type. Each row shows the file description for corpusName . type when one is
# defined, otherwise the type itself, followed by an empty cell.
# The sentence words themselves are not printed by this sub.
# arguments: filehandle to print to, sentence ID, hash ref keyed by sentence type
# return: nothing meaningful
sub printSingleSentenceComparison
{
    my ($self, $fh, $sentID, $sentences) = @_;
    # print through the default handle, as before, and restore it at the end
    my $curFH = select;
    select $fh;

    print "<script type=\"text/javascript\">
function reorder_$sentID()
{/*
var table = document.getElementById('div_$sentID').firstChild;
var refTransRow = table.getElementById('row_e');
var inputRow = table.getElementById('row_f');
table.removeRow(refTransRow);
table.removeRow(inputRow);
var newRow1 = table.insertRow(0);
var newRow2 = table.insertRow(1);
newRow1.childNodes = inputRow.childNodes;
newRow2.childNodes = refTransRow.childNodes;*/
}
</script>";

    print "<div id=\"div_$sentID\" style=\"padding: 3px; margin: 5px\">";
    print "<table border=\"1\">";

    # rows come out in hash order; the unused per-row background colour that
    # read undeclared package variables has been dropped
    foreach my $sentType (keys %$sentences)
    {
        my $description = $self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType};
        my $label = defined($description) ? $description : $sentType;
        print "<tr id=\"row_$sentType\"><td align=right>";
        print "(" . $label . ")";
        print "</td><td align=left>";
        print "</td></tr>";
    }
    print "</table>";
    print "</div>\n";
    select $curFH;
}
|
|
| |
| |
| |
# Dump the object's top-level fields to STDERR, one "obj: key => value" line each.
# Hash fields are shown as {k => v, ...}, array fields as (a, b, ...), and
# plain scalars as-is; fields holding any other kind of reference are skipped.
# Nested structures are not expanded.
# arguments: none
# return: nothing meaningful
sub printDetails
{
    my $self = shift;
    foreach my $key (keys %$self)
    {
        my $value = $self->{$key};
        my $kind = ref($value);
        if($kind eq 'HASH')
        {
            my @pairs = ();
            foreach my $subkey (keys %$value)
            {
                push @pairs, "$subkey => " . $value->{$subkey};
            }
            print STDERR "obj: $key => {" . join(', ', @pairs) . "}\n";
        }
        elsif($kind eq 'ARRAY')
        {
            print STDERR "obj: $key => (" . join(', ', @$value) . ")\n";
        }
        elsif($kind eq '')
        {
            print STDERR "obj: $key => " . $value . "\n";
        }
    }
}
|
|