|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use Getopt::Long "GetOptions"; |
|
|
|
# Longest n-gram (in words) considered when collecting n-gram statistics.
my $MAX_LENGTH = 4;

# Command-line settings; each one is filled in by GetOptions below.
my ($system,$system_alignment,$segmentation,$reference,$dir,$input,$corpus,$ttable,@FACTORED_TTABLE,$score_options,$hierarchical,$output_corpus,$alignment,$biconcor,$input_factors,$input_factor_names,$output_factor_names,$precision_by_coverage,$precision_by_coverage_factor,$coverage_dir,$search_graph);

# Option specification: command-line switch => variable receiving its value.
my @OPTION_SPEC = (
    'system=s'                       => \$system,
    'system-alignment=s'             => \$system_alignment,
    'reference=s'                    => \$reference,
    'dir=s'                          => \$dir,
    'input-factors=i'                => \$input_factors,
    'input-factor-names=s'           => \$input_factor_names,
    'output-factor-names=s'          => \$output_factor_names,
    'precision-by-coverage'          => \$precision_by_coverage,
    'precision-by-coverage-factor=i' => \$precision_by_coverage_factor,
    'input=s'                        => \$input,
    'segmentation=s'                 => \$segmentation,
    'input-corpus=s'                 => \$corpus,
    'ttable=s'                       => \$ttable,
    'factored-ttable=s'              => \@FACTORED_TTABLE,
    'score-options=s'                => \$score_options,
    'output-corpus=s'                => \$output_corpus,
    'alignment-file=s'               => \$alignment,
    'coverage=s'                     => \$coverage_dir,
    'biconcor=s'                     => \$biconcor,
    'search-graph=s'                 => \$search_graph,
    'hierarchical'                   => \$hierarchical,
);

# Abort with the usage message when the command line cannot be parsed or
# the mandatory output directory (-dir) is missing.
my $options_ok = GetOptions(@OPTION_SPEC);
unless ($options_ok && defined($dir)) {
    die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-score-options SETTINGS] [-segmentation FILE] [-output-corpus FILE] [-alignment-file FILE] [-biconcor BIN]");
}
|
|
|
# Create the output directory (including missing parents). The list form of
# system() bypasses the shell, so a directory name containing spaces or shell
# metacharacters is passed to mkdir verbatim; a failure is reported right
# away instead of surfacing later as an unrelated "Cannot open".
system('mkdir', '-p', $dir) == 0
    or die("ERROR: could not create output directory $dir");

# Store the factor names (one line for input, one for output) in the output
# directory; written only when both name lists were given.
if (defined($input_factor_names) && defined($output_factor_names)) {
    open(my $factor_fh, '>', "$dir/factor-names")
        or die "Cannot open $dir/factor-names: $!";
    print $factor_fh $input_factor_names."\n";
    print $factor_fh $output_factor_names."\n";
    # buffered write errors only show up when the handle is closed
    close($factor_fh) or die "Cannot write $dir/factor-names: $!";
}
|
|
|
|
|
# System output (one string per sentence) and reference translations
# (one string per sentence, or an array ref of strings with multiple references).
my (@SYSTEM,@REFERENCE);

# n-gram counts, indexed as {length}{ngram}.
my (%PRECISION_CORRECT,%PRECISION_TOTAL,
    %RECALL_CORRECT,%RECALL_TOTAL);

# Reads a text file and returns its lines with the line terminator removed.
# chomp (unlike chop) leaves the last line intact when the file does not end
# in a newline, and reading through open() avoids handing the file name to
# a shell.
sub read_chomped_lines {
    my ($file) = @_;
    open(my $fh, '<', $file) or die("can't open file $file: $!");
    my @LINE = <$fh>;
    close($fh);
    chomp(@LINE);
    return @LINE;
}

if (defined($system) || defined($reference)) {
    die("you need to specify both system and reference, not just either")
        unless defined($system) && defined($reference);

    die("can't open system file $system") if ! -e $system;
    @SYSTEM = read_chomped_lines($system);

    # multiple references are stored as FILE.ref0, FILE.ref1, ...
    if (! -e $reference && -e $reference.".ref0") {
        for(my $i=0;-e $reference.".ref".$i;$i++) {
            my @REF = read_chomped_lines($reference.".ref".$i);
            for(my $j=0;$j<scalar(@REF);$j++) {
                push @{$REFERENCE[$j]}, $REF[$j];
            }
        }
    }
    else {
        die("can't open reference file $reference") if ! -e $reference;
        @REFERENCE = read_chomped_lines($reference);
    }

    # collect matches in both directions: system vs. reference (precision)
    # and reference vs. system (recall)
    for(my $i=0;$i<scalar @SYSTEM;$i++) {
        &add_match($SYSTEM[$i],$REFERENCE[$i],
                   \%PRECISION_CORRECT,\%PRECISION_TOTAL);
        &add_match($REFERENCE[$i],$SYSTEM[$i],
                   \%RECALL_CORRECT,\%RECALL_TOTAL);
    }

    # SUMMARY stays a bareword handle because best_matches prints to it
    open(SUMMARY, '>', "$dir/summary") or die "Cannot open: $!";
    &best_matches(\%PRECISION_CORRECT,\%PRECISION_TOTAL,"$dir/n-gram-precision");
    &best_matches(\%RECALL_CORRECT,\%RECALL_TOTAL,"$dir/n-gram-recall");
    &bleu_annotation();
    close(SUMMARY);
}
|
|
|
|
|
# Segmentation analysis: pick the hierarchical variant when -hierarchical
# was given, the plain variant otherwise.
if (defined($segmentation)) {
    my $analyse = defined($hierarchical)
        ? \&hierarchical_segmentation
        : \&segmentation;
    $analyse->();
}
|
|
|
|
|
# Coverage bookkeeping, indexed as {phrase length}{phrase}.
my (%INPUT_PHRASE,%CORPUS_COVERED,%TTABLE_COVERED,%TTABLE_ENTROPY);

# Coverage is computed here unless a directory with existing coverage
# reports (-coverage) was supplied.
if (!defined($coverage_dir) && (defined($ttable) || defined($corpus))) {
    if (!defined($input)) {
        die("ERROR: when specifying either ttable or input-corpus, please also specify input\n");
    }
    $MAX_LENGTH = 7;
    &input_phrases();
    &ttable_coverage("0",$ttable) if defined($ttable);
    &corpus_coverage() if defined($corpus);
    &input_annotation();

    # corpus coverage for the non-surface input factors; skipped without a
    # corpus, in line with the guarded corpus_coverage call above
    if (defined($input_factors) && defined($corpus)) {
        for(my $factor=1;$factor<$input_factors;$factor++) {
            &input_phrases($factor);
            &corpus_coverage($factor);
        }
    }

    # ttable coverage for additional tables given as FACTOR[,FACTOR...]:FILE
    foreach my $factored_ttable (@FACTORED_TTABLE) {
        # report the argument that failed to parse
        die("factored ttable must be specified as factor:file -- $factored_ttable")
            unless $factored_ttable =~ /^([\d,]+)\:(.+)/;
        my ($factor,$file) = ($1,$2);
        next if defined($ttable) && $file eq $ttable;  # already handled above
        &input_phrases($factor);
        &ttable_coverage($factor,$file);
    }
}
|
|
|
# Precision broken down by coverage, once per coverage type.
if (defined($precision_by_coverage)) {
    foreach my $coverage_type ("ttable", "corpus") {
        &precision_by_coverage($coverage_type);
    }
}

# Run the external biconcor binary; needs corpus, output corpus,
# alignment file and the binary itself.
my $can_run_biconcor = defined($corpus) && defined($output_corpus)
                    && defined($alignment) && defined($biconcor);
if ($can_run_biconcor) {
    `$biconcor -s $dir/biconcor -c $corpus -t $output_corpus -a $alignment`;
}

# Search graph processing, only when a search graph file was given.
&process_search_graph($search_graph) if defined($search_graph);
|
|
|
# Writes per-n-gram match counts to "$out.LENGTH" (one file per n-gram
# length, lines of "total<TAB>correct<TAB>ngram") and appends the per-length
# totals to the SUMMARY handle, which the caller must have opened.
#   $CORRECT, $TOTAL - hash refs indexed as {length}{ngram}
#   $out             - output file stem; a stem containing "precision"
#                      labels the summary lines "precision", anything else
#                      "recall"
sub best_matches {
    my ($CORRECT,$TOTAL,$out) = @_;

    my $type = "recall";
    $type = "precision" if $out =~ /precision/;

    foreach my $length (1 .. $MAX_LENGTH) {
        my $total = 0;
        my $correct = 0;

        open(OUT,">$out.$length") or die "Cannot open: $!";
        foreach my $ngram (keys %{$$TOTAL{$length}}) {
            my $ngram_total   = $TOTAL->{$length}{$ngram};
            my $ngram_correct = $CORRECT->{$length}{$ngram};
            printf OUT "%d\t%d\t%s\n", $ngram_total, $ngram_correct, $ngram;
            $total   += $ngram_total;
            $correct += $ngram_correct;
        }
        close(OUT);

        print SUMMARY "$type-$length-total: $total\n";
        print SUMMARY "$type-$length-correct: $correct\n";
    }
}
|
|
|
|
|
# Rebuilds %INPUT_PHRASE with the n-gram counts of the input file ($input),
# reduced to the given factor (undef or "0" = surface form).
#   $factor - factor specification handed on to get_factor_phrase
sub input_phrases {
    my ($factor) = (@_);
    %INPUT_PHRASE = ();

    open(my $input_fh, '<', $input) or die "Can't read input $input: $!";
    while(my $line = <$input_fh>) {
        # chomp, not chop: a final line without newline must keep its last
        # character
        chomp($line);
        $line = &get_factor_phrase($factor,$line);
        &extract_n_grams($line,\%INPUT_PHRASE);
    }
    close($input_fh);
}
|
|
|
|
|
# Reduces a line of factored text ("word|factor1|factor2 ...") to the
# requested factor(s) and returns it with normalised whitespace.
#   $factor - undef or "0": keep only the surface form (everything from the
#             first "|" of each token on is dropped); otherwise a
#             comma-separated list of factor indices, resolved per word by
#             get_factor_word
#   $line   - the text; line terminators are removed here
sub get_factor_phrase {
    my ($factor,$line) = @_;

    # normalise whitespace: every run becomes a single space (the /g matters:
    # without it only the first run is collapsed and later tabs or double
    # spaces survive as bogus or empty tokens)
    $line =~ s/[\r\n]+//g;
    $line =~ s/\s+/ /g;
    $line =~ s/^ //;
    $line =~ s/ $//;

    # surface form only
    if (!defined($factor) || $factor eq "0") {
        $line =~ s/\|\S+//g;
        return $line;
    }

    # select the requested factors word by word
    return join(" ", map { &get_factor_word($factor,$_) } split(/ /,$line));
}
|
|
|
|
|
# Picks the requested factors out of one factored word.
#   $factor - comma-separated list of factor indices, e.g. "0,2"
#   $word   - factors separated by "|", e.g. "das|ART|the"
# Returns the selected factors joined by "|", in the order requested.
sub get_factor_word {
    my ($factor,$word) = @_;

    my @FIELD = split(/\|/,$word);
    my @WANTED = split(/,/,$factor);
    return join("|", @FIELD[@WANTED]);
}
|
|
|
# File name extension for per-factor reports: empty for the surface form
# (factor undefined or "0"), otherwise "." followed by the factor.
sub factor_ext {
    my ($factor) = @_;
    my $is_surface = !defined($factor) || $factor eq "0";
    return $is_surface ? "" : ".".$factor;
}
|
|
|
# Writes $dir/bleu-annotation: one line per sentence of @SYSTEM with a
# smoothed sentence-level score, the sentence index, every output word
# annotated as "word|N" (N = length of the last matching n-gram covering it,
# 0 if none) and the reference(s). Lines are piped through "sort -r".
sub bleu_annotation {
    open(OUT,"| sort -r >$dir/bleu-annotation") or die "Cannot open: $!";

    for(my $sentence=0;$sentence<scalar @SYSTEM;$sentence++) {
        my $system = $SYSTEM[$sentence];
        $system =~ s/\s+/ /g;
        $system =~ s/^ //;
        $system =~ s/ $//;

        my (%SYS_NGRAM,%REF_NGRAM);
        &extract_n_grams( $system, \%SYS_NGRAM );   # counts are not read below
        &extract_n_grams_arrayopt( $REFERENCE[$sentence], \%REF_NGRAM, "max" );

        my @WORD = split(/ /,$system);
        my $word_count = scalar(@WORD);
        my @MATCH = (0) x $word_count;

        # n-gram precision with add-one smoothing: each reference n-gram can
        # be matched as often as it occurs in the reference
        my $bleu = 1;
        for(my $length=1;$length<=$MAX_LENGTH && $length<=$word_count;$length++) {
            my $ngram_correct = 1;
            foreach my $first (0 .. $word_count-$length) {
                my $last = $first+$length-1;
                my $ngram = join(" ", @WORD[$first .. $last]);
                $REF_NGRAM{$length}{$ngram}--;
                next unless $REF_NGRAM{$length}{$ngram} >= 0;
                $ngram_correct++;
                @MATCH[$first .. $last] = ($length) x $length;
            }
            $bleu *= ($ngram_correct/($word_count-$length+2));
        }
        $bleu = $bleu ** (1/4);

        # reference length: the shortest one when there are several
        my $ref_length = 9999;
        my $has_multiple_refs = ref($REFERENCE[$sentence]) eq 'ARRAY';
        if ($has_multiple_refs) {
            foreach my $ref (@{$REFERENCE[$sentence]}) {
                my @RW = split(/ /,$ref);
                $ref_length = scalar(@RW) if scalar(@RW) < $ref_length;
            }
        }
        else {
            my @RW = split(/ /,$REFERENCE[$sentence]);
            $ref_length = scalar(@RW);
        }

        # brevity penalty for output shorter than the reference
        if ($word_count < $ref_length && $word_count>0) {
            $bleu *= exp(1-$ref_length/$word_count);
        }

        printf OUT "%5.4f\t%d\t",$bleu,$sentence;
        foreach my $position (0 .. $#WORD) {
            print OUT " " if $position;
            print OUT "$WORD[$position]|$MATCH[$position]";
        }
        if ($has_multiple_refs) {
            foreach my $ref (@{$REFERENCE[$sentence]}) {
                print OUT "\t".$ref;
            }
        }
        else {
            print OUT "\t".$REFERENCE[$sentence]
        }
        print OUT "\n";
    }
    close(OUT);
}
|
|
|
# Adds the n-gram matches of one sentence pair to the running totals.
#   $system, $reference - sentence strings, or array refs of alternative
#                         strings (merged via extract_n_grams_arrayopt with
#                         "min" for $system and "max" for $reference)
#   $CORRECT, $TOTAL    - hash refs {length}{ngram}; $TOTAL receives the
#                         count in $system, $CORRECT that count clipped to
#                         the count in $reference
sub add_match {
    my ($system,$reference,$CORRECT,$TOTAL) = @_;

    my (%SYS_NGRAM,%REF_NGRAM);
    &extract_n_grams_arrayopt( $system, \%SYS_NGRAM, "min" );
    &extract_n_grams_arrayopt( $reference, \%REF_NGRAM, "max" );

    foreach my $length (keys %SYS_NGRAM) {
        foreach my $ngram (keys %{$SYS_NGRAM{$length}}) {
            my $sys_count = $SYS_NGRAM{$length}{$ngram};
            my $ref_count = defined($REF_NGRAM{$length}{$ngram})
                ? $REF_NGRAM{$length}{$ngram}
                : 0;

            # a match cannot be counted more often than the reference has it
            my $match_count = $sys_count;
            $match_count = $ref_count if $sys_count > $ref_count;

            $CORRECT->{$length}{$ngram} += $match_count;
            $TOTAL->{$length}{$ngram} += $sys_count;
        }
    }
}
|
|
|
# Scans a translation table and records, for every phrase in %INPUT_PHRASE,
# how many table entries it has (%TTABLE_COVERED) and the entropy of their
# scores (%TTABLE_ENTROPY). Writes $dir/ttable-coverage-by-phrase[.FACTOR]
# ("phrase<TAB>entries<TAB>entropy") and then the additional coverage reports.
# Entries of one source phrase are expected on consecutive lines.
#   $factor - factor specification ("0" = surface form)
#   $ttable - table file; FILE.gz is used when FILE does not exist, and
#             names ending in ".gz" are decompressed on the fly
sub ttable_coverage {
    my ($factor,$ttable) = @_;

    # open the table; gzip is started without a shell (list form), so the
    # file name is never interpreted as shell syntax
    my $gz_file;
    if (! -e $ttable && -e $ttable.".gz") {
        $gz_file = $ttable.".gz";
    }
    elsif ($ttable =~ /\.gz$/) {   # literal ".gz" suffix only
        $gz_file = $ttable;
    }
    my $ttable_fh;
    if (defined($gz_file)) {
        open($ttable_fh, '-|', 'gzip', '-cd', $gz_file) or die "Cannot open: $!";
    }
    else {
        open($ttable_fh, '<', $ttable) or die "Can't read ttable $ttable: $!";
    }

    my $report_file = "$dir/ttable-coverage-by-phrase".&factor_ext($factor);
    open(my $report_fh, '>', $report_file) or die "Cannot open: $!";
    my ($last_in,$last_size,$size) = ("",0);

    # index of the score used for the entropy, depending on the score options
    my $p_e_given_f_score = 2;
    if ($score_options) {
        if ($score_options =~ /OnlyDirect/) {
            $p_e_given_f_score = 0;
        }
        elsif ($score_options =~ /NoLex/) {
            $p_e_given_f_score = 1;
        }
    }

    my @DISTRIBUTION = ();

    # writes the report line for the phrase collected so far
    my $finish_phrase = sub {
        my $entropy = &compute_entropy(@DISTRIBUTION);
        printf $report_fh "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
        $TTABLE_ENTROPY{$last_size}{$last_in} = $entropy;
        @DISTRIBUTION = ();
    };

    while(my $entry = <$ttable_fh>) {
        chomp($entry);
        my @COLUMN = split(/ +\|\|\| +/,$entry);
        my ($in,$out,$scores) = @COLUMN;
        next unless defined($in);   # blank line

        # handling hierarchical
        $in =~ s/ \[[^ \]]+\]$//;                  # remove trailing label
        next if $in =~ /\[[^ \]]+\]\[[^ \]]+\]/;   # skip entries with two adjacent bracketed labels
        $in = &get_factor_phrase($factor,$in) if defined($factor) && $factor eq "0";
        $scores = $COLUMN[4] if defined($hierarchical);
        my @IN = split(/ /,$in);
        $size = scalar @IN;

        # exists() first, so that phrase lengths absent from the input do not
        # get an empty %INPUT_PHRASE entry created as a side effect
        next unless exists($INPUT_PHRASE{$size}) && defined($INPUT_PHRASE{$size}{$in});
        $TTABLE_COVERED{$size}{$in}++;
        my @SCORE = split(/ /,$scores);
        if ($in ne $last_in) {
            $finish_phrase->() if $last_in ne "";
            $last_in = $in;
            $last_size = $size;
        }
        push @DISTRIBUTION, $SCORE[$p_e_given_f_score];
    }

    # last phrase; nothing to report when no table entry matched the input
    $finish_phrase->() if $last_in ne "";

    close($report_fh) or die "Cannot write $report_file: $!";
    close($ttable_fh);

    &additional_coverage_reports($factor,"ttable",\%TTABLE_COVERED);
}
|
|
|
# Entropy (in bits) of the given list of non-negative weights. The weights
# are normalised by their sum first; zero weights contribute nothing.
# Returns 0 for an empty list.
sub compute_entropy {
    my @WEIGHT = @_;

    # normalisation constant
    my $z = 0;
    $z += $_ foreach @WEIGHT;

    my $log2 = log(2);
    my $entropy = 0;
    foreach my $p (@WEIGHT) {
        next if $p == 0;
        my $q = $p/$z;
        $entropy -= $q*log($q)/$log2;
    }
    return $entropy;
}
|
|
|
# Counts how often each phrase of %INPUT_PHRASE occurs in the training
# corpus ($corpus), stores the counts in %CORPUS_COVERED, writes
# $dir/corpus-coverage-by-phrase[.FACTOR] ("phrase<TAB>count") and then the
# additional coverage reports.
#   $factor - factor specification handed on to get_factor_phrase
sub corpus_coverage {
    my ($factor) = @_;
    %CORPUS_COVERED = ();

    # scan the corpus
    open(CORPUS,$corpus) or die "Can't read corpus $corpus";
    while(defined(my $raw_line = <CORPUS>)) {
        my @WORD = split(/ /, &get_factor_phrase($factor,$raw_line));

        foreach my $start (0 .. $#WORD) {
            # NOTE(review): phrases of exactly $MAX_LENGTH words are never
            # counted here although extract_n_grams collects them -- confirm
            # that this bound is intended.
            my $longest = scalar(@WORD) - $start;
            $longest = $MAX_LENGTH-1 if $longest > $MAX_LENGTH-1;

            my @PHRASE_WORD;
            foreach my $length (1 .. $longest) {
                push @PHRASE_WORD, $WORD[$start+$length-1];
                my $phrase = join(" ", @PHRASE_WORD);
                # a longer phrase cannot be in the input if its prefix is not
                last if !defined($INPUT_PHRASE{$length}{$phrase});
                $CORPUS_COVERED{$length}{$phrase}++;
            }
        }
    }
    close(CORPUS);

    # report the covered phrases, shortest first
    open(REPORT,">$dir/corpus-coverage-by-phrase".&factor_ext($factor)) or die "Cannot open: $!";
    foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) {
        foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) {
            my $count = $CORPUS_COVERED{$size}{$phrase};
            printf REPORT "%s\t%d\n", $phrase, $count if defined $count;
        }
    }
    close(REPORT);

    &additional_coverage_reports($factor,"corpus",\%CORPUS_COVERED);
}
|
|
|
# Writes two reports derived from a coverage table:
#   $dir/NAME-unknown[.FACTOR]           single input words without coverage,
#                                        as "word<TAB>input count"
#   $dir/NAME-coverage-summary[.FACTOR]  per phrase length and coverage
#                                        count: number of distinct phrases
#                                        and their summed input counts
#   $factor  - factor specification (selects the file name extension)
#   $name    - report name prefix ("ttable" or "corpus" at the call sites)
#   $COVERED - hash ref {length}{phrase} => coverage count
sub additional_coverage_reports {
    my ($factor,$name,$COVERED) = @_;
    my $ext = &factor_ext($factor);

    # unknown words
    open(REPORT,">$dir/$name-unknown".$ext) or die "Cannot open: $!";
    my @UNKNOWN = grep { !defined($COVERED->{1}{$_}) } keys %{$INPUT_PHRASE{1}};
    foreach my $word (@UNKNOWN) {
        printf REPORT "%s\t%d\n",$word,$INPUT_PHRASE{1}{$word};
    }
    close(REPORT);

    # summary by phrase length and coverage count
    open(REPORT,">$dir/$name-coverage-summary".$ext) or die "Cannot open: $!";
    foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) {
        my (%TYPES_WITH,%TOKENS_WITH);
        foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) {
            my $covered = defined($COVERED->{$size}{$phrase})
                ? $COVERED->{$size}{$phrase}
                : 0;
            $TYPES_WITH{$covered}++;
            $TOKENS_WITH{$covered} += $INPUT_PHRASE{$size}{$phrase};
        }
        foreach my $count (sort {$a <=> $b} keys %TYPES_WITH) {
            printf REPORT "%d\t%d\t%d\t%d\n",$size,$count,$TYPES_WITH{$count},$TOKENS_WITH{$count};
        }
    }
    close(REPORT);
}
|
|
|
# Writes $dir/input-annotation: per input sentence the cleaned sentence
# (factors and <...> markup removed), a tab, and for every phrase found in
# the translation table an entry
#   "START-END:CORPUS_COUNT:TTABLE_COUNT:TTABLE_ENTROPY ".
# Phrases without translation table coverage get no entry.
sub input_annotation {
    open(my $out_fh, '>', "$dir/input-annotation") or die "Cannot open: $!";
    open(my $input_fh, '<', $input) or die "Can't read input $input: $!";
    while(my $sentence = <$input_fh>) {
        # chomp, not chop: a final line without newline must keep its last
        # character
        chomp($sentence);
        $sentence =~ s/\|\S+//g;       # remove additional factors
        $sentence =~ s/<\S[^>]*>//g;   # remove xml markup
        $sentence =~ s/\s+/ /g;
        $sentence =~ s/^ //;
        $sentence =~ s/ $//;
        print $out_fh $sentence."\t";

        my @WORD = split(' ',$sentence);
        my $sentence_length = scalar @WORD;
        for(my $start=0;$start < $sentence_length;$start++) {
            my $phrase = "";
            # NOTE(review): the bound excludes phrases of exactly $MAX_LENGTH
            # words although extract_n_grams collects them -- confirm that
            # this is intended.
            for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) {
                $phrase .= " " if $length > 1;
                $phrase .= $WORD[$start+$length-1];

                # only phrases known to the translation table are annotated
                my $ttable_covered = $TTABLE_COVERED{$length}{$phrase};
                next unless defined($ttable_covered);

                my $corpus_covered = $CORPUS_COVERED{$length}{$phrase};
                $corpus_covered = 0 unless defined($corpus_covered);
                my $ttable_entropy = $TTABLE_ENTROPY{$length}{$phrase} || 0;

                printf $out_fh "%d-%d:%d:%d:%.5f ",$start,$start+$length-1,$corpus_covered,$ttable_covered,$ttable_entropy;
            }
        }
        print $out_fh "\n";
    }
    close($input_fh);
    close($out_fh) or die "Cannot write $dir/input-annotation: $!";
}
|
|
|
# Like extract_n_grams, but $sentence may also be an array ref of
# alternative sentences. Their n-gram counts are merged before being added
# to $NGRAM:
#   $minmax eq "min" - keep only n-grams present in every alternative, with
#                      the smallest count
#   anything else    - keep every n-gram, with the largest count
sub extract_n_grams_arrayopt {
    my ($sentence,$NGRAM,$minmax) = @_;

    # a plain string needs no merging
    return &extract_n_grams($sentence,$NGRAM) unless ref($sentence) eq 'ARRAY';

    my @ALTERNATIVE = @{$sentence};
    my %MERGED;
    &extract_n_grams($ALTERNATIVE[0],\%MERGED);

    foreach my $index (1 .. $#ALTERNATIVE) {
        my %CURRENT;
        &extract_n_grams($ALTERNATIVE[$index],\%CURRENT);

        foreach my $length (1 .. $MAX_LENGTH) {
            if ($minmax eq "min") {
                foreach my $ngram (keys %{$MERGED{$length}}) {
                    my $count = $CURRENT{$length}{$ngram};
                    if (!defined($count)) {
                        delete( $MERGED{$length}{$ngram} );
                    }
                    elsif ($MERGED{$length}{$ngram} > $count) {
                        $MERGED{$length}{$ngram} = $count;
                    }
                }
            }
            else {
                foreach my $ngram (keys %{$CURRENT{$length}}) {
                    my $count = $CURRENT{$length}{$ngram};
                    $MERGED{$length}{$ngram} = $count
                        if !defined($MERGED{$length}{$ngram})
                        || $count > $MERGED{$length}{$ngram};
                }
            }
        }
    }

    # add the merged counts to the caller's table
    foreach my $length (1 .. $MAX_LENGTH) {
        foreach my $ngram (keys %{$MERGED{$length}}) {
            $NGRAM->{$length}{$ngram} += $MERGED{$length}{$ngram};
        }
    }
}
|
|
|
# Counts all n-grams of 1 to $MAX_LENGTH words in $sentence and adds the
# counts to $NGRAM (hash ref {length}{ngram}). Whitespace in the sentence is
# normalised first; words of an n-gram are joined by a single space.
sub extract_n_grams {
    my ($sentence,$NGRAM) = @_;

    for ($sentence) {
        s/[\r\n]+//g;
        s/\s+/ /g;
        s/^ //;
        s/ $//;
    }

    my @WORD = split(/ /,$sentence);
    foreach my $length (1 .. $MAX_LENGTH) {
        foreach my $first (0 .. scalar(@WORD)-$length) {
            my $ngram = join(" ", @WORD[$first .. $first+$length-1]);
            $NGRAM->{$length}{$ngram}++;
        }
    }
}
|
|
|
# Relates the unigram precision of the system output to how well each input
# word is covered. Reads the coverage report of the given type
# (from $coverage_dir if set, else $dir), the segmentation file, the input
# and the system word alignment in parallel, and writes
#   $dir/precision-by-TYPE-coverage           totals per coverage count
#   $dir/precision-by-input-word              totals per input word
#   $dir/precision-by-TYPE-coverage.FACTOR    totals per factor and coverage
#                                             (only with a coverage factor)
# All precision/deleted/length columns are sums over the input word tokens.
#   $coverage_type - "ttable" or "corpus" at the call sites
sub precision_by_coverage {
    my ($coverage_type) = @_;
    my (%PREC_BY_WORD,%TOTAL_BY_WORD,%LENGTH_BY_WORD,%DELETED_BY_WORD);
    my (%PREC_BY_COVERAGE,%TOTAL_BY_COVERAGE,%LENGTH_BY_COVERAGE,%DELETED_BY_COVERAGE);
    my (%PREC_BY_FACTOR,%TOTAL_BY_FACTOR,%LENGTH_BY_FACTOR,%DELETED_BY_FACTOR);
    my (%PREC_BY_FACTOR_COVERAGE,%TOTAL_BY_FACTOR_COVERAGE,%LENGTH_BY_FACTOR_COVERAGE,%DELETED_BY_FACTOR_COVERAGE);

    # get coverage statistics
    my %COVERAGE;
    my $coverage_file = (defined($coverage_dir)?$coverage_dir:$dir)
        ."/$coverage_type-coverage-by-phrase";
    print STDERR $coverage_file."\n";
    open(my $coverage_fh, '<', $coverage_file) or die "Cannot open $coverage_file: $!";
    while(my $entry = <$coverage_fh>) {
        chomp($entry);
        my ($phrase,$count) = split(/\t/,$entry);
        next unless defined($phrase);   # blank line
        $COVERAGE{$phrase} = $count;
    }
    close($coverage_fh);

    # go through each line...
    open(my $segmentation_fh, '<', $segmentation)
        or die("ERROR: could not open segmentation file $segmentation");
    open(my $input_fh, '<', $input) or die "Can't read input $input";
    open(my $alignment_fh, '<', $system_alignment)
        or die "Can't read output alignment file $system_alignment";

    my $line_count = 0;
    while(my $line = <$segmentation_fh>) {
        chomp($line);

        # get input
        my $input_line = <$input_fh>;
        my @INPUT = split(/ /,&get_factor_phrase(0,$input_line));
        my @FACTOR = split(/ /,&get_factor_phrase($precision_by_coverage_factor,$input_line));

        # get alignment: input position => list of output positions
        my $alignment_line = <$alignment_fh>;
        $alignment_line = "" unless defined($alignment_line);
        chomp($alignment_line);   # keeps the newline out of the last position
        my %ALIGNED;
        foreach my $point (split(/ /,$alignment_line)) {
            my ($input_pos,$output_pos) = split(/\-/,$point);
            push @{$ALIGNED{$input_pos}}, $output_pos;
        }

        # output words
        my @OUTPUT = split(/ /,$SYSTEM[$line_count]);

        # unigram precision of every output word against the reference(s)
        my (%SYS_NGRAM,%REF_NGRAM,%PREC_NGRAM);
        &extract_n_grams( $SYSTEM[$line_count], \%SYS_NGRAM );
        &extract_n_grams_arrayopt( $REFERENCE[$line_count], \%REF_NGRAM, "max" );
        $line_count++;
        foreach my $ngram (keys %{$SYS_NGRAM{1}}) {
            $PREC_NGRAM{1}{$ngram} = 0;
            if (defined($REF_NGRAM{1}) &&
                defined($REF_NGRAM{1}{$ngram})) {
                my $ref_count = $REF_NGRAM{1}{$ngram};
                my $sys_count = $SYS_NGRAM{1}{$ngram};
                $PREC_NGRAM{1}{$ngram} =
                    ($ref_count >= $sys_count) ? 1 : $ref_count/$sys_count;
            }
        }

        # process one phrase at a time: "output words |FROM-TO| ..."
        my $output_pos = 0;
        while($line =~ /([^|]+) \|(\d+)\-(\d+)\|\s*(.*)$/) {
            my ($output,$from,$to) = ($1,$2,$3);
            $line = $4;

            # counted via an array; split in scalar context is avoided
            my @OUTPUT_WORD = split(/ /,$output);
            my $output_length = scalar(@OUTPUT_WORD);

            # bug fix: 1-1 unaligned mappings (due to unknown words)
            if ($from == $to &&
                $output_length == 1 &&
                !defined($ALIGNED{$from})) {
                push @{$ALIGNED{$from}},$output_pos;
            }
            $output_pos += $output_length;

            # compute precision of each input word
            for(my $i=$from; $i<=$to; $i++) {
                my $coverage = 0;
                $coverage = $COVERAGE{$INPUT[$i]} if defined($COVERAGE{$INPUT[$i]});

                my ($precision,$deleted,$length) = (0,0,0);

                # unaligned? note as deleted
                if (!defined($ALIGNED{$i})) {
                    $deleted = 1;
                }
                # aligned: average precision of the aligned output words
                else {
                    foreach my $o (@{$ALIGNED{$i}}) {
                        $precision += $PREC_NGRAM{1}{$OUTPUT[$o]};
                    }
                    $precision /= scalar(@{$ALIGNED{$i}});
                    $length = scalar(@{$ALIGNED{$i}});
                }

                my $word = $INPUT[$i];
                $word .= "\t".$FACTOR[$i] if $precision_by_coverage_factor;
                $DELETED_BY_WORD{$word} += $deleted;
                $PREC_BY_WORD{$word} += $precision;
                $LENGTH_BY_WORD{$word} += $length;
                $TOTAL_BY_WORD{$word}++;

                $DELETED_BY_COVERAGE{$coverage} += $deleted;
                $PREC_BY_COVERAGE{$coverage} += $precision;
                $LENGTH_BY_COVERAGE{$coverage} += $length;
                $TOTAL_BY_COVERAGE{$coverage}++;

                if ($precision_by_coverage_factor) {
                    $DELETED_BY_FACTOR{$FACTOR[$i]} += $deleted;
                    $DELETED_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $deleted;
                    $PREC_BY_FACTOR{$FACTOR[$i]} += $precision;
                    $PREC_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $precision;
                    $LENGTH_BY_FACTOR{$FACTOR[$i]} += $length;
                    $LENGTH_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $length;
                    $TOTAL_BY_FACTOR{$FACTOR[$i]}++;
                    $TOTAL_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage}++;
                }
            }
        }
    }
    # all three inputs are closed, not just the segmentation file
    close($segmentation_fh);
    close($input_fh);
    close($alignment_fh);

    my $by_coverage_file = "$dir/precision-by-$coverage_type-coverage";
    open(my $report_fh, '>', $by_coverage_file) or die "Cannot open: $!";
    foreach my $coverage (sort {$a <=> $b} keys %TOTAL_BY_COVERAGE) {
        printf $report_fh "%d\t%.3f\t%d\t%d\t%d\n", $coverage, $PREC_BY_COVERAGE{$coverage}, $DELETED_BY_COVERAGE{$coverage}, $LENGTH_BY_COVERAGE{$coverage}, $TOTAL_BY_COVERAGE{$coverage};
    }
    close($report_fh) or die "Cannot write $by_coverage_file: $!";

    # NOTE(review): this file name does not depend on $coverage_type, so a
    # later call overwrites the file of an earlier one -- confirm intended.
    open($report_fh, '>', "$dir/precision-by-input-word") or die "Cannot open: $!";
    foreach my $word (keys %TOTAL_BY_WORD) {
        my ($w) = split(/\t/,$word);   # surface form, without the factor
        my $coverage = 0;
        $coverage = $COVERAGE{$w} if defined($COVERAGE{$w});
        printf $report_fh "%.3f\t%d\t%d\t%d\t%d\t%s\n", $PREC_BY_WORD{$word}, $DELETED_BY_WORD{$word}, $LENGTH_BY_WORD{$word}, $TOTAL_BY_WORD{$word},$coverage,$word;
    }
    close($report_fh) or die "Cannot write $dir/precision-by-input-word: $!";

    if ($precision_by_coverage_factor) {
        my $by_factor_file = "$dir/precision-by-$coverage_type-coverage.$precision_by_coverage_factor";
        open($report_fh, '>', $by_factor_file) or die "Cannot open: $!";
        foreach my $factor (sort keys %TOTAL_BY_FACTOR_COVERAGE) {
            foreach my $coverage (sort {$a <=> $b} keys %{$TOTAL_BY_FACTOR_COVERAGE{$factor}}) {
                printf $report_fh "%s\t%d\t%.3f\t%d\t%d\t%d\n", $factor, $coverage, $PREC_BY_FACTOR_COVERAGE{$factor}{$coverage}, $DELETED_BY_FACTOR_COVERAGE{$factor}{$coverage}, $LENGTH_BY_FACTOR_COVERAGE{$factor}{$coverage}, $TOTAL_BY_FACTOR_COVERAGE{$factor}{$coverage};
            }
        }
        close($report_fh) or die "Cannot write $by_factor_file: $!";
    }
}
|
|
|
# Processes the phrase segmentation file ($segmentation), whose lines have
# the form "output words |FROM-TO| more words |FROM-TO| ...".
# Writes $dir/segmentation-annotation (per sentence one
# "IN_FROM:IN_TO:OUT_FROM:OUT_TO" entry per phrase) and $dir/segmentation
# (lines "input length<TAB>output length<TAB>number of phrases").
sub segmentation {
    my %SEGMENTATION;

    open(my $segmentation_fh, '<', $segmentation)
        or die("ERROR: could not open segmentation file $segmentation");
    open(my $out_fh, '>', "$dir/segmentation-annotation") or die "Cannot open: $!";
    while(my $line = <$segmentation_fh>) {
        my $count = 0;   # output words in the current phrase
        my $out = -1;    # position of the last output word seen
        # split(' ') already discards the line terminator; no chop here, as
        # that would cut the closing "|" off the last marker of a file that
        # does not end in a newline
        foreach my $token (split(' ',$line)) {
            if ($token =~ /^\|(\d+)\-(\d+)\|$/) {
                my ($in_from,$in_to) = ($1,$2);
                my $out_from = $out-($count-1);
                print $out_fh " " unless $out_from == 0;
                printf $out_fh "%d:%d:%d:%d",$in_from,$in_to,$out_from,$out;
                my $in_count = $in_to-$in_from+1;
                $SEGMENTATION{$in_count}{$count}++;
                $count = 0;
            }
            else {
                $out++;
                $count++;
            }
        }
        print $out_fh "\n";
    }
    close($out_fh) or die "Cannot write $dir/segmentation-annotation: $!";
    close($segmentation_fh);

    open(my $summary_fh, '>', "$dir/segmentation") or die "Cannot open: $!";
    foreach my $in (sort { $a <=> $b } keys %SEGMENTATION) {
        foreach my $out (sort { $a <=> $b } keys %{$SEGMENTATION{$in}}) {
            printf $summary_fh "%d\t%d\t%d\n", $in, $out, $SEGMENTATION{$in}{$out};
        }
    }
    close($summary_fh) or die "Cannot write $dir/segmentation: $!";
}
|
|
|
|
|
|
|
# Processes the derivation trace "$segmentation.trace" of a hierarchical
# system: groups the trace lines by sentence, hands each derivation to
# hs_process (which writes to the INPUT_TREE, OUTPUT_TREE and NODE handles
# opened here) and writes the rule statistics to $dir/rule.
sub hierarchical_segmentation {
    my $last_sentence = -1;
    my @DERIVATION;
    my %STATS;

    my $trace_file = $segmentation.".trace";
    open(my $trace_fh, '<', $trace_file) or die "Cannot open $trace_file: $!";
    # bareword handles: hs_process prints to them by name
    open(INPUT_TREE, '>', "$dir/input-tree") or die "Cannot open: $!";
    open(OUTPUT_TREE, '>', "$dir/output-tree") or die "Cannot open: $!";
    open(NODE, '>', "$dir/node") or die "Cannot open: $!";

    while(my $line = <$trace_fh>) {
        my $sentence;
        my %ITEM;
        &hs_scan_line($line, \$sentence, \%ITEM) || die("cannot scan line $line");
        # a new sentence number ends the previous derivation
        if ($last_sentence >= 0 && $sentence != $last_sentence) {
            &hs_process($last_sentence,\@DERIVATION,\%STATS);
            @DERIVATION = ();
        }
        push @DERIVATION,\%ITEM;
        $last_sentence = $sentence;
    }
    # final derivation; an empty trace leaves nothing to process
    &hs_process($last_sentence,\@DERIVATION,\%STATS) if @DERIVATION;

    close($trace_fh);
    close(NODE);
    close(INPUT_TREE);
    close(OUTPUT_TREE);

    # statistics that were never touched are reported as 0
    my $glue_rules  = defined($STATS{'glue-rule'}) ? $STATS{'glue-rule'} : 0;
    my $total_depth = defined($STATS{'depth'}) ? $STATS{'depth'} : 0;

    open(my $summary_fh, '>', "$dir/rule") or die "Cannot open: $!";
    print $summary_fh "sentence-count\t".(++$last_sentence)."\n";
    print $summary_fh "glue-rule\t".$glue_rules."\n";
    print $summary_fh "depth\t".$total_depth."\n";
    foreach my $rule_type (keys %{$STATS{'rule-type'}}) {
        print $summary_fh "rule\t$rule_type\t".$STATS{'rule-type'}{$rule_type}."\n";
    }
    close($summary_fh) or die "Cannot write $dir/rule: $!";
}
|
|
|
|
|
# Parses one line of a hierarchical derivation trace. Two line formats are
# recognised: lines starting with "Trans Opt", and lines of the form
# "SENTENCE ||| [..] -> source rhs ||| [LHS] -> target rhs ||| alignment ||| spans".
#   $line         - the trace line
#   $ref_sentence - scalar ref; receives the sentence number
#   $ref_item     - hash ref; receives 'start', 'end', 'rule_lhs',
#                   'rule_rhs' (array), 'spans' (array of hashes with 'from',
#                   'to', 'word'), 'alignment' ({target pos} => source pos)
#                   and 'alignedSpan' ({source pos} => 1)
# Returns 1 on success, 0 if the line matches neither format; dies on a
# malformed alignment point or span.
sub hs_scan_line {
    my ($line,$ref_sentence,$ref_item) = @_;

    if ($line =~ /^Trans Opt/) {
        # "Trans Opt" format, three variants
        my @FIELD = ($line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/);
        @FIELD = ($line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): c=/) unless @FIELD;
        @FIELD = ($line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([\(\),\d\- ]*): term=.*: nonterm=.*: c=/) unless @FIELD;
        return 0 unless @FIELD;
        my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = @FIELD;

        ${$ref_sentence} = $sentence;

        $ref_item->{'start'} = $start;
        $ref_item->{'end'} = $end;
        $ref_item->{'rule_lhs'} = $rule_lhs;

        # NOTE(review): as written these two substitutions replace "<" with
        # "<" and ">" with ">", i.e. they change nothing; presumably an
        # HTML escape was intended -- confirm against the upstream source.
        $rule_rhs =~ s/</</g;
        $rule_rhs =~ s/>/>/g;
        @{$ref_item->{'rule_rhs'}} = split(/ /,$rule_rhs);

        foreach my $point (split(/ /,$alignment)) {
            $point =~ /(\d+)[\-,](\d+)/ || die("funny alignment: $point\n");
            $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
            $ref_item->{'alignedSpan'}{$1} = 1;
        }

        # spans are listed in reverse order in this format
        @{$ref_item->{'spans'}} = ();
        foreach my $span (reverse split(/\s+/,$spans)) {
            $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
            my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
            push @{$ref_item->{'spans'}}, \%SPAN;
        }
    } else {
        # "|||"-separated format
        $line =~ /^(\d+) \|\|\| \[\S+\] -> (.+) \|\|\| \[(\S+)\] -> (.+) \|\|\| (.*)\|\|\| (.*)/ || return 0;
        my ($sentence,$source_rhs,$target_lhs,$target_rhs,$alignment,$source_spans) = ($1,$2,$3,$4,$5,$6);

        ${$ref_sentence} = $sentence;

        @{$ref_item->{'spans'}} = ();
        foreach my $token (split(/ /,$source_rhs)) {
            # strip the brackets of a non-terminal; a token the pattern does
            # not match is kept as it is, instead of picking up a stale $1
            # from an earlier match
            my $word = ($token =~ /^\[?([^\]]+)\]?$/) ? $1 : $token;
            my %SPAN = ( 'word' => $word );
            push @{$ref_item->{'spans'}}, \%SPAN;
        }

        my $i = 0;
        foreach my $span (split(/ /,$source_spans)) {
            $span =~ /(\d+)\.\.(\d+)/ || die("funny span: $span\n");
            $ref_item->{'spans'}[$i]{'from'} = $1;
            $ref_item->{'spans'}[$i]{'to'} = $2;
            if ($i == 0) {
                $ref_item->{'start'} = $1;
            }
            $ref_item->{'end'} = $2;   # ends up as the 'to' of the last span
            $i++;
        }

        $ref_item->{'rule_lhs'} = $target_lhs;

        # NOTE(review): no-op substitutions as written, see the note in the
        # "Trans Opt" branch.
        $target_rhs =~ s/</</g;
        $target_rhs =~ s/>/>/g;
        @{$ref_item->{'rule_rhs'}} = ();
        foreach my $token (split(/ /,$target_rhs)) {
            # same fallback as for the source side
            my $word = ($token =~ /^\[?([^\]]+)\]?$/) ? $1 : $token;
            push @{$ref_item->{'rule_rhs'}}, $word;
        }

        foreach my $point (split(/ /,$alignment)) {
            $point =~ /(\d+)[\-,](\d+)/ || die("funny alignment: $point\n");
            $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
            $ref_item->{'alignedSpan'}{$1} = 1;
        }
    }

    return 1;
}
|
|
|
|
|
# Processes the derivation of one sentence: merges the leading glue rules
# into a single top rule, computes the depth of every rule, prints the
# output and input trees (OUTPUT_TREE, INPUT_TREE) and one line per rule to
# NODE, and updates the statistics.
#   $sentence   - sentence number
#   $DERIVATION - array ref of rule items as filled in by hs_scan_line;
#                 modified in place
#   $STATS      - hash ref; 'glue-rule', 'depth' and 'rule-type' are updated
sub hs_process {
    my ($sentence,$DERIVATION,$STATS) = @_;

    shift @{$DERIVATION};                  # the first item is discarded
    my $max = $DERIVATION->[0]{'end'};

    # consolidate glue rules into one rule
    my %GLUE_RULE = (
        'start'    => 1,
        'end'      => $max,
        'rule_lhs' => "S",
        'depth'    => 0,
    );
    my $glue_count = 0;
    while(1) {
        my $RULE = shift @{$DERIVATION};
        # NOTE(review): parenthesised the way Perl parses the condition
        # ("&&" binds tighter than "||"), so the length test applies to the
        # S rule only -- confirm that this is intended.
        my $is_glue =
            (scalar(@{$RULE->{'rule_rhs'}}) == 2 &&
             ($RULE->{'rule_lhs'} eq "S" &&
              $RULE->{'rule_rhs'}[0] eq "S" &&
              $RULE->{'rule_rhs'}[1] eq "X")) ||
            ($RULE->{'rule_lhs'} eq "Q" &&
             $RULE->{'rule_rhs'}[0] eq "Q");
        unless ($is_glue) {
            # first non-glue rule: put it back and stop
            unshift @{$DERIVATION}, $RULE;
            last;
        }
        unshift @{$GLUE_RULE{'spans'}}, $RULE->{'spans'}[1];
        push @{$GLUE_RULE{'rule_rhs'}}, $RULE->{'rule_rhs'}[1];
        $GLUE_RULE{'alignment'}{$glue_count} = $glue_count;
        $GLUE_RULE{'alignedSpan'}{$glue_count} = 1;
        $glue_count++;
    }
    unshift @{$DERIVATION}, \%GLUE_RULE;
    $STATS->{'glue-rule'} += $glue_count;

    # create chart: rules indexed by the span they cover
    my %CHART;
    foreach my $RULE (@{$DERIVATION}) {
        $CHART{$RULE->{'start'}}{$RULE->{'end'}} = $RULE;
    }

    # compute depth
    &hs_compute_depth(1,$max,0,\%CHART);
    my $max_depth = 0;
    foreach my $RULE (@{$DERIVATION}) {
        my $depth = $RULE->{'depth'};
        next unless defined($depth);
        $max_depth = $depth if $depth > $max_depth;
    }
    &hs_recompute_depth(1,$max,\%CHART,$max_depth);
    $STATS->{'depth'} += $max_depth;

    # build matrix of divs
    my @MATRIX;
    &hs_create_out_span(1,$max,\%CHART,\@MATRIX);
    print OUTPUT_TREE &hs_output_matrix($sentence,\@MATRIX,$max_depth);

    my @MATRIX_IN;
    &hs_create_in_span(1,$max,\%CHART,\@MATRIX_IN);
    print INPUT_TREE &hs_output_matrix($sentence,\@MATRIX_IN,$max_depth);

    # number rules and get their children; rules that did not receive a
    # 'start_div' are skipped
    my @PLACED = grep { defined($_->{'start_div'}) } @{$DERIVATION};
    my $id = 0;
    foreach my $RULE (@PLACED) {
        $STATS->{'rule-type'}{&hs_rule_type($RULE)}++ if $id>0;
        $RULE->{'id'} = $id++;
    }
    &hs_get_children(1,$max,\%CHART);

    foreach my $RULE (@PLACED) {
        print NODE join(" ",
            $sentence,
            $RULE->{'depth'},
            $RULE->{'start_div'}, $RULE->{'end_div'},
            $RULE->{'start_div_in'}, $RULE->{'end_div_in'},
            join(",",@{$RULE->{'children'}}))."\n";
    }
}
|
|
|
# Renders a span matrix as text, one line per span:
#   "SENTENCE<TAB>one character per depth 0..max_depth<TAB>lhs<TAB>rhs".
# Characters: "[" opens a constituent at this depth, "]" closes one, "O"
# opens and closes on the same line, "-" continues one, " " is empty.
#   $sentence  - sentence number, printed first on every line
#   $MATRIX    - array ref of span hashes ('depth', optional 'opening',
#                'closing' => {depth => 1}, 'lhs', 'rhs')
#   $max_depth - largest depth to draw
# Returns the rendered text.
sub hs_output_matrix {
    my ($sentence,$MATRIX,$max_depth) = @_;

    # per depth: is a constituent currently open?
    my @OPEN = (0) x ($max_depth+1);
    my $out = "";

    foreach my $SPAN (@$MATRIX) {
        my $CLOSING = $SPAN->{'closing'};
        my $opens_here = defined($SPAN->{'opening'});

        my $columns = "";
        foreach my $d (0 .. $max_depth) {
            my $closes_here = defined($CLOSING) && defined($CLOSING->{$d});

            my $class;
            if ($d != $SPAN->{'depth'}) {
                # another constituent's column
                $class = $closes_here ? "]"
                       : $OPEN[$d]    ? "-"
                       :                " ";
            }
            elsif ($opens_here) {
                $class = $closes_here ? "O" : "[";
            }
            else {
                $class = $closes_here ? "]" : "-";
            }
            $columns .= $class;
        }

        my $lhs = defined($SPAN->{'lhs'}) ? $SPAN->{'lhs'} : "";
        my $rhs = defined($SPAN->{'rhs'}) ? $SPAN->{'rhs'} : "";
        $out .= $sentence."\t".$columns."\t".$lhs."\t".$rhs."\n";

        # update the open/closed state for the following lines
        $OPEN[$SPAN->{'depth'}] = 1 if $opens_here;
        if (defined($CLOSING)) {
            foreach my $d (0 .. $max_depth) {
                $OPEN[$d] = 0 if defined($CLOSING->{$d});
            }
        }
    }
    return $out;
}
|
|
|
# Builds a string describing the shape of a rule:
#   "TARGET:target words:non-terminals:SOURCE:source words"
# TARGET and SOURCE list the rule's right-hand side in order: a number for
# each run of consecutive words, a letter (a, b, ...) for each non-terminal.
# Letters are assigned in target order; the source side uses the letter of
# the aligned target non-terminal.
#   $RULE - rule item with 'rule_rhs', 'spans', 'alignment', 'alignedSpan'
sub hs_rule_type {
    my ($RULE) = @_;

    my %NT;   # source span index => letter

    # output side
    my @TARGET = @{$RULE->{'rule_rhs'}};
    my $type = "";
    my ($run, $total_word_count, $nt_count) = (0,0,0);
    foreach my $i (0 .. $#TARGET) {
        if (defined($RULE->{'alignment'}{$i})) {
            $type .= $run if $run > 0;
            $run = 0;
            my $nt = chr(97+$nt_count++);
            $NT{$RULE->{'alignment'}{$i}} = $nt;
            $type .= $nt;
        }
        else {
            $run++;
            $total_word_count++;
        }
    }
    $type .= $run if $run > 0;
    $type .= ":".$total_word_count.":".$nt_count.":";

    # input side
    my @SOURCE = @{$RULE->{'spans'}};
    ($run, $total_word_count) = (0,0);
    foreach my $i (0 .. $#SOURCE) {
        if (defined($RULE->{'alignedSpan'}{$i})) {
            $type .= $run if $run > 0;
            $run = 0;
            $type .= $NT{$i};
        }
        else {
            $run++;
            $total_word_count++;
        }
    }
    $type .= $run if $run > 0;
    $type .= ":".$total_word_count;

    return $type;
}
|
|
|
|
|
# Assigns 'depth' to the rule covering ($start,$end) and, recursively with
# depth+1, to the rules covering the spans of its non-terminals. Prints a
# warning to STDERR and returns if the chart has no rule for the span.
#   $CHART - hash ref {start}{end} => rule item
sub hs_compute_depth {
    my ($start,$end,$depth,$CHART) = @_;

    my $RULE = $$CHART{$start}{$end};
    if (!defined($RULE)) {
        print STDERR "warning: illegal span ($start,$end)\n";
        return;
    }

    $RULE->{'depth'} = $depth;

    my @RHS = @{$RULE->{'rule_rhs'}};
    foreach my $i (0 .. $#RHS) {
        # only non-terminals carry an alignment
        next unless defined($RULE->{'alignment'}{$i});
        my $SUBSPAN = $RULE->{'spans'}[$RULE->{'alignment'}{$i}];
        &hs_compute_depth($SUBSPAN->{'from'},$SUBSPAN->{'to'},$depth+1,$CHART);
    }
}
|
|
|
|
|
# Re-assigns depths bottom-up: a rule without non-terminals gets depth
# $max_depth, every other rule one less than the smallest depth among the
# rules of its non-terminals. Returns the new depth of the rule covering
# ($start,$end), or 0 (after a warning on STDERR) if the chart has no rule
# for the span.
#   $CHART - hash ref {start}{end} => rule item
sub hs_recompute_depth {
    my ($start,$end,$CHART,$max_depth) = @_;

    my $RULE = $$CHART{$start}{$end};
    if (!defined($RULE)) {
        print STDERR "warning: illegal span ($start,$end)\n";
        return 0;
    }

    my $shallowest_child = $max_depth+1;
    my @RHS = @{$RULE->{'rule_rhs'}};
    foreach my $i (0 .. $#RHS) {
        # only non-terminals carry an alignment
        next unless defined($RULE->{'alignment'}{$i});
        my $SUBSPAN = $RULE->{'spans'}[$RULE->{'alignment'}{$i}];
        my $child_depth = &hs_recompute_depth($SUBSPAN->{'from'},$SUBSPAN->{'to'},$CHART,$max_depth);
        $shallowest_child = $child_depth if $child_depth < $shallowest_child;
    }

    $RULE->{'depth'} = $shallowest_child-1;
    return $RULE->{'depth'};
}
|
|
|
|
|
# Attach to the rule covering ($start,$end) a 'children' array holding the
# ids returned for its aligned sub-spans, recursing into each of them.
#
#   $start, $end - keys used to look the rule up as $CHART->{$start}{$end}
#   $CHART       - hash reference of rules, keyed by start and then by end
#
# Returns the rule's 'id' field, or -1 (after a warning on STDERR) when the
# chart has no entry for the span. Sub-spans that yield -1 are left out of
# the 'children' list.
sub hs_get_children {
    my ($start, $end, $CHART) = @_;

    my $rule = $CHART->{$start}{$end};
    unless (defined $rule) {
        print STDERR "warning: illegal span ($start,$end)\n";
        return -1;
    }

    # Any previous 'children' value is replaced by a fresh list.
    my $children = [];
    $rule->{'children'} = $children;

    my $rhs_size = scalar @{ $rule->{'rule_rhs'} };
    foreach my $pos (0 .. $rhs_size - 1) {
        my $span_index = $rule->{'alignment'}{$pos};
        next unless defined $span_index;

        my $sub_span = $rule->{'spans'}[$span_index];
        my $child_id = hs_get_children($sub_span->{'from'}, $sub_span->{'to'}, $CHART);
        push @{$children}, $child_id if $child_id != -1;
    }

    return $rule->{'id'};
}
|
|
|
|
|
# Append to @$MATRIX the span records for the rule covering ($start,$end),
# walking the rule's right-hand side ('rule_rhs') from left to right.
#
#   $start, $end - keys used to look the rule up as $CHART->{$start}{$end}
#   $CHART       - hash reference of rules, keyed by start and then by end
#   $MATRIX      - array reference that receives the span hash references
#
# The first record carries 'lhs' and 'opening'. Right-hand-side positions
# with an 'alignment' entry are expanded recursively; the remaining items
# are collected, space-separated, in the 'rhs' field of the current record.
# A new record (without 'lhs'/'opening') is started whenever such items
# follow a recursively expanded position. The rule remembers the index of
# its first record in 'start_div' and of the last record written while
# processing it in 'end_div'; that last record is flagged in 'closing' under
# the rule's depth.
#
# Prints a warning to STDERR and returns early when the span is not in the
# chart.
sub hs_create_out_span {
    my ($start, $end, $CHART, $MATRIX) = @_;

    my $rule = $CHART->{$start}{$end};
    unless (defined $rule) {
        print STDERR "warning: illegal span ($start,$end)\n";
        return;
    }

    my $current = {
        'start'   => $start,
        'end'     => $end,
        'depth'   => $rule->{'depth'},
        'lhs'     => $rule->{'rule_lhs'},
        'opening' => 1,
    };
    push @{$MATRIX}, $current;
    $rule->{'start_div'} = $#{$MATRIX};

    # True right after a sub-span was expanded: the next plain item must
    # then go into a fresh record rather than the one before the sub-span.
    my $after_subspan = 0;

    my $rhs_size = scalar @{ $rule->{'rule_rhs'} };
    foreach my $pos (0 .. $rhs_size - 1) {
        my $span_index = $rule->{'alignment'}{$pos};
        if (defined $span_index) {
            my $sub_span = $rule->{'spans'}[$span_index];
            hs_create_out_span($sub_span->{'from'}, $sub_span->{'to'}, $CHART, $MATRIX);
            $after_subspan = 1;
            next;
        }

        if ($after_subspan) {
            $current = {
                'start' => $start,
                'end'   => $end,
                'depth' => $rule->{'depth'},
            };
            push @{$MATRIX}, $current;
            $after_subspan = 0;
        }
        $current->{'rhs'} .= " " if defined $current->{'rhs'};
        $current->{'rhs'} .= $rule->{"rule_rhs"}[$pos];
    }

    # The closing marker goes on whatever record is last in the matrix now,
    # which may have been written by a nested call.
    my $last = $MATRIX->[-1];
    $rule->{'end_div'} = $#{$MATRIX};
    $last->{'closing'}{ $rule->{'depth'} } = 1;
}
|
|
|
|
|
# Append to @$MATRIX the span records for the rule covering ($start,$end),
# walking the rule's 'spans' list from left to right.
#
#   $start, $end - keys used to look the rule up as $CHART->{$start}{$end}
#   $CHART       - hash reference of rules, keyed by start and then by end
#   $MATRIX      - array reference that receives the span hash references
#
# The first record carries 'lhs' and 'opening'. Spans with an 'alignedSpan'
# entry are expanded recursively via their 'from'/'to' fields; for every
# other span its 'word' is appended, space-separated, to the 'rhs' field of
# the current record. A new record (without 'lhs'/'opening') is started
# whenever words follow a recursively expanded span. The rule remembers the
# index of its first record in 'start_div_in' and of the last record written
# while processing it in 'end_div_in'; that last record is flagged in
# 'closing' under the rule's depth.
#
# Prints a warning to STDERR and returns early when the span is not in the
# chart.
sub hs_create_in_span {
    my ($start, $end, $CHART, $MATRIX) = @_;

    my $rule = $CHART->{$start}{$end};
    unless (defined $rule) {
        print STDERR "warning: illegal span ($start,$end)\n";
        return;
    }

    my $current = {
        'start'   => $start,
        'end'     => $end,
        'depth'   => $rule->{'depth'},
        'lhs'     => $rule->{'rule_lhs'},
        'opening' => 1,
    };
    push @{$MATRIX}, $current;
    $rule->{'start_div_in'} = $#{$MATRIX};

    # True right after a sub-span was expanded: the next word must then go
    # into a fresh record rather than the one before the sub-span.
    my $after_subspan = 0;

    my $span_count = scalar(@{ $rule->{'spans'} });
    foreach my $pos (0 .. $span_count - 1) {
        my $sub_span = $rule->{'spans'}[$pos];
        if (defined $rule->{'alignedSpan'}{$pos}) {
            hs_create_in_span($sub_span->{'from'}, $sub_span->{'to'}, $CHART, $MATRIX);
            $after_subspan = 1;
            next;
        }

        if ($after_subspan) {
            $current = {
                'start' => $start,
                'end'   => $end,
                'depth' => $rule->{'depth'},
            };
            push @{$MATRIX}, $current;
            $after_subspan = 0;
        }
        $current->{'rhs'} .= " " if defined $current->{'rhs'};
        $current->{'rhs'} .= $sub_span->{'word'};
    }

    # The closing marker goes on whatever record is last in the matrix now,
    # which may have been written by a nested call.
    my $last = $MATRIX->[-1];
    $rule->{'end_div_in'} = $#{$MATRIX};
    $last->{'closing'}{ $rule->{'depth'} } = 1;
}
|
|
|
# Split a search graph dump into one file per sentence.
#
#   $search_graph_file - path of the search graph file to read; when it is
#                        not given, the file-level $search_graph setting is
#                        used, which is what this sub always opened before
#
# Every input line must match one of three layouts (all begin with the
# sentence number and hypothesis id; they differ in which scores follow the
# alignment field and in the marker after "[total=...]"). Anything else
# aborts with "buggy search graph line".
#
# For each sentence number N the file $dir/search-graph/graph.N is written
# with one tab-separated line per hypothesis, in this column order:
#   id, recomb, from, to, output, alignment, children,
#   rule_score, heuristic_rule_score, hyp_score, lhs
# When the line carries no separate heuristic score (layouts two and three),
# heuristic_rule_score repeats rule_score.
#
# Dies when the input file or a per-sentence output file cannot be opened,
# or when an output file cannot be closed cleanly.
sub process_search_graph {
    my ($search_graph_file) = @_;
    $search_graph_file = $search_graph unless defined $search_graph_file;

    # Three-argument open: the path is always treated as a plain file name.
    open(my $graph_fh, '<', $search_graph_file)
        or die("ERROR: could not open search graph file '$search_graph_file': $!");
    `mkdir -p $dir/search-graph`;

    my $last_sentence = -1;
    my $sentence_fh;    # output handle of the sentence currently being written

    while (my $line = <$graph_fh>) {
        my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score);

        # The total score accepts exponent notation here just as in the
        # other two layouts below.
        if ($line =~ /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\de\-\.]+)\] \<\</) {
            ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
        }
        elsif ($line =~ /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\de\-\.]+)\] core/ ||
               $line =~ /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) core=\(.*\) \[(\d+)\.\.(\d+)\] (.*)\[total=([\de\-\.]+)\] core/) {
            # Both of these patterns have eleven capture groups.
            ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11);
            $heuristic_rule_score = $rule_score;
        }
        else {
            die("ERROR: buggy search graph line: $line");
        }

        # Drop the last character of the alignment and children fields
        # (the patterns leave a trailing separator on non-empty values).
        chop($alignment) if $alignment;
        chop($children) if $children;
        $recomb = 0 unless $recomb;
        $children = "" unless defined $children;
        $alignment = "" unless defined $alignment;

        # A change of sentence number starts a new output file.
        if ($last_sentence != $sentence) {
            if (defined $sentence_fh) {
                close($sentence_fh)
                    or die("ERROR: could not close $dir/search-graph/graph.$last_sentence: $!");
            }
            open($sentence_fh, '>', "$dir/search-graph/graph.$sentence")
                or die("ERROR: could not write $dir/search-graph/graph.$sentence: $!");
            $last_sentence = $sentence;
        }
        print $sentence_fh "$id\t$recomb\t$from\t$to\t$output\t$alignment\t$children\t$rule_score\t$heuristic_rule_score\t$hyp_score\t$lhs\n";
    }
    close($graph_fh);

    # Nothing to close when the input had no lines.
    if (defined $sentence_fh) {
        close($sentence_fh)
            or die("ERROR: could not close $dir/search-graph/graph.$last_sentence: $!");
    }
}
|
|