sakharamg
/

NMTKD

Model card Files Files and versions Community

NMTKD / translation /tools /mosesdecoder /scripts /ems /support /analysis.perl

sakharamg

Uploading all files

158b61b about 2 years ago

raw

history blame contribute delete

39.6 kB

	#!/usr/bin/env perl
	#
	# This file is part of moses. Its use is licensed under the GNU Lesser General
	# Public License version 2.1 or, at your option, any later version.

	use warnings;
	use strict;
	use Getopt::Long "GetOptions";

	# Maximum n-gram order used by the n-gram extraction and the match reports.
	# The coverage analysis further down raises it to 7 before it runs.
	my $MAX_LENGTH = 4;

	# Command line settings; each stays undefined unless its option is given.
	my ($system,$system_alignment,$segmentation,$reference,$dir,$input,$corpus,$ttable,@FACTORED_TTABLE,$score_options,$hierarchical,$output_corpus,$alignment,$biconcor,$input_factors,$input_factor_names,$output_factor_names,$precision_by_coverage,$precision_by_coverage_factor,$coverage_dir,$search_graph);

	# Parse the options; a parse failure or a missing -dir aborts with the
	# usage message.
	if (!&GetOptions('system=s' => \$system, # raw output from decoder
	                 'system-alignment=s' => \$system_alignment, # word alignment of system output
	                 'reference=s' => \$reference, # tokenized reference
	                 'dir=s' => \$dir, # directory for storing results
	                 'input-factors=i' => \$input_factors, # list of input factors
	                 'input-factor-names=s' => \$input_factor_names,
	                 'output-factor-names=s' => \$output_factor_names,
	                 'precision-by-coverage' => \$precision_by_coverage, # added report for input words
	                 'precision-by-coverage-factor=i' => \$precision_by_coverage_factor, # sub-reports
	                 'input=s' => \$input, # tokenized input (as for decoder)
	                 'segmentation=s' => \$segmentation, # system output with segmentation markup
	                 'input-corpus=s' => \$corpus, # input side of parallel training corpus
	                 'ttable=s' => \$ttable, # phrase translation table used for decoding
	                 'factored-ttable=s' => \@FACTORED_TTABLE, # factored phrase translation table
	                 'score-options=s' => \$score_options, # score options to detect p(e|f) score
	                 'output-corpus=s' => \$output_corpus, # output side of parallel training corpus
	                 'alignment-file=s' => \$alignment, # alignment of parallel corpus
	                 'coverage=s' => \$coverage_dir, # already computed coverage, stored in this dir
	                 'biconcor=s' => \$biconcor, # binary for bilingual concordancer
	                 'search-graph=s' => \$search_graph, # visualization of search graph
	                 'hierarchical' => \$hierarchical) || # hierarchical model?
	    !defined($dir)) {
	    die("ERROR: syntax: analysis.perl -system FILE -reference FILE -dir DIR [-input FILE] [-input-corpus FILE] [-ttable FILE] [-score-options SETTINGS] [-segmentation FILE] [-output-corpus FILE] [-alignment-file FILE] [-biconcor BIN]");
	}

	# Create the result directory.  The list form of system() does not go
	# through the shell, so directory names containing spaces or shell
	# metacharacters are handled correctly, and a failure is reported
	# instead of being silently ignored.
	system("mkdir","-p",$dir) == 0
	    or die("ERROR: could not create directory $dir");

	# factor names: one line with the input factor names, one line with the
	# output factor names (only written if both were given)
	if (defined($input_factor_names) && defined($output_factor_names)) {
	    open(my $factor_fh,">","$dir/factor-names") or die "Cannot open: $!";
	    print {$factor_fh} $input_factor_names."\n";
	    print {$factor_fh} $output_factor_names."\n";
	    close($factor_fh);
	}

	# compare system output against reference translation
	#
	# @SYSTEM holds one output sentence per element.  @REFERENCE holds either
	# one reference string per element, or (multiple references, stored as
	# FILE.ref0, FILE.ref1, ...) an array reference with one string per
	# reference file.
	my(@SYSTEM,@REFERENCE);
	my (%PRECISION_CORRECT,%PRECISION_TOTAL,
	    %RECALL_CORRECT,%RECALL_TOTAL);
	if (defined($system) || defined($reference)) {
	    die("you need to specify both system and reference, not just either")
	        unless defined($system) && defined($reference);

	    # Files are read directly instead of through `cat`, so file names are
	    # never interpreted by a shell.  chomp (not chop) keeps the last
	    # character of a final line that has no trailing newline.
	    die("can't open system file $system") if ! -e $system;
	    open(my $system_fh,"<",$system) or die("can't open system file $system: $!");
	    @SYSTEM = <$system_fh>;
	    close($system_fh);
	    chomp(@SYSTEM);

	    if (! -e $reference && -e $reference.".ref0") {
	        # multiple references
	        for(my $i=0;-e $reference.".ref".$i;$i++) {
	            open(my $ref_fh,"<","$reference.ref$i")
	                or die("can't open reference file $reference.ref$i: $!");
	            my @REF = <$ref_fh>;
	            close($ref_fh);
	            chomp(@REF);
	            for(my $j=0;$j<scalar(@REF);$j++) {
	                push @{$REFERENCE[$j]}, $REF[$j];
	            }
	        }
	    }
	    else {
	        die("can't open reference file $reference") if ! -e $reference;
	        open(my $ref_fh,"<",$reference) or die("can't open reference file $reference: $!");
	        @REFERENCE = <$ref_fh>;
	        close($ref_fh);
	        chomp(@REFERENCE);
	    }

	    # precision: system n-grams found in the reference;
	    # recall: reference n-grams found in the system output
	    for(my $i=0;$i<scalar @SYSTEM;$i++) {
	        &add_match($SYSTEM[$i],$REFERENCE[$i],
	                   \%PRECISION_CORRECT,\%PRECISION_TOTAL);
	        &add_match($REFERENCE[$i],$SYSTEM[$i],
	                   \%RECALL_CORRECT,\%RECALL_TOTAL);
	    }

	    # SUMMARY stays a bareword handle: best_matches prints to it
	    open(SUMMARY,">","$dir/summary") or die "Cannot open: $!";
	    &best_matches(\%PRECISION_CORRECT,\%PRECISION_TOTAL,"$dir/n-gram-precision");
	    &best_matches(\%RECALL_CORRECT,\%RECALL_TOTAL,"$dir/n-gram-recall");
	    &bleu_annotation();
	    close(SUMMARY);
	}

	# segmentation report: with -hierarchical the hierarchical variant is
	# run, otherwise the plain segmentation report
	if (defined($segmentation) && defined($hierarchical)) {
	    &hierarchical_segmentation();
	}
	elsif (defined($segmentation)) {
	    &segmentation();
	}

	# coverage analysis
	#
	# Skipped when -coverage points to already computed results.  Otherwise
	# the input n-grams (up to length 7) are collected and looked up in the
	# phrase table and/or the training corpus.
	my (%INPUT_PHRASE,%CORPUS_COVERED,%TTABLE_COVERED,%TTABLE_ENTROPY);
	if (!defined($coverage_dir) && (defined($ttable) || defined($corpus))) {
	    if (!defined($input)) {
	        die("ERROR: when specifying either ttable or input-corpus, please also specify input\n");
	    }
	    $MAX_LENGTH = 7;
	    &input_phrases();
	    &ttable_coverage("0",$ttable) if defined($ttable);
	    &corpus_coverage() if defined($corpus);
	    &input_annotation();

	    # corpus coverage for non-surface factors
	    if (defined($input_factors)) {
	        for(my $factor=1;$factor<$input_factors;$factor++) {
	            &input_phrases($factor);
	            &corpus_coverage($factor);
	        }
	    }

	    # factored ttable coverage
	    foreach my $factored_ttable (@FACTORED_TTABLE) {
	        # report the offending specification itself (not $ttable, which
	        # is a different option and may even be undefined here)
	        die("factored ttable must be specified as factor:file -- $factored_ttable")
	            unless $factored_ttable =~ /^([\d,]+)\:(.+)/; # factor:ttable
	        my ($factor,$file) = ($1,$2);
	        next if defined($ttable) && $file eq $ttable; # no need to do this twice
	        &input_phrases($factor);
	        &ttable_coverage($factor,$file);
	    }
	}

	# precision of input words, once per coverage source
	if (defined($precision_by_coverage)) {
	    foreach my $coverage_type ("ttable","corpus") {
	        &precision_by_coverage($coverage_type);
	    }
	}

	# bilingual concordance -- not used by experiment.perl
	if (defined($biconcor) && defined($alignment) && defined($output_corpus) && defined($corpus)) {
	    `$biconcor -s $dir/biconcor -c $corpus -t $output_corpus -a $alignment`;
	}

	# process search graph for visualization
	&process_search_graph($search_graph) if defined($search_graph);

	# Write the per-n-gram match counts to "$out.LENGTH" (one file per n-gram
	# length, lines of the form TOTAL<tab>CORRECT<tab>NGRAM) and print the
	# summed totals to the already opened SUMMARY handle.
	#   $CORRECT, $TOTAL - hash refs: {length}{ngram} => count
	#   $out             - output file stem; a stem containing "precision"
	#                      labels the summary lines "precision", else "recall"
	sub best_matches {
	    my ($CORRECT,$TOTAL,$out) = @_;
	    my $type = "recall";
	    $type = "precision" if $out =~ /precision/;
	    foreach my $length (1..$MAX_LENGTH) {
	        my $sum_total = 0;
	        my $sum_correct = 0;
	        open(OUT,">$out.$length") or die "Cannot open: $!";
	        foreach my $ngram (keys %{$TOTAL->{$length}}) {
	            my $seen = $TOTAL->{$length}{$ngram};
	            my $matched = $CORRECT->{$length}{$ngram};
	            printf OUT "%d\t%d\t%s\n", $seen, $matched, $ngram;
	            $sum_total += $seen;
	            $sum_correct += $matched;
	        }
	        close(OUT);
	        print SUMMARY "$type-$length-total: $sum_total\n";
	        print SUMMARY "$type-$length-correct: $sum_correct\n";
	    }
	}

	# get all the n-grams from the input corpus
	#
	# Resets the file-level %INPUT_PHRASE and refills it from the -input file.
	#   $factor - factor specification passed to get_factor_phrase; omitted
	#             or "0" selects the surface form
	sub input_phrases {
	    my ($factor) = (@_);
	    %INPUT_PHRASE = ();

	    # three-argument open: the file name is never interpreted as a mode
	    open(my $input_fh,"<",$input) or die "Can't read input $input: $!";
	    while(my $line = <$input_fh>) {
	        # chomp, not chop: a last line without a trailing newline must
	        # not lose its final character
	        chomp($line);
	        $line = &get_factor_phrase($factor,$line);
	        &extract_n_grams($line,\%INPUT_PHRASE);
	    }
	    close($input_fh);
	}

	# reduce a factorized phrase into the factors of interest
	#
	#   $factor - undef or "0": keep only the surface form of every word;
	#             otherwise a comma-separated list of factor indices that is
	#             handed to get_factor_word for every word
	#   $line   - phrase whose words are "surface|factor1|factor2..."
	# Returns the reduced phrase with single spaces between words and no
	# leading or trailing whitespace.
	sub get_factor_phrase {
	    my ($factor,$line) = @_;

	    # clean line; every whitespace run is collapsed (/g), otherwise the
	    # split on single spaces below would yield empty words
	    $line =~ s/[\r\n]+//g;
	    $line =~ s/\s+/ /g;
	    $line =~ s/^ //;
	    $line =~ s/ $//;

	    # only surface? delete remaining factors
	    if (!defined($factor) || $factor eq "0") {
	        $line =~ s/\|\S+//g;
	        return $line;
	    }

	    # reduce each word
	    return join(" ", map { get_factor_word($factor,$_) } split(/ /,$line));
	}

	# reduce a factorized word into the factors of interest
	#
	#   $factor - comma-separated list of factor indices, e.g. "0,2"
	#   $word   - word of the form "factor0|factor1|factor2"
	# Returns the selected factors, in the requested order, joined by "|".
	sub get_factor_word {
	    my ($factor,$word) = @_;

	    # split on the literal factor separator "|"
	    my @WORD = split(/\|/,$word);
	    return join("|", map { $WORD[$_] } split(/,/,$factor));
	}

	# File name extension for per-factor reports: empty for the surface form
	# (factor undefined or "0"), otherwise "." followed by the factor
	# specification.
	sub factor_ext {
	    my ($factor) = @_;
	    return "" if !defined($factor) || $factor eq "0";
	    return ".".$factor;
	}

	# Write $dir/bleu-annotation: one line per system sentence holding a
	# smoothed sentence-level BLEU-style score, the sentence index, every
	# output word annotated with the length of the longest matching n-gram
	# that covers it, and the reference(s).  Reads the file-level @SYSTEM and
	# @REFERENCE arrays; the lines are piped through "sort -r".
	sub bleu_annotation {
	open(OUT,"\| sort -r >$dir/bleu-annotation") or die "Cannot open: $!";
	for(my $i=0;$i<scalar @SYSTEM;$i++) {
	# normalize whitespace of the system sentence
	my $system = $SYSTEM[$i];
	$system =~ s/\s+/ /g;
	$system =~ s/^ //;
	$system =~ s/ $//;
	# %REF_NGRAM holds, per n-gram, the maximum count over all references;
	# %SYS_NGRAM is filled here but not read again in this sub
	my (%SYS_NGRAM,%REF_NGRAM);
	&extract_n_grams( $system, \%SYS_NGRAM );
	&extract_n_grams_arrayopt( $REFERENCE[$i], \%REF_NGRAM, "max" );

	# $MATCH[k]: longest matched n-gram length covering word k (0 = none)
	my @WORD = split(/ /,$system);
	my @MATCH;
	for(my $i=0;$i<scalar @WORD;$i++) {
	$MATCH[$i] = 0;
	}

	my $bleu = 1;
	for(my $length=1;$length<=$MAX_LENGTH && $length <= scalar @WORD;$length++) {
	# starts at 1: add-one smoothing of the n-gram precision
	my $ngram_correct = 1;
	# note: this $i is a new lexical that shadows the sentence index
	for(my $i=0;$i<=scalar @WORD-$length;$i++) {
	my $ngram = "";
	for(my $n=0;$n<$length;$n++) {
	$ngram .= " " if $n>0;
	$ngram .= $WORD[$i+$n];
	}
	# consume one reference occurrence; a match only while the count stays >= 0
	$REF_NGRAM{$length}{$ngram}--;
	if ($REF_NGRAM{$length}{$ngram} >= 0) {
	$ngram_correct++;
	# lengths are visited in increasing order, so longer matches overwrite
	for(my $n=0;$n<$length;$n++) {
	$MATCH[$i+$n] = $length;
	}
	}
	}
	# denominator: number of n-grams of this length, plus 1 for the smoothing
	$bleu *= ($ngram_correct/(scalar(@WORD)-$length+2));
	}
	# fourth root, independent of how many n-gram orders were multiplied above
	$bleu = $bleu ** (1/4);

	# reference length: the shortest reference if there are several
	my $ref_length = 9999;
	if (ref($REFERENCE[$i]) eq 'ARRAY') {
	foreach my $ref (@{$REFERENCE[$i]}) {
	my @RW = split(/ /,$ref);
	$ref_length = scalar(@RW) if scalar(@RW) < $ref_length;
	}
	}
	else {
	my @RW = split(/ /,$REFERENCE[$i]);
	$ref_length = scalar(@RW);
	}

	# brevity penalty for non-empty output shorter than the reference
	if (scalar(@WORD) < $ref_length && scalar(@WORD)>0) {
	$bleu *= exp(1-$ref_length/scalar(@WORD));
	}

	# output: score <tab> sentence index <tab> word|match ... <tab> reference(s)
	printf OUT "%5.4f\t%d\t",$bleu,$i;
	for(my $i=0;$i<scalar @WORD;$i++) {
	print OUT " " if $i;
	print OUT "$WORD[$i]\|$MATCH[$i]";
	}
	if (ref($REFERENCE[$i]) eq 'ARRAY') {
	foreach my $ref (@{$REFERENCE[$i]}) {
	print OUT "\t".$ref;
	}
	}
	else {
	print OUT "\t".$REFERENCE[$i]
	}
	print OUT "\n";
	}
	close(OUT);
	}

	# Accumulate clipped n-gram matches of $system against $reference.
	#   $system, $reference - sentence strings, or array refs of alternative
	#                         sentences (minimum counts are used for $system,
	#                         maximum counts for $reference)
	#   $CORRECT, $TOTAL    - hash refs {length}{ngram} that are incremented
	#                         by the matched count and by the count in
	#                         $system, respectively
	sub add_match {
	    my ($system,$reference,$CORRECT,$TOTAL) = @_;
	    my (%SYS_NGRAM,%REF_NGRAM);
	    &extract_n_grams_arrayopt( $system, \%SYS_NGRAM, "min" );
	    &extract_n_grams_arrayopt( $reference, \%REF_NGRAM, "max" );
	    foreach my $length (keys %SYS_NGRAM) {
	        my $sys_counts = $SYS_NGRAM{$length};
	        foreach my $ngram (keys %{$sys_counts}) {
	            my $sys_count = $sys_counts->{$ngram};
	            my $ref_count = $REF_NGRAM{$length}{$ngram};
	            $ref_count = 0 unless defined($ref_count);

	            # a match is credited at most as often as the n-gram occurs
	            # on the other side
	            my $match_count = $sys_count;
	            $match_count = $ref_count if $sys_count > $ref_count;

	            $CORRECT->{$length}{$ngram} += $match_count;
	            $TOTAL->{$length}{$ngram} += $sys_count;
	        }
	    }
	}

	# Look up the input phrases in a phrase table.
	#
	# For every table entry whose source side is a known input phrase, the
	# entry is counted in %TTABLE_COVERED{size}{phrase}; per source phrase the
	# entropy of its score distribution is stored in %TTABLE_ENTROPY and
	# written, together with the count, to
	# $dir/ttable-coverage-by-phrase[.FACTOR].
	#   $factor - factor specification ("0" = surface form)
	#   $ttable - phrase table file; FILE.gz is used if FILE does not exist
	sub ttable_coverage {
	    my ($factor,$ttable) = @_;

	    # open file; the list form of the pipe open passes the file name to
	    # gzip without involving a shell
	    if (! -e $ttable && -e $ttable.".gz") {
	        open(TTABLE,"-|","gzip","-cd",$ttable.".gz") or die "Cannot open: $!";
	    }
	    elsif ($ttable =~ /\.gz$/) {
	        open(TTABLE,"-|","gzip","-cd",$ttable) or die "Cannot open: $!";
	    }
	    else {
	        open(TTABLE,"<",$ttable) or die "Can't read ttable $ttable: $!";
	    }

	    # create report file
	    open(REPORT,">$dir/ttable-coverage-by-phrase".&factor_ext($factor)) or die "Cannot open: $!";
	    my ($last_in,$last_size) = ("",0);

	    # index of the p(e|f) score within the score column
	    my $p_e_given_f_score = 2;
	    if ($score_options) {
	        if ($score_options =~ /OnlyDirect/) {
	            $p_e_given_f_score = 0;
	        }
	        elsif ($score_options =~ /NoLex/) {
	            $p_e_given_f_score = 1;
	        }
	    }

	    my @DISTRIBUTION = ();

	    # report the source phrase collected so far; does nothing before the
	    # first covered phrase, so a table without any covered phrase no
	    # longer produces a bogus record for the empty phrase
	    my $flush_phrase = sub {
	        return if $last_in eq "";
	        my $entropy = &compute_entropy(@DISTRIBUTION);
	        printf REPORT "%s\t%d\t%.5f\n",$last_in,$TTABLE_COVERED{$last_size}{$last_in},$entropy;
	        $TTABLE_ENTROPY{$last_size}{$last_in} = $entropy;
	        @DISTRIBUTION = ();
	    };

	    while(my $entry = <TTABLE>) {
	        chomp($entry);
	        my @COLUMN = split(/ +\|\|\| +/,$entry);
	        my ($in,undef,$scores) = @COLUMN;
	        # handling hierarchical
	        $in =~ s/ \[[^ \]]+\]$//; # remove lhs nt
	        next if $in =~ /\[[^ \]]+\]\[[^ \]]+\]/; # only consider flat rules
	        $in = &get_factor_phrase($factor,$in) if defined($factor) && $factor eq "0";
	        $scores = $COLUMN[4] if defined($hierarchical); #scalar @COLUMN == 5;
	        my @IN = split(/ /,$in);
	        my $size = scalar @IN;
	        next unless defined($INPUT_PHRASE{$size}{$in});
	        $TTABLE_COVERED{$size}{$in}++;
	        my @SCORE = split(/ /,$scores);
	        if ($in ne $last_in) {
	            # a new source phrase starts: report the previous one
	            $flush_phrase->();
	            $last_in = $in;
	            $last_size = $size;
	        }
	        push @DISTRIBUTION, $SCORE[$p_e_given_f_score]; # forward probability
	    }
	    $flush_phrase->();
	    close(REPORT);
	    close(TTABLE);

	    &additional_coverage_reports($factor,"ttable",\%TTABLE_COVERED);
	}

	# Entropy (in bits) of the given list of non-negative weights.  The
	# weights are normalized by their sum; zero weights contribute nothing.
	# Returns 0 for an empty list.
	sub compute_entropy {
	    my @weight = @_;

	    my $mass = 0; # normalization
	    $mass += $_ foreach @weight;

	    my $bits = 0;
	    foreach my $w (grep { $_ != 0 } @weight) {
	        my $share = $w/$mass;
	        $bits -= $share*log($share)/log(2);
	    }
	    return $bits;
	}

	# Count how often the input phrases occur in the training corpus.
	#
	# Resets and fills the file-level %CORPUS_COVERED{length}{phrase}, writes
	# the counts of all phrases that occur at least once to
	# $dir/corpus-coverage-by-phrase[.FACTOR] and triggers the additional
	# coverage reports.
	#   $factor - factor specification passed to get_factor_phrase
	sub corpus_coverage {
	    my ($factor) = @_;
	    %CORPUS_COVERED = ();

	    # compute how often input phrases occur in the corpus
	    open(CORPUS,$corpus) or die "Can't read corpus $corpus";
	    while(<CORPUS>) {
	        my @WORD = split(/ /,&get_factor_phrase($factor,$_));
	        foreach my $start (0..$#WORD) {
	            my @phrase_word;
	            foreach my $length (1..$MAX_LENGTH-1) {
	                last if $start+$length > scalar(@WORD);
	                push @phrase_word, $WORD[$start+$length-1];
	                my $phrase = join(" ",@phrase_word);
	                # an unknown phrase cannot be the prefix of a known one
	                last unless defined($INPUT_PHRASE{$length}{$phrase});
	                $CORPUS_COVERED{$length}{$phrase}++;
	            }
	        }
	    }
	    close(CORPUS);

	    # report occurrence counts for all known input phrases
	    open(REPORT,">$dir/corpus-coverage-by-phrase".&factor_ext($factor)) or die "Cannot open: $!";
	    foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) {
	        foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) {
	            my $count = $CORPUS_COVERED{$size}{$phrase};
	            printf REPORT "%s\t%d\n", $phrase, $count if defined $count;
	        }
	    }
	    close(REPORT);

	    &additional_coverage_reports($factor,"corpus",\%CORPUS_COVERED);
	}

	# Write two reports derived from a coverage table:
	#   $dir/NAME-unknown[.FACTOR]          - input words (unigrams) without
	#                                         any coverage, with input count
	#   $dir/NAME-coverage-summary[.FACTOR] - per phrase size and coverage
	#                                         count: number of phrase types
	#                                         and of phrase tokens
	#   $factor  - factor specification, only used for the file extension
	#   $name    - report name prefix ("ttable" or "corpus" in this file)
	#   $COVERED - hash ref {size}{phrase} => coverage count
	sub additional_coverage_reports {
	    my ($factor,$name,$COVERED) = @_;
	    my $ext = &factor_ext($factor);

	    # unknown word report ---- TODO: extend to rare words?
	    open(REPORT,">$dir/$name-unknown".$ext) or die "Cannot open: $!";
	    foreach my $word (keys %{$INPUT_PHRASE{1}}) {
	        printf REPORT "%s\t%d\n",$word,$INPUT_PHRASE{1}{$word}
	            unless defined($COVERED->{1}{$word});
	    }
	    close(REPORT);

	    # summary report
	    open(REPORT,">$dir/$name-coverage-summary".$ext) or die "Cannot open: $!";
	    foreach my $size (sort {$a <=> $b} keys %INPUT_PHRASE) {
	        my (%types_at,%tokens_at);
	        foreach my $phrase (keys %{$INPUT_PHRASE{$size}}) {
	            my $covered = $COVERED->{$size}{$phrase};
	            $covered = 0 unless defined($covered);
	            $types_at{$covered}++;
	            $tokens_at{$covered} += $INPUT_PHRASE{$size}{$phrase};
	        }
	        foreach my $count (sort {$a <=> $b} keys %types_at) {
	            printf REPORT "%d\t%d\t%d\t%d\n",$size,$count,$types_at{$count},$tokens_at{$count};
	        }
	    }
	    close(REPORT);
	}

	# Write $dir/input-annotation: every input sentence (surface forms, XML
	# markup removed), a tab, and for each phrase found in the phrase table a
	# record "START-END:CORPUS_COUNT:TTABLE_COUNT:ENTROPY".  Reads the
	# file-level %TTABLE_COVERED, %CORPUS_COVERED and %TTABLE_ENTROPY.
	sub input_annotation {
	    open(my $out_fh,">","$dir/input-annotation") or die "Cannot open: $!";
	    open(my $input_fh,"<",$input) or die "Can't read input $input";
	    while(my $line = <$input_fh>) {
	        # chomp, not chop: keep the last character of an unterminated line
	        chomp($line);
	        $line =~ s/\|\S+//g; # remove additional factors
	        $line =~ s/<\S[^>]*>//g; # remove xml markup
	        $line =~ s/\s+/ /g; # remove redundant spaces
	        $line =~ s/^ //;
	        $line =~ s/ $//;
	        print {$out_fh} $line."\t";
	        my @WORD = split(' ',$line);
	        my $sentence_length = scalar @WORD;
	        for(my $start=0;$start < $sentence_length;$start++) {
	            my $phrase = "";
	            for(my $length=1;$length<$MAX_LENGTH && $start+$length<=$sentence_length;$length++) {
	                $phrase .= " " if $length > 1;
	                $phrase .= $WORD[$start+$length-1];

	                my $ttable_covered = $TTABLE_COVERED{$length}{$phrase};
	                my $corpus_covered = $CORPUS_COVERED{$length}{$phrase};
	                next unless defined($ttable_covered) || defined($corpus_covered);
	                my $ttable_entropy = $TTABLE_ENTROPY{$length}{$phrase} || 0;

	                # only phrases present in the phrase table are annotated
	                next unless defined($ttable_covered);
	                $corpus_covered = 0 unless defined($corpus_covered);

	                printf {$out_fh} "%d-%d:%d:%d:%.5f ",$start,$start+$length-1,$corpus_covered,$ttable_covered,$ttable_entropy;
	            }
	        }
	        print {$out_fh} "\n";
	    }
	    close($input_fh);
	    close($out_fh);
	}

	# Extract n-gram counts from a sentence or from a set of alternative
	# sentences and add them to $NGRAM.
	#   $sentence - a sentence string, or an array ref of sentence strings
	#   $NGRAM    - hash ref {length}{ngram} => count, incremented in place
	#   $minmax   - only used for array refs: "min" keeps, per n-gram, the
	#               smallest count over all sentences (n-grams missing from
	#               any sentence are dropped); any other value keeps the
	#               largest count
	sub extract_n_grams_arrayopt {
	    my ($sentence,$NGRAM,$minmax) = @_;

	    # plain sentence: nothing to combine
	    if (ref($sentence) ne 'ARRAY') {
	        &extract_n_grams($sentence,$NGRAM);
	        return;
	    }

	    # start with the first sentence, then fold in the others
	    my %MINMAX_NGRAM;
	    &extract_n_grams($$sentence[0],\%MINMAX_NGRAM);
	    for(my $i=1;$i<scalar(@{$sentence});$i++) {
	        my %SET_NGRAM;
	        &extract_n_grams($$sentence[$i],\%SET_NGRAM);
	        for(my $length=1;$length<=$MAX_LENGTH;$length++) {
	            if ($minmax eq "min") {
	                foreach my $ngram (keys %{$MINMAX_NGRAM{$length}}) {
	                    if (!defined($SET_NGRAM{$length}{$ngram})) {
	                        delete( $MINMAX_NGRAM{$length}{$ngram} );
	                    }
	                    elsif($MINMAX_NGRAM{$length}{$ngram} > $SET_NGRAM{$length}{$ngram}) {
	                        $MINMAX_NGRAM{$length}{$ngram} = $SET_NGRAM{$length}{$ngram};
	                    }
	                }
	            }
	            else {
	                foreach my $ngram (keys %{$SET_NGRAM{$length}}) {
	                    if (!defined($MINMAX_NGRAM{$length}{$ngram}) ||
	                        $SET_NGRAM{$length}{$ngram} > $MINMAX_NGRAM{$length}{$ngram}) {
	                        $MINMAX_NGRAM{$length}{$ngram} = $SET_NGRAM{$length}{$ngram};
	                    }
	                }
	            }
	        }
	    }

	    # add the combined counts to the caller's table
	    for(my $length=1;$length<=$MAX_LENGTH;$length++) {
	        foreach my $ngram (keys %{$MINMAX_NGRAM{$length}}) {
	            $$NGRAM{$length}{$ngram} += $MINMAX_NGRAM{$length}{$ngram};
	        }
	    }
	}

	# Count all n-grams of length 1..$MAX_LENGTH in a sentence.
	#   $sentence - sentence string; line breaks are removed and whitespace
	#               is normalized before the words are split on single spaces
	#   $NGRAM    - hash ref {length}{ngram} => count, incremented in place
	sub extract_n_grams {
	    my ($sentence,$NGRAM) = @_;

	    for ($sentence) {
	        s/[\r\n]+//g;
	        s/\s+/ /g;
	        s/^ //;
	        s/ $//;
	    }

	    my @WORD = split(/ /,$sentence);
	    foreach my $length (1..$MAX_LENGTH) {
	        foreach my $first (0..scalar(@WORD)-$length) {
	            my $ngram = join(" ",@WORD[$first..$first+$length-1]);
	            $NGRAM->{$length}{$ngram}++;
	        }
	    }
	}

	# Report the unigram precision of the translation of each input word,
	# grouped by the coverage count of the input word.
	#
	# Reads the coverage counts from COVERAGE_DIR/TYPE-coverage-by-phrase
	# (COVERAGE_DIR is -coverage if given, otherwise -dir), then walks in
	# parallel through the segmentation file, the input file and the system
	# word alignment file, using the file-level @SYSTEM and @REFERENCE.
	# Writes $dir/precision-by-TYPE-coverage, $dir/precision-by-input-word
	# and, if -precision-by-coverage-factor is set,
	# $dir/precision-by-TYPE-coverage.FACTOR.
	#   $coverage_type - "ttable" or "corpus"
	sub precision_by_coverage {
	    my ($coverage_type) = @_;
	    my (%PREC_BY_WORD,%TOTAL_BY_WORD,%LENGTH_BY_WORD,%DELETED_BY_WORD);
	    my (%PREC_BY_COVERAGE,%TOTAL_BY_COVERAGE,%LENGTH_BY_COVERAGE,%DELETED_BY_COVERAGE);
	    my (%PREC_BY_FACTOR,%TOTAL_BY_FACTOR,%LENGTH_BY_FACTOR,%DELETED_BY_FACTOR);
	    my (%PREC_BY_FACTOR_COVERAGE,%TOTAL_BY_FACTOR_COVERAGE,%LENGTH_BY_FACTOR_COVERAGE,%DELETED_BY_FACTOR_COVERAGE);

	    # get coverage statistics
	    my %COVERAGE;
	    my $coverage_file = (defined($coverage_dir)?$coverage_dir:$dir)
	        ."/$coverage_type-coverage-by-phrase";
	    print STDERR $coverage_file;
	    open(my $coverage_fh,"<",$coverage_file) or die "Cannot open: $!";
	    while(my $entry = <$coverage_fh>) {
	        chomp($entry);
	        my ($phrase,$count) = split(/\t/,$entry);
	        $COVERAGE{$phrase} = $count;
	    }
	    close($coverage_fh);

	    # go through each line...
	    open(my $segmentation_fh,"<",$segmentation) || die("ERROR: could not open segmentation file $segmentation");
	    open(my $input_fh,"<",$input) or die "Can't read input $input";
	    open(my $alignment_fh,"<",$system_alignment) or die "Can't read output alignment file $system_alignment";

	    # get marked up output
	    my $line_count = 0;
	    while(my $line = <$segmentation_fh>) {
	        chomp($line);

	        # get corresponding input line
	        my $input_line = <$input_fh>;
	        $input_line = "" unless defined($input_line);
	        my @INPUT = split(/ /,&get_factor_phrase(0,$input_line)); # surface
	        my @FACTOR = split(/ /,&get_factor_phrase($precision_by_coverage_factor,$input_line));

	        # word alignment: input position => list of output positions
	        my $alignment_line = <$alignment_fh>;
	        $alignment_line = "" unless defined($alignment_line);
	        chomp($alignment_line);
	        my %ALIGNED;
	        foreach my $point (split(/ /,$alignment_line)) {
	            my ($input_pos,$output_pos) = split(/\-/,$point);
	            push @{$ALIGNED{$input_pos}}, $output_pos;
	        }

	        # output words
	        # @SYSTEM is already collected
	        my @OUTPUT = split(/ /,$SYSTEM[$line_count]);

	        # compute precision of each ngram
	        # @REFERENCE (possibly multiple) is already collected
	        my (%SYS_NGRAM,%REF_NGRAM,%PREC_NGRAM);
	        &extract_n_grams( $SYSTEM[$line_count], \%SYS_NGRAM );
	        &extract_n_grams_arrayopt( $REFERENCE[$line_count++], \%REF_NGRAM, "max" );
	        foreach my $ngram (keys %{$SYS_NGRAM{1}}) { # note: only interested in unigram precision
	            $PREC_NGRAM{1}{$ngram} = 0;
	            if (defined($REF_NGRAM{1}) &&
	                defined($REF_NGRAM{1}{$ngram})) {
	                my $ref_count = $REF_NGRAM{1}{$ngram};
	                my $sys_count = $SYS_NGRAM{1}{$ngram};
	                $PREC_NGRAM{1}{$ngram} =
	                    ($ref_count >= $sys_count) ? 1 : $ref_count/$sys_count;
	            }
	        }

	        # process one phrase at a time: each iteration consumes
	        # "OUTPUT WORDS |FROM-TO|" from the front of the line and keeps
	        # the remainder of the line for the next iteration
	        my $output_pos = 0;
	        while($line =~ /([^|]+) \|(\d+)\-(\d+)\|\s*(.*)$/) {
	            my ($output,$from,$to) = ($1,$2,$3);
	            $line = $4;
	            my @OUTPUT_PHRASE = split(/ /,$output);

	            # bug fix: 1-1 unknown word mappings get alignment point
	            if ($from == $to && # one
	                scalar(@OUTPUT_PHRASE) == 1 && # to one
	                !defined($ALIGNED{$from})) { # but not aligned
	                push @{$ALIGNED{$from}},$output_pos;
	            }
	            $output_pos += scalar(@OUTPUT_PHRASE);

	            # compute precision for each word
	            for(my $i=$from; $i<=$to; $i++) {
	                my $coverage = 0;
	                $coverage = $COVERAGE{$INPUT[$i]} if defined($COVERAGE{$INPUT[$i]});

	                my ($precision,$deleted,$length) = (0,0,0);

	                # unaligned? note as deleted
	                if (!defined($ALIGNED{$i})) {
	                    $deleted = 1;
	                }
	                # aligned
	                else {
	                    foreach my $o (@{$ALIGNED{$i}}) {
	                        $precision += $PREC_NGRAM{1}{$OUTPUT[$o]};
	                    }
	                    $precision /= scalar(@{$ALIGNED{$i}}); # average, if multi-aligned
	                    $length = scalar(@{$ALIGNED{$i}});
	                }

	                my $word = $INPUT[$i];
	                $word .= "\t".$FACTOR[$i] if $precision_by_coverage_factor;
	                $DELETED_BY_WORD{$word} += $deleted;
	                $PREC_BY_WORD{$word} += $precision;
	                $LENGTH_BY_WORD{$word} += $length;
	                $TOTAL_BY_WORD{$word}++;

	                $DELETED_BY_COVERAGE{$coverage} += $deleted;
	                $PREC_BY_COVERAGE{$coverage} += $precision;
	                $LENGTH_BY_COVERAGE{$coverage} += $length;
	                $TOTAL_BY_COVERAGE{$coverage}++;

	                if ($precision_by_coverage_factor) {
	                    $DELETED_BY_FACTOR{$FACTOR[$i]} += $deleted;
	                    $DELETED_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $deleted;
	                    $PREC_BY_FACTOR{$FACTOR[$i]} += $precision;
	                    $PREC_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $precision;
	                    $LENGTH_BY_FACTOR{$FACTOR[$i]} += $length;
	                    $LENGTH_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage} += $length;
	                    $TOTAL_BY_FACTOR{$FACTOR[$i]}++;
	                    $TOTAL_BY_FACTOR_COVERAGE{$FACTOR[$i]}{$coverage}++;
	                }
	            }
	        }
	    }
	    close($segmentation_fh);
	    close($input_fh);
	    close($alignment_fh);

	    # columns: coverage, summed precision, deleted, summed length, total
	    open(REPORT,">$dir/precision-by-$coverage_type-coverage") or die "Cannot open: $!";
	    foreach my $coverage (sort {$a <=> $b} keys %TOTAL_BY_COVERAGE) {
	        printf REPORT "%d\t%.3f\t%d\t%d\t%d\n", $coverage, $PREC_BY_COVERAGE{$coverage}, $DELETED_BY_COVERAGE{$coverage}, $LENGTH_BY_COVERAGE{$coverage}, $TOTAL_BY_COVERAGE{$coverage};
	    }
	    close(REPORT);

	    open(REPORT,">$dir/precision-by-input-word") or die "Cannot open: $!";
	    foreach my $word (keys %TOTAL_BY_WORD) {
	        my ($w,$f) = split(/\t/,$word);
	        my $coverage = 0;
	        $coverage = $COVERAGE{$w} if defined($COVERAGE{$w});
	        printf REPORT "%.3f\t%d\t%d\t%d\t%d\t%s\n", $PREC_BY_WORD{$word}, $DELETED_BY_WORD{$word}, $LENGTH_BY_WORD{$word}, $TOTAL_BY_WORD{$word},$coverage,$word;
	    }
	    close(REPORT);

	    if ($precision_by_coverage_factor) {
	        open(REPORT,">$dir/precision-by-$coverage_type-coverage.$precision_by_coverage_factor") or die "Cannot open: $!";
	        foreach my $factor (sort keys %TOTAL_BY_FACTOR_COVERAGE) {
	            foreach my $coverage (sort {$a <=> $b} keys %{$TOTAL_BY_FACTOR_COVERAGE{$factor}}) {
	                printf REPORT "%s\t%d\t%.3f\t%d\t%d\t%d\n", $factor, $coverage, $PREC_BY_FACTOR_COVERAGE{$factor}{$coverage}, $DELETED_BY_FACTOR_COVERAGE{$factor}{$coverage}, $LENGTH_BY_FACTOR_COVERAGE{$factor}{$coverage}, $TOTAL_BY_FACTOR_COVERAGE{$factor}{$coverage};
	            }
	        }
	        close(REPORT);
	    }
	}

	# Analyze the segmentation markup of the system output.
	#
	# Every line of the segmentation file consists of output words followed
	# by markers "|FROM-TO|" that name the input span of the preceding words.
	# Writes, per line, records "FROM:TO:OUT_FROM:OUT_TO" to
	# $dir/segmentation-annotation, and a count of phrase pairs by input
	# length and output length to $dir/segmentation.
	sub segmentation {
	    my %SEGMENTATION;

	    open(my $segmentation_fh,"<",$segmentation) || die("ERROR: could not open segmentation file $segmentation");
	    open(my $out_fh,">","$dir/segmentation-annotation") or die "Cannot open: $!";
	    while(my $line = <$segmentation_fh>) {
	        chomp($line);
	        my $count=0; # output words since the last marker
	        my $out = -1; # position of the last output word seen
	        foreach my $token (split(' ',$line)) {
	            if ($token =~ /^\|(\d+)\-(\d+)\|$/) {
	                print {$out_fh} " " unless $out-($count-1) == 0;
	                printf {$out_fh} "%d:%d:%d:%d",$1,$2,$out-($count-1),$out;
	                my $in_count = $2-$1+1;
	                $SEGMENTATION{$in_count}{$count}++;
	                $count = 0;
	            }
	            else {
	                $out++;
	                $count++;
	            }
	        }
	        print {$out_fh} "\n";
	    }
	    close($out_fh);
	    close($segmentation_fh);

	    open(my $summary_fh,">","$dir/segmentation") or die "Cannot open: $!";
	    foreach my $in (sort { $a <=> $b } keys %SEGMENTATION) {
	        foreach my $out (sort { $a <=> $b } keys %{$SEGMENTATION{$in}}) {
	            printf {$summary_fh} "%d\t%d\t%d\n", $in, $out, $SEGMENTATION{$in}{$out};
	        }
	    }
	    close($summary_fh);

	    # TODO: error by segmentation
	}

	# analyze the trace file to collect statistics over the
	# hierarchical derivations and also create segmentation annotation
	#
	# Reads SEGMENTATION.trace, groups its lines by sentence number and hands
	# every group to hs_process, which writes to the INPUT_TREE, OUTPUT_TREE
	# and NODE handles opened here ($dir/input-tree, $dir/output-tree,
	# $dir/node).  The collected statistics are written to $dir/rule.
	sub hierarchical_segmentation {
	    my $last_sentence = -1;
	    my @DERIVATION;
	    my %STATS;
	    open(my $trace_fh,"<",$segmentation.".trace") or die "Cannot open: $!";
	    open(INPUT_TREE,">$dir/input-tree") or die "Cannot open: $!";
	    open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!";
	    open(NODE,">$dir/node") or die "Cannot open: $!";
	    while(my $line = <$trace_fh>) {
	        my $sentence;
	        my %ITEM;
	        &hs_scan_line($line, \$sentence, \%ITEM) || die("cannot scan line $line");
	        # sentence number changed: the previous derivation is complete
	        if ($last_sentence >= 0 && $sentence != $last_sentence) {
	            &hs_process($last_sentence,\@DERIVATION,\%STATS);
	            @DERIVATION = ();
	        }
	        push @DERIVATION,\%ITEM;
	        $last_sentence = $sentence;
	    }
	    # process the final derivation; an empty trace file has none
	    &hs_process($last_sentence,\@DERIVATION,\%STATS) if $last_sentence >= 0;
	    close($trace_fh);
	    close(NODE);
	    close(INPUT_TREE);
	    close(OUTPUT_TREE);

	    open(my $summary_fh,">","$dir/rule") or die "Cannot open: $!";
	    print {$summary_fh} "sentence-count\t".(++$last_sentence)."\n";
	    print {$summary_fh} "glue-rule\t".($STATS{'glue-rule'} || 0)."\n";
	    print {$summary_fh} "depth\t".($STATS{'depth'} || 0)."\n";
	    foreach my $rule_type (keys %{$STATS{'rule-type'}}) {
	        print {$summary_fh} "rule\t$rule_type\t".$STATS{'rule-type'}{$rule_type}."\n";
	    }
	    close($summary_fh);
	}

	# scan a single line of the trace file
	#
	#   $line         - one line of the trace file (old "Trans Opt ..."
	#                   format or new "N ||| ... ||| ..." format)
	#   $ref_sentence - scalar ref, set to the sentence number
	#   $ref_item     - hash ref, filled with 'start', 'end', 'rule_lhs',
	#                   'rule_rhs' (array), 'spans' (array of hashes with
	#                   'from', 'to', 'word'), 'alignment' (target position
	#                   => source position) and 'alignedSpan'
	# Returns 1 on success and 0 if the line matches neither format.
	#
	# NOTE(review): the amount of whitespace inside the patterns below is
	# reproduced as found; confirm it against real decoder trace output.
	sub hs_scan_line {
	    my ($line,$ref_sentence,$ref_item) = @_;

	    if ($line =~ /^Trans Opt/) {
	        # Old format
	        $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>(.+) :([,\d\- ]*): pC=[\d\.\-e]+, c=/ ||
	        $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([,\d\- ]*): c=/ ||
	        $line =~ /^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+) : (\S+) \-\>\S+ \-\> (.+) :([,\d\- ]*): term=.*: nonterm=.*: c=/ || return 0;
	        my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);

	        ${$ref_sentence} = $sentence;

	        $ref_item->{'start'} = $start;
	        $ref_item->{'end'} = $end;
	        $ref_item->{'rule_lhs'} = $rule_lhs;

	        # undo the escaping of angle brackets
	        $rule_rhs =~ s/&lt;/</g;
	        $rule_rhs =~ s/&gt;/>/g;
	        @{$ref_item->{'rule_rhs'}} = split(/ /,$rule_rhs);

	        foreach my $point (split(/ /,$alignment)) {
	            $point =~ /(\d+)[\-,](\d+)/ || die("funny alignment: $point\n");
	            $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
	            $ref_item->{'alignedSpan'}{$1} = 1;
	        }

	        @{$ref_item->{'spans'}} = ();
	        foreach my $span (reverse split(/\s+/,$spans)) {
	            $span =~ /\[(\d+)\.\.(\d+)\]=(\S+)$/ || die("funny span: $span\n");
	            my %SPAN = ( 'from' => $1, 'to' => $2, 'word' => $3 );
	            push @{$ref_item->{'spans'}}, \%SPAN;
	        }
	    } else {
	        # New format:
	        # N ||| [LHS] -> SOURCE RHS ||| [LHS] -> TARGET RHS ||| ALIGNMENT ||| SPANS
	        $line =~ /^(\d+) \|\|\| \[\S+\] -> (.+) \|\|\| \[(\S+)\] -> (.+) \|\|\| (.*)\|\|\| (.*)/ || return 0;
	        my ($sentence,$source_rhs,$target_lhs,$target_rhs,$alignment,$source_spans) = ($1,$2,$3,$4,$5,$6);

	        ${$ref_sentence} = $sentence;

	        @{$ref_item->{'spans'}} = ();
	        foreach my $token (split(/ /,$source_rhs)) {
	            # strip the brackets of non-terminals; a token that does not
	            # match is kept as it is (instead of reusing a stale $1)
	            my $word = ($token =~ /^\[?([^\]]+)\]?$/) ? $1 : $token;
	            my %SPAN = ( 'word' => $word );
	            push @{$ref_item->{'spans'}}, \%SPAN;
	        }

	        my $i = 0;
	        foreach my $span (split(/ /,$source_spans)) {
	            $span =~ /(\d+)\.\.(\d+)/ || die("funny span: $span\n");
	            $ref_item->{'spans'}[$i]{'from'} = $1;
	            $ref_item->{'spans'}[$i]{'to'} = $2;
	            if ($i == 0) {
	                $ref_item->{'start'} = $1;
	            }
	            $ref_item->{'end'} = $2;
	            $i++;
	        }

	        $ref_item->{'rule_lhs'} = $target_lhs;

	        # undo the escaping of angle brackets
	        $target_rhs =~ s/&lt;/</g;
	        $target_rhs =~ s/&gt;/>/g;
	        @{$ref_item->{'rule_rhs'}} = ();
	        foreach my $token (split(/ /,$target_rhs)) {
	            my $word = ($token =~ /^\[?([^\]]+)\]?$/) ? $1 : $token;
	            push @{$ref_item->{'rule_rhs'}}, $word;
	        }

	        foreach my $point (split(/ /,$alignment)) {
	            $point =~ /(\d+)[\-,](\d+)/ || die("funny alignment: $point\n");
	            $ref_item->{'alignment'}{$2} = $1; # target non-terminal to source span
	            $ref_item->{'alignedSpan'}{$1} = 1;
	        }
	    }

	    return 1;
	}

	# process a single sentence for hierarchical segmentation
	#
	#   $sentence   - sentence number, used as prefix of every output line
	#   $DERIVATION - array ref of rule items as filled by hs_scan_line; the
	#                 array is modified (first rule dropped, leading glue
	#                 rules replaced by a single combined rule)
	#   $STATS      - hash ref; 'glue-rule', 'depth' and 'rule-type' are
	#                 updated
	# Writes to the OUTPUT_TREE, INPUT_TREE and NODE file handles, which must
	# have been opened by the caller.
	sub hs_process {
	    my ($sentence,$DERIVATION,$STATS) = @_;

	    shift @{$DERIVATION}; # get rid of S -> S </s>
	    if (!@{$DERIVATION}) {
	        print STDERR "warning: empty derivation for sentence $sentence\n";
	        return;
	    }
	    my $max = $$DERIVATION[0]{'end'};

	    # consolidate glue rules into one rule
	    my %GLUE_RULE;
	    $GLUE_RULE{'start'} = 1;
	    $GLUE_RULE{'end'} = $max;
	    $GLUE_RULE{'rule_lhs'} = "S";
	    $GLUE_RULE{'depth'} = 0;
	    # start with empty lists, so the combined rule can be traversed even
	    # if the derivation contains no glue rule
	    $GLUE_RULE{'rule_rhs'} = [];
	    $GLUE_RULE{'spans'} = [];
	    my $x=0;
	    # stop at the first non-glue rule, or when the derivation is used up
	    while(@{$DERIVATION}) {
	        my $RULE = shift @{$DERIVATION};
	        my @RHS = @{$$RULE{'rule_rhs'} || []};
	        my $is_glue =
	            (scalar(@RHS) == 2 &&
	             $$RULE{'rule_lhs'} eq "S" &&
	             $RHS[0] eq "S" &&
	             $RHS[1] eq "X") ||
	            ($$RULE{'rule_lhs'} eq "Q" &&
	             defined($RHS[0]) && $RHS[0] eq "Q");
	        if (!$is_glue) {
	            unshift @{$DERIVATION}, $RULE;
	            last;
	        }
	        unshift @{$GLUE_RULE{'spans'}},$$RULE{'spans'}[1];
	        push @{$GLUE_RULE{'rule_rhs'}}, $RHS[1];
	        $GLUE_RULE{'alignment'}{$x} = $x;
	        $GLUE_RULE{'alignedSpan'}{$x} = 1;
	        $x++;
	    }
	    unshift @{$DERIVATION}, \%GLUE_RULE;
	    $$STATS{'glue-rule'} += $x;

	    # create chart: rules indexed by their start and end position
	    my %CHART;
	    foreach my $RULE (@{$DERIVATION}) {
	        $CHART{$$RULE{'start'}}{$$RULE{'end'}} = $RULE;
	    }

	    # compute depth
	    &hs_compute_depth(1,$max,0,\%CHART);
	    my $max_depth = 0;
	    foreach my $RULE (@{$DERIVATION}) {
	        next unless defined($$RULE{'depth'}); # better: delete offending rule S -> S <s>
	        $max_depth = $$RULE{'depth'} if $$RULE{'depth'} > $max_depth;
	    }
	    &hs_recompute_depth(1,$max,\%CHART,$max_depth);
	    $$STATS{'depth'} += $max_depth;

	    # build matrix of divs

	    my @MATRIX;
	    &hs_create_out_span(1,$max,\%CHART,\@MATRIX);
	    print OUTPUT_TREE &hs_output_matrix($sentence,\@MATRIX,$max_depth);

	    my @MATRIX_IN;
	    &hs_create_in_span(1,$max,\%CHART,\@MATRIX_IN);
	    print INPUT_TREE &hs_output_matrix($sentence,\@MATRIX_IN,$max_depth);

	    # number rules and get their children
	    my $id = 0;
	    foreach my $RULE (@{$DERIVATION}) {
	        next unless defined($$RULE{'start_div'}); # better: delete offending rule S -> S <s>
	        $$STATS{'rule-type'}{&hs_rule_type($RULE)}++ if $id>0;
	        $$RULE{'id'} = $id++;
	    }
	    &hs_get_children(1,$max,\%CHART);

	    # one line per rule: sentence, depth, output div range, input div
	    # range, ids of the child rules
	    foreach my $RULE (@{$DERIVATION}) {
	        next unless defined($$RULE{'start_div'}); # better: delete offending rule S -> S <s>

	        print NODE $sentence." ";
	        print NODE $$RULE{'depth'}." ";
	        print NODE $$RULE{'start_div'}." ".$$RULE{'end_div'}." ";
	        print NODE $$RULE{'start_div_in'}." ".$$RULE{'end_div_in'}." ";
	        print NODE join(",",@{$$RULE{'children'}})."\n";
	    }
	}

	# Render a list of spans as text, one line per span:
	#   SENTENCE <tab> one character per depth level <tab> LHS <tab> RHS
	# The character at the span's own depth is "[" (span opens), "]" (span
	# closes here), "O" (both) or "-" (neither).  At other depths it is "]"
	# if the span closes that depth, "-" while a span of that depth is still
	# open, and a blank otherwise.
	#   $sentence  - sentence number, prefix of every line
	#   $MATRIX    - array ref of span hashes ('depth', optional 'opening',
	#                'closing' (hash keyed by depth), 'lhs', 'rhs')
	#   $max_depth - deepest level to draw
	# Returns the rendered text.
	sub hs_output_matrix {
	    my ($sentence,$MATRIX,$max_depth) = @_;
	    my @is_open = (0) x ($max_depth+1);
	    my $text = "";
	    foreach my $span (@$MATRIX) {
	        my $closing = $span->{'closing'};
	        my $opens = defined($span->{'opening'});

	        my @cell;
	        foreach my $d (0..$max_depth) {
	            my $closes = (defined($closing) && defined($closing->{$d})) ? 1 : 0;
	            if ($d == $span->{'depth'}) {
	                push @cell, $opens ? ($closes ? "O" : "[")
	                                   : ($closes ? "]" : "-");
	            }
	            else {
	                push @cell, $closes ? "]" : ($is_open[$d] ? "-" : " ");
	            }
	        }

	        $text .= $sentence."\t".join("",@cell)."\t";
	        $text .= $span->{'lhs'} if defined($span->{'lhs'});
	        $text .= "\t";
	        $text .= $span->{'rhs'} if defined($span->{'rhs'});
	        $text .= "\n";

	        # update the open/closed state for the following lines
	        $is_open[$span->{'depth'}] = 1 if $opens;
	        if (defined($closing)) {
	            foreach my $d (0..$max_depth) {
	                $is_open[$d] = 0 if defined($closing->{$d});
	            }
	        }
	    }
	    return $text;
	}

	# Build a signature string that describes the shape of a rule:
	#   OUTPUT_PATTERN:OUTPUT_WORDS:NONTERMINALS:INPUT_PATTERN:INPUT_WORDS
	# In a pattern, every run of words is written as its length and every
	# non-terminal as a letter ("a" for the first non-terminal on the output
	# side, "b" for the second, ...); the input pattern uses the letter of
	# the aligned output non-terminal.
	#   $RULE - hash ref with 'rule_rhs', 'alignment', 'spans', 'alignedSpan'
	sub hs_rule_type {
	    my ($RULE) = @_;

	    my $signature = "";
	    my %letter_of; # source position => letter of the non-terminal
	    my ($run,$words,$nonterminals) = (0,0,0);

	    # output side
	    foreach my $pos (0..scalar(@{$RULE->{'rule_rhs'}})-1) {
	        my $source_pos = $RULE->{'alignment'}{$pos};
	        if (defined($source_pos)) {
	            $signature .= $run if $run > 0;
	            $run = 0;
	            my $letter = chr(97+$nonterminals++);
	            $letter_of{$source_pos} = $letter;
	            $signature .= $letter;
	        }
	        else {
	            $run++;
	            $words++;
	        }
	    }
	    $signature .= $run if $run > 0;
	    $signature .= ":".$words.":".$nonterminals.":";

	    # input side
	    ($run,$words) = (0,0);
	    foreach my $pos (0..scalar(@{$RULE->{'spans'}})-1) {
	        if (defined($RULE->{'alignedSpan'}{$pos})) {
	            $signature .= $run if $run > 0;
	            $run = 0;
	            $signature .= $letter_of{$pos};
	        }
	        else {
	            $run++;
	            $words++;
	        }
	    }
	    $signature .= $run if $run > 0;
	    return $signature.":".$words;
	}

	# compute depth of each node
	#
	# Stores $depth in the rule that covers ($start,$end) and recurses, with
	# $depth+1, into the source spans of all its non-terminals.  A span
	# without a rule in the chart is reported on STDERR and skipped.
	#   $CHART - hash ref {start}{end} => rule
	sub hs_compute_depth {
	    my ($start,$end,$depth,$CHART) = @_;
	    my $RULE = $CHART->{$start}{$end};
	    unless (defined($RULE)) {
	        print STDERR "warning: illegal span ($start,$end)\n";
	        return;
	    }

	    $RULE->{'depth'} = $depth;

	    foreach my $i (0..scalar(@{$RULE->{'rule_rhs'}})-1) {
	        # only non-terminals have an alignment entry
	        my $source_pos = $RULE->{'alignment'}{$i};
	        next unless defined($source_pos);
	        my $SUBSPAN = $RULE->{'spans'}[$source_pos];
	        hs_compute_depth($SUBSPAN->{'from'},$SUBSPAN->{'to'},$depth+1,$CHART);
	    }
	}

	# re-assign depth to as deep as possible
	#
	# A rule without non-terminals gets depth $max_depth; every other rule
	# gets one less than the smallest depth among its children.  Returns the
	# new depth of the rule covering ($start,$end), or 0 (after a warning on
	# STDERR) if the chart has no rule for that span.
	#   $CHART - hash ref {start}{end} => rule
	sub hs_recompute_depth {
	    my ($start,$end,$CHART,$max_depth) = @_;
	    my $RULE = $CHART->{$start}{$end};
	    unless (defined($RULE)) {
	        print STDERR "warning: illegal span ($start,$end)\n";
	        return 0;
	    }

	    my $shallowest_child = $max_depth+1;
	    foreach my $i (0..scalar(@{$RULE->{'rule_rhs'}})-1) {
	        # only non-terminals have an alignment entry
	        my $source_pos = $RULE->{'alignment'}{$i};
	        next unless defined($source_pos);
	        my $SUBSPAN = $RULE->{'spans'}[$source_pos];
	        my $child_depth = hs_recompute_depth($SUBSPAN->{'from'},$SUBSPAN->{'to'},$CHART,$max_depth);
	        $shallowest_child = $child_depth if $child_depth < $shallowest_child;
	    }
	    $RULE->{'depth'} = $shallowest_child-1;
	    return $RULE->{'depth'};
	}

	# Record, for the rule at $CHART->{$start}{$end} and recursively for all rules
	# below it, the ids of its direct child rules in the rule's 'children' field
	# (an array ref, in right-hand-side order).
	# Returns the 'id' of the rule at this span, or -1 (after a warning on STDERR)
	# if the span is missing from the chart.
	sub hs_get_children {
	  my ($start,$end,$CHART) = @_;

	  my $node = $CHART->{$start}{$end};
	  unless (defined $node) {
	    print STDERR "warning: illegal span ($start,$end)\n";
	    return -1;
	  }

	  # attach the (still empty) list first, then fill it while descending
	  my $child_ids = [];
	  $node->{'children'} = $child_ids;

	  my $rhs_size = scalar @{ $node->{'rule_rhs'} };
	  foreach my $pos (0 .. $rhs_size - 1) {
	    # only positions with an alignment entry are non-terminals
	    my $span_index = $node->{'alignment'}{$pos};
	    next unless defined $span_index;

	    my $below = $node->{'spans'}[$span_index];
	    my $id = hs_get_children($below->{'from'}, $below->{'to'}, $CHART);
	    next if $id == -1;     # child span was not in the chart
	    push @{$child_ids}, $id;
	  }
	  return $node->{'id'};
	}

	# Append the span annotation of the rule at $CHART->{$start}{$end} to @$MATRIX,
	# walking the rule's right-hand side in output order.
	# The rule contributes an opening entry (with 'lhs' and 'opening'), each
	# non-terminal contributes the entries of the rule below it, and each run of
	# words that follows a non-terminal starts a fresh entry. Words are collected
	# space-separated in the entry's 'rhs' field. The last entry produced is
	# flagged in 'closing' under the rule's depth, and the rule remembers its
	# first and last entry index as 'start_div' and 'end_div'.
	# A span that is missing from the chart triggers a warning on STDERR.
	sub hs_create_out_span {
	  my ($start,$end,$CHART,$MATRIX) = @_;

	  my $node = $CHART->{$start}{$end};
	  unless (defined $node) {
	    print STDERR "warning: illegal span ($start,$end)\n";
	    return;
	  }
	  my $depth = $node->{'depth'};

	  # opening entry of this rule; leading words are attached to it directly
	  my $current = {
	    'start'   => $start,
	    'end'     => $end,
	    'depth'   => $depth,
	    'lhs'     => $node->{'rule_lhs'},
	    'opening' => 1,
	  };
	  push @{$MATRIX}, $current;
	  $node->{'start_div'} = $#{$MATRIX};

	  # in output order ...
	  my $in_word_run = 1;
	  my $rhs_size = scalar @{ $node->{'rule_rhs'} };
	  foreach my $pos (0 .. $rhs_size - 1) {
	    my $span_index = $node->{'alignment'}{$pos};

	    # non-terminal: descend, which ends any run of words
	    if (defined $span_index) {
	      my $below = $node->{'spans'}[$span_index];
	      hs_create_out_span($below->{'from'}, $below->{'to'}, $CHART, $MATRIX);
	      $in_word_run = 0;
	      next;
	    }

	    # word right after a non-terminal: start a new entry for the run
	    unless ($in_word_run) {
	      $current = { 'start' => $start, 'end' => $end, 'depth' => $depth };
	      push @{$MATRIX}, $current;
	      $in_word_run = 1;
	    }
	    $current->{'rhs'} .= " " if defined $current->{'rhs'};
	    $current->{'rhs'} .= $node->{'rule_rhs'}[$pos];
	  }

	  # whatever entry came last (ours or a descendant's) closes this rule
	  $node->{'end_div'} = $#{$MATRIX};
	  $MATRIX->[-1]{'closing'}{$depth} = 1;
	}

	# Append the span annotation of the rule at $CHART->{$start}{$end} to @$MATRIX,
	# walking the rule's 'spans' in input order.
	# The rule contributes an opening entry (with 'lhs' and 'opening'), each span
	# listed in 'alignedSpan' contributes the entries of the rule below it, and
	# each run of words that follows such a span starts a fresh entry. Words
	# (the 'word' field of a span) are collected space-separated in the entry's
	# 'rhs' field. The last entry produced is flagged in 'closing' under the
	# rule's depth, and the rule remembers its first and last entry index as
	# 'start_div_in' and 'end_div_in'.
	# A span that is missing from the chart triggers a warning on STDERR.
	sub hs_create_in_span {
	  my ($start,$end,$CHART,$MATRIX) = @_;

	  my $node = $CHART->{$start}{$end};
	  unless (defined $node) {
	    print STDERR "warning: illegal span ($start,$end)\n";
	    return;
	  }
	  my $depth = $node->{'depth'};

	  # opening entry of this rule; leading words are attached to it directly
	  my $current = {
	    'start'   => $start,
	    'end'     => $end,
	    'depth'   => $depth,
	    'lhs'     => $node->{'rule_lhs'},
	    'opening' => 1,
	  };
	  push @{$MATRIX}, $current;
	  $node->{'start_div_in'} = $#{$MATRIX};

	  # in input order ...
	  my $in_word_run = 1;
	  my $span_count = scalar @{ $node->{'spans'} };
	  foreach my $idx (0 .. $span_count - 1) {
	    my $piece = $node->{'spans'}[$idx];

	    # aligned span: descend, which ends any run of words
	    if (defined $node->{'alignedSpan'}{$idx}) {
	      hs_create_in_span($piece->{'from'}, $piece->{'to'}, $CHART, $MATRIX);
	      $in_word_run = 0;
	      next;
	    }

	    # word right after an aligned span: start a new entry for the run
	    unless ($in_word_run) {
	      $current = { 'start' => $start, 'end' => $end, 'depth' => $depth };
	      push @{$MATRIX}, $current;
	      $in_word_run = 1;
	    }
	    $current->{'rhs'} .= " " if defined $current->{'rhs'};
	    $current->{'rhs'} .= $piece->{'word'};
	  }

	  # whatever entry came last (ours or a descendant's) closes this rule
	  $node->{'end_div_in'} = $#{$MATRIX};
	  $MATRIX->[-1]{'closing'}{$depth} = 1;
	}

	# Split a search graph dump into one file per sentence.
	#
	# Reads $search_graph_file line by line and writes every hypothesis as one
	# tab-separated record to $dir/search-graph/graph.<sentence> ($dir is the
	# file-level results directory). The record fields are, in order:
	#   id, recomb (token after an optional "->" following the id, 0 if absent),
	#   from, to, output, alignment, children, rule score, heuristic rule score,
	#   hypothesis score, lhs
	# Lines of the same sentence are expected to be contiguous: a new output file
	# is started whenever the sentence number changes.
	#
	# Dies if the input cannot be opened, an output file cannot be opened or
	# closed, or a line matches none of the three accepted layouts.
	sub process_search_graph {
	  my ($search_graph_file) = @_;
	  open(my $graph_fh, '<', $search_graph_file)
	    || die("ERROR: could not open search graph file '$search_graph_file'");
	  `mkdir -p $dir/search-graph`;

	  my $sentence_fh;          # output handle of the sentence currently written
	  my $last_sentence = -1;
	  while (my $line = <$graph_fh>) {
	    my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score);

	    # layout with separate pC= (rule) and c= (heuristic) scores, ending in "<<"
	    if ($line =~ /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</) {
	      ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
	    }
	    # layouts with a single c= score, without or with a "core=(...)" field
	    elsif ($line =~ /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\de\-\.]+)\] core/ ||
	           $line =~ /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) core=\(.*\) \[(\d+)\.\.(\d+)\] (.*)\[total=([\de\-\.]+)\] core/) {
	      ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11);
	      $heuristic_rule_score = $rule_score; # hmmmm....
	    }
	    else {
	      die("ERROR: buggy search graph line: $line");
	    }

	    # drop the last character of the captured alignment / children lists
	    # (presumably a trailing separator -- TODO confirm against decoder output)
	    chop($alignment) if $alignment;
	    chop($children) if $children;
	    $recomb = 0 unless $recomb;
	    $children = "" unless defined $children;
	    $alignment = "" unless defined $alignment;

	    # sentence changed: finish the previous file and start the next one
	    if ($last_sentence != $sentence) {
	      if (defined $sentence_fh) {
	        close($sentence_fh)
	          || die("ERROR: could not close search graph file for sentence $last_sentence: $!");
	      }
	      my $sentence_file = "$dir/search-graph/graph.$sentence";
	      open($sentence_fh, '>', $sentence_file)
	        || die("ERROR: could not open search graph output file '$sentence_file': $!");
	      $last_sentence = $sentence;
	    }
	    print {$sentence_fh} "$id\t$recomb\t$from\t$to\t$output\t$alignment\t$children\t$rule_score\t$heuristic_rule_score\t$hyp_score\t$lhs\n";
	  }
	  close($graph_fh);

	  # no handle exists if the search graph was empty
	  if (defined $sentence_fh) {
	    close($sentence_fh)
	      || die("ERROR: could not close search graph file for sentence $last_sentence: $!");
	  }
	}