File size: 5,969 Bytes
158b61b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
#!/usr/bin/env perl
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($RealBin);
# Command-line options.  Every variable stays undefined unless its option is
# given, except $UNPARSEABLE, which defaults to 0 because it is compared
# numerically further down.
my ($EGRET_DIR,               # -egret-dir DIR: holds the egret binary and eng_grammar
    $MOSES_DIR,               # -moses-dir DIR: bin/postprocess-egret-forests lives below it
    $TREE_CONVERTER,          # -tree-converter PATH: tree-converter executable
    $FOREST,                  # -forest: produce forests instead of trees
    $SPLIT_HYPHEN,            # -split-hyphen: handle @-@ tokens
    $SPLIT_SLASH,             # -split-slash: handle @/@ tokens
    $MARK_SPLIT,              # -mark-split: accepted; not referenced in the visible script
    $BINARIZE,                # -binarize: accepted; not referenced in the visible script
    $UNPARSEABLE,             # -unparseable: echo the input line when a parse is empty
    $RAW_IN,                  # -raw-in PATH: read parser output from PATH instead of parsing
    $RAW_OUT,                 # -raw-out PATH: also save the raw parser output to PATH
    $EGRET_OPTIONS,           # -egret-options OPTIONS: appended to the egret command
    $TREE_CONVERTER_OPTIONS); # -tree-converter-options OPTIONS: extra tree-converter options

$UNPARSEABLE = 0;

my $usage = "ERROR: syntax is: parse-en-egret.perl [-forest] [-split-hyphen] [-split-slash] [-mark-split] [-binarize] [-unparseable] [-raw-in PATH] [-raw-out PATH] [-egret-options OPTIONS] [-tree-converter-options OPTIONS] -egret-dir DIR -moses-dir DIR -tree-converter PATH < in > out\n";

# GetOptions returns false when the command line could not be parsed.
my $options_ok = GetOptions(
  'egret-dir=s' => \$EGRET_DIR,
  'moses-dir=s' => \$MOSES_DIR,
  'tree-converter=s' => \$TREE_CONVERTER,
  'forest' => \$FOREST,
  'split-hyphen' => \$SPLIT_HYPHEN,
  'split-slash' => \$SPLIT_SLASH,
  'mark-split' => \$MARK_SPLIT,
  'binarize' => \$BINARIZE,
  'unparseable' => \$UNPARSEABLE,
  'raw-in=s' => \$RAW_IN,
  'raw-out=s' => \$RAW_OUT,
  'egret-options=s' => \$EGRET_OPTIONS,
  'tree-converter-options=s' => \$TREE_CONVERTER_OPTIONS
);

# The three path options are mandatory.
die($usage)
  unless $options_ok
  && defined($EGRET_DIR) && defined($MOSES_DIR) && defined($TREE_CONVERTER);

# Fail early if any of the external components is missing.
die("ERROR: could not find egret directory: '$EGRET_DIR'\n") unless -d $EGRET_DIR;
die("ERROR: could not find moses directory: '$MOSES_DIR'\n") unless -d $MOSES_DIR;
die("ERROR: file not found or not executable: '$TREE_CONVERTER'\n") unless -x $TREE_CONVERTER;
# Pre-processing.
#
# Temporary files (made unique per run by appending the process ID, $$):
#   $tmpEscaped     - verbatim copy of every input line
#   $tmpDeescaped   - output of deescape-special-chars.perl run on the
#                     re-joined sentences; later passed to egret via -i=
#   $tmpSplitPoints - one line per sentence recording where joined tokens
#                     have to be split again in post-processing
my $tmpEscaped = "/tmp/parse-en-egret.1.$$";
my $tmpDeescaped = "/tmp/parse-en-egret.2.$$";
my $tmpSplitPoints = "/tmp/parse-en-egret.3.$$";

# The redirection means this command string is run through the shell.
my $deescapeCommand =
  "$RealBin/../../tokenizer/deescape-special-chars.perl > $tmpDeescaped";

open(my $escapedFh, '>>', $tmpEscaped)
  or die("ERROR: could not open '$tmpEscaped' for writing: $!\n");
open(my $deescapedFh, '|-', $deescapeCommand)
  or die("ERROR: could not run '$deescapeCommand': $!\n");
open(my $splitPointsFh, '>>', $tmpSplitPoints)
  or die("ERROR: could not open '$tmpSplitPoints' for writing: $!\n");

# Unsplit hyphen and slashes and write a file indicating where split points
# are required in later post-processing.
while (my $line = <STDIN>) {
  print $escapedFh $line;
  my @tokens = split(' ', $line);
  my $new_token = "";
  my $i = 0;   # current token index in input sentence
  my $j = -1;  # current token index in output sentence
  my $s = "";  # output sentence
  my $t = "";  # split point line
  while ($i <= $#tokens) {
    if (defined($SPLIT_HYPHEN) && $i <= $#tokens-1 &&
        $tokens[$i] eq "\@\-\@") {
      # Glue "-" plus the following token onto the current output token and
      # record "output token index, character offset, separator".
      my $pos = length $new_token;
      $new_token .= "-$tokens[$i+1]";
      $t .= "$j,$pos,- ";
      $i += 2;
    } elsif (defined($SPLIT_SLASH) && $i <= $#tokens-1 &&
             $tokens[$i] eq "\@\/\@") {
      # Same as above, for "/".
      my $pos = length $new_token;
      $new_token .= "/$tokens[$i+1]";
      $t .= "$j,$pos,/ ";
      $i += 2;
    } else {
      # Ordinary token: flush the finished output token and start a new one.
      # Both indices must advance here, otherwise the loop never terminates
      # and $j never leaves its initial value of -1.
      $s .= "$new_token ";
      $new_token = $tokens[$i];
      $i++;
      $j++;
    }
  }
  # Flush the last output token, then emit one line per input sentence.
  $s .= "$new_token";
  $s =~ s/^\s+//;
  $t =~ s/^\s+//;
  print $deescapedFh "$s\n";
  print $splitPointsFh "$t\n";
}

# Close everything before the parser runs so that buffered output reaches the
# temporary files; closing the pipe also waits for the de-escaping script to
# finish writing $tmpDeescaped.
close($escapedFh)
  or die("ERROR: could not finish writing '$tmpEscaped': $!\n");
close($deescapedFh)
  or die("ERROR: command failed: '$deescapeCommand' (status $?)\n");
close($splitPointsFh)
  or die("ERROR: could not finish writing '$tmpSplitPoints': $!\n");
# Construct the parsing / post-processing pipeline.
#
# $pipeline is built up as a single shell command line.  Every stage appends
# its own trailing '|', so the finished string always ends in a pipe symbol.

# Stage 1: Parse (unless the user has provided Egret input via -raw-in option).
my $pipeline = "";
if (defined($RAW_IN)) {
  $pipeline .= "cat \"$RAW_IN\" |";
} else {
  $pipeline .= "$EGRET_DIR/egret";
  $pipeline .= " -lapcfg";
  $pipeline .= " -data=$EGRET_DIR/eng_grammar";
  $pipeline .= " -printForest" if $FOREST;
  $pipeline .= " -i=$tmpDeescaped";
  $pipeline .= " $EGRET_OPTIONS" if defined($EGRET_OPTIONS);
  $pipeline .= " |";
}

# Optionally keep a copy of the raw stage 1 output (-raw-out option).
if (defined($RAW_OUT)) {
  $pipeline .= "tee \"$RAW_OUT\" |";
}

# Stage 2: Convert trees to forests (unless we already have forests)
unless ($FOREST) {
  $pipeline .= 'sed \'s/^(//\' |'; # Remove opening (
  $pipeline .= 'sed \'s/)$//\' |'; # Remove closing )
  $pipeline .= "$TREE_CONVERTER";
  $pipeline .= " -input_format penn";
  $pipeline .= " -output_format egret";
  $pipeline .= " |";
}

# Stage 3: Postprocess using Moses' postprocess-egret-forests
# This performs some minor transformations to the forest: Moses-style escaping
# of special characters; removal of Egret's "^g" suffixes from constituent
# labels; and marking of slash/hyphen split points (using @ characters).
$pipeline .= "$MOSES_DIR/bin/postprocess-egret-forests";
$pipeline .= " --Escape" if $FOREST;
$pipeline .= " --MarkSplitPoints $tmpSplitPoints";
$pipeline .= " |";

# Stage 4: Postprocess using Travatar's tree-converter.
# This normalizes the forest weights and performs hyphen / slash splitting (if
# requested). The option -tree-converter-options can be used to enable
# additional tree-converter transformations (such as binarization).
#my $output_format = $FOREST ? "egret" : "mosesxml";
my $output_format = $FOREST ? "egret" : "penn";
$pipeline .= "$TREE_CONVERTER";
$pipeline .= " -input_format egret";
$pipeline .= " -output_format $output_format";
# FIXME Single split option
$pipeline .= " -split \@\-\@" if defined($SPLIT_HYPHEN);
$pipeline .= " -split \@\/\@" if defined($SPLIT_SLASH);
# Pass the user's extra options through verbatim, mirroring how
# -egret-options is handled in stage 1.
$pipeline .= " $TREE_CONVERTER_OPTIONS" if defined($TREE_CONVERTER_OPTIONS);
$pipeline .= " |";

# Tree output only: rewrite the Penn-format trees in Berkeley style and
# convert them to Moses XML.
unless ($FOREST) {
  $pipeline .= 'sed \'s/^()$//\' |'; # Remove empty trees (failed parses)
  $pipeline .= 'sed \'s/^(/( (/\' |'; # Add Berkeley-style opening ( + blank
  $pipeline .= 'sed \'s/)$/))/\' |'; # Add Berkeley-style closing )
  $pipeline .= 'sed \'s/^$/(())/\' |'; # Restore empty trees (Berkeley-style)
  $pipeline .= "$RealBin/berkeleyparsed2mosesxml.perl |";
  $pipeline .= 'sed \'s/^<tree label="TOP"/<tree label="ROOT"/\' |';
}
# Run the parsing / post-processing pipeline.
#
# $pipeline ends in a '|' symbol.  It is stripped here so that the command can
# be started with three-argument open in explicit read-from-command mode; the
# command string itself is still interpreted by the shell.
my @tmpFiles = ($tmpSplitPoints, $tmpDeescaped, $tmpEscaped);
(my $command = $pipeline) =~ s/\s*\|\s*$//;

open(my $parseFh, '-|', $command)
  or do {
    unlink(@tmpFiles);
    die("ERROR: could not run parsing pipeline: $!\n");
  };

if ($FOREST) {
  # Forest output is passed through unchanged.
  while (my $outLine = <$parseFh>) {
    print $outLine;
  }
} else {
  # Tree output: read the saved input in step with the pipeline output so
  # that, with -unparseable, a line consisting of a bare newline (a failed
  # parse) can be replaced by the corresponding original input line.
  open(my $escapedIn, '<', $tmpEscaped)
    or do {
      unlink(@tmpFiles);
      die("ERROR: could not open '$tmpEscaped' for reading: $!\n");
    };
  while (my $outLine = <$parseFh>) {
    my $unparsedLine = <$escapedIn>;
    # $unparsedLine is undefined if the pipeline yields more lines than the
    # input had; fall back to the pipeline output in that case.
    if (defined($UNPARSEABLE) && $UNPARSEABLE == 1
        && length($outLine) == 1 && defined($unparsedLine)) {
      print $unparsedLine;
    } else {
      print $outLine;
    }
  }
  close($escapedIn);
}

# Closing the command handle waits for the pipeline; a false result means it
# exited with a non-zero status (reported in $?).
close($parseFh)
  or warn("WARNING: parsing pipeline exited abnormally (status $?)\n");

# Remove the temporary files without spawning a shell.
my $removed = unlink(@tmpFiles);
warn("WARNING: could not remove all temporary files: $!\n")
  if $removed != scalar(@tmpFiles);