|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
use warnings; |
|
use strict; |
|
use File::Basename; |
|
|
|
sub RunFork($); |
|
sub systemCheck($); |
|
sub NumStr($); |
|
sub DigitStr($); |
|
sub CharStr($); |
|
sub GetSplitVersion($); |
|
|
|
my $alph = "abcdefghijklmnopqrstuvwxyz"; |
|
my @alph = (split(//,$alph)); |
|
|
|
print "Started ".localtime() ."\n"; |
|
|
|
my $numParallel= $ARGV[0]; |
|
$numParallel = 1 if $numParallel < 1; |
|
|
|
my $splitCmd= $ARGV[1]; |
|
my $sortCmd= $ARGV[2]; |
|
my $extractCmd= $ARGV[3]; |
|
|
|
my $target = $ARGV[4]; |
|
my $source = $ARGV[5]; |
|
my $align = $ARGV[6]; |
|
my $extract = $ARGV[7]; |
|
|
|
my $makeTTable = 1; |
|
my $otherExtractArgs= ""; |
|
my $weights = ""; |
|
my $baselineExtract; |
|
my $glueFile; |
|
my $phraseOrientation = 0; |
|
my $phraseOrientationPriorsFile; |
|
my $splitCmdOption = ""; |
|
|
|
my $GZIP_EXEC; |
|
if(`which pigz 2> /dev/null`) { |
|
$GZIP_EXEC = 'pigz'; |
|
} |
|
else { |
|
$GZIP_EXEC = 'gzip'; |
|
} |
|
print STDERR "using $GZIP_EXEC \n"; |
|
|
|
my $isBSDSplit = GetSplitVersion($splitCmd); |
|
print STDERR "isBSDSplit=$isBSDSplit \n"; |
|
|
|
if ($isBSDSplit == 0) { |
|
$splitCmdOption .= "-d"; |
|
} |
|
|
|
my $gzOut = 0; |
|
|
|
for (my $i = 8; $i < $#ARGV + 1; ++$i) |
|
{ |
|
$makeTTable = 0 if $ARGV[$i] eq "--NoTTable"; |
|
if ($ARGV[$i] eq '--BaselineExtract') { |
|
$baselineExtract = $ARGV[++$i]; |
|
next; |
|
} |
|
if ($ARGV[$i] eq '--InstanceWeights') { |
|
$weights = $ARGV[++$i]; |
|
next; |
|
} |
|
if ($ARGV[$i] eq '--GlueGrammar') { |
|
$glueFile = $ARGV[++$i]; |
|
next; |
|
} |
|
$phraseOrientation = 1 if $ARGV[$i] eq "--PhraseOrientation"; |
|
if ($ARGV[$i] eq '--PhraseOrientationPriors') { |
|
$phraseOrientationPriorsFile = $ARGV[++$i]; |
|
next; |
|
} |
|
if ($ARGV[$i] eq '--GZOutput') { |
|
$gzOut = 1; |
|
} |
|
|
|
$otherExtractArgs .= $ARGV[$i] ." "; |
|
} |
|
|
|
die("Need to specify --GZOutput for parallel extract") if ($gzOut == 0); |
|
|
|
my $cmd; |
|
my $TMPDIR=dirname($extract) ."/tmp.$$"; |
|
$cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR"; |
|
print STDERR "Executing: $cmd \n"; |
|
`$cmd`; |
|
|
|
my $totalLines = int(`cat $align | wc -l`); |
|
my $linesPerSplit = int($totalLines / $numParallel) + 1; |
|
|
|
print "total=$totalLines line-per-split=$linesPerSplit \n"; |
|
|
|
my @children; |
|
my $pid; |
|
|
|
if ($numParallel > 1) |
|
{ |
|
$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $target $TMPDIR/target."; |
|
$pid = RunFork($cmd); |
|
push(@children, $pid); |
|
|
|
$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $source $TMPDIR/source."; |
|
$pid = RunFork($cmd); |
|
push(@children, $pid); |
|
|
|
$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $align $TMPDIR/align."; |
|
$pid = RunFork($cmd); |
|
push(@children, $pid); |
|
|
|
if ($weights) { |
|
$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $weights $TMPDIR/weights."; |
|
$pid = RunFork($cmd); |
|
push(@children, $pid); |
|
} |
|
|
|
|
|
foreach (@children) { |
|
waitpid($_, 0); |
|
} |
|
|
|
} |
|
else |
|
{ |
|
my $numStr = NumStr(0); |
|
|
|
$cmd = "ln -s $target $TMPDIR/target.$numStr"; |
|
`$cmd`; |
|
|
|
$cmd = "ln -s $source $TMPDIR/source.$numStr"; |
|
`$cmd`; |
|
|
|
$cmd = "ln -s $align $TMPDIR/align.$numStr"; |
|
`$cmd`; |
|
|
|
if ($weights) { |
|
$cmd = "ln -s $weights $TMPDIR/weights.$numStr"; |
|
`$cmd`; |
|
} |
|
} |
|
|
|
|
|
@children = (); |
|
for (my $i = 0; $i < $numParallel; ++$i) |
|
{ |
|
my $pid = fork(); |
|
|
|
if ($pid == 0) |
|
{ |
|
my $numStr = NumStr($i); |
|
my $weightsCmd = ""; |
|
if ($weights) { |
|
$weightsCmd = "--InstanceWeights $TMPDIR/weights.$numStr"; |
|
} |
|
|
|
my $glueArg = ""; |
|
if (defined($glueFile)) { |
|
$glueArg = "--GlueGrammar $TMPDIR/glue.$numStr"; |
|
} |
|
|
|
|
|
my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n"; |
|
`$cmd`; |
|
|
|
exit(); |
|
} |
|
else |
|
{ |
|
push(@children, $pid); |
|
} |
|
} |
|
|
|
|
|
foreach (@children) { |
|
waitpid($_, 0); |
|
} |
|
|
|
|
|
my $catCmd = "gunzip -c "; |
|
my $catInvCmd = $catCmd; |
|
my $catOCmd = $catCmd; |
|
my $catContextCmd = $catCmd; |
|
my $catContextInvCmd = $catCmd; |
|
|
|
for (my $i = 0; $i < $numParallel; ++$i) |
|
{ |
|
my $numStr = NumStr($i); |
|
$catCmd .= "$TMPDIR/extract.$numStr.gz "; |
|
$catInvCmd .= "$TMPDIR/extract.$numStr.inv.gz "; |
|
$catOCmd .= "$TMPDIR/extract.$numStr.o.gz "; |
|
$catContextCmd .= "$TMPDIR/extract.$numStr.context "; |
|
$catContextInvCmd .= "$TMPDIR/extract.$numStr.context.inv "; |
|
} |
|
if (defined($baselineExtract)) { |
|
my $sorted = -e "$baselineExtract.sorted.gz" ? ".sorted" : ""; |
|
$catCmd .= "$baselineExtract$sorted.gz "; |
|
$catInvCmd .= "$baselineExtract.inv$sorted.gz "; |
|
$catOCmd .= "$baselineExtract.o$sorted.gz "; |
|
} |
|
|
|
$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.sorted.gz 2>> /dev/stderr \n"; |
|
$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.inv.sorted.gz 2>> /dev/stderr \n"; |
|
$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | $GZIP_EXEC -c > $extract.o.sorted.gz 2>> /dev/stderr \n"; |
|
$catContextCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.sorted.gz 2>> /dev/stderr \n"; |
|
$catContextInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | uniq | $GZIP_EXEC -c > $extract.context.inv.sorted.gz 2>> /dev/stderr \n"; |
|
|
|
|
|
@children = (); |
|
if ($makeTTable) |
|
{ |
|
print STDERR "merging extract / extract.inv\n"; |
|
$pid = RunFork($catCmd); |
|
push(@children, $pid); |
|
|
|
$pid = RunFork($catInvCmd); |
|
push(@children, $pid); |
|
} |
|
else { |
|
print STDERR "skipping extract, doing only extract.o\n"; |
|
} |
|
|
|
if ($otherExtractArgs =~ /--FlexibilityScore/) { |
|
$pid = RunFork($catContextCmd); |
|
push(@children, $pid); |
|
|
|
$pid = RunFork($catContextInvCmd); |
|
push(@children, $pid); |
|
} |
|
|
|
my $numStr = NumStr(0); |
|
if (-e "$TMPDIR/extract.$numStr.o.gz") |
|
{ |
|
$pid = RunFork($catOCmd); |
|
push(@children, $pid); |
|
} |
|
|
|
|
|
foreach (@children) { |
|
waitpid($_, 0); |
|
} |
|
|
|
|
|
if (defined($glueFile)) { |
|
my $cmd = "cat $TMPDIR/glue.* | LC_ALL=C sort | uniq > $glueFile"; |
|
print STDERR "Merging glue rules: $cmd \n"; |
|
print STDERR `$cmd`; |
|
} |
|
|
|
|
|
if ($phraseOrientation && defined($phraseOrientationPriorsFile)) { |
|
print STDERR "Merging phrase orientation priors\n"; |
|
|
|
my @orientationPriorsCountFiles = glob("$TMPDIR/*.phraseOrientationPriors"); |
|
my %priorCounts; |
|
|
|
foreach my $filenamePhraseOrientationPriors (@orientationPriorsCountFiles) { |
|
if (-f $filenamePhraseOrientationPriors) { |
|
open my $infilePhraseOrientationPriors, '<', $filenamePhraseOrientationPriors or die "cannot open $filenamePhraseOrientationPriors: $!"; |
|
while (my $line = <$infilePhraseOrientationPriors>) { |
|
print $line; |
|
my ($key, $value) = split / /, $line; |
|
$priorCounts{$key} += $value; |
|
} |
|
close $infilePhraseOrientationPriors; |
|
} |
|
} |
|
|
|
open my $outPhraseOrientationPriors, '>', $phraseOrientationPriorsFile or die "cannot open $phraseOrientationPriorsFile: $!"; |
|
foreach my $key (sort keys %priorCounts) { |
|
print $outPhraseOrientationPriors $key." ".$priorCounts{$key}."\n"; |
|
} |
|
close($outPhraseOrientationPriors); |
|
} |
|
|
|
|
|
$cmd = "rm -rf $TMPDIR \n"; |
|
systemCheck($cmd); |
|
|
|
print STDERR "Finished ".localtime() ."\n"; |
|
|
|
|
|
|
|
|
|
sub RunFork($) |
|
{ |
|
my $cmd = shift; |
|
|
|
my $pid = fork(); |
|
|
|
if ($pid == 0) |
|
{ |
|
print STDERR $cmd; |
|
systemCheck($cmd); |
|
exit(); |
|
} |
|
return $pid; |
|
} |
|
|
|
sub systemCheck($) |
|
{ |
|
my $cmd = shift; |
|
my $retVal = system($cmd); |
|
if ($retVal != 0) |
|
{ |
|
exit(1); |
|
} |
|
} |
|
|
|
sub DigitStr($) |
|
{ |
|
my $i = shift; |
|
my $numStr; |
|
if ($i < 10) { |
|
$numStr = "000000$i"; |
|
} |
|
elsif ($i < 100) { |
|
$numStr = "00000$i"; |
|
} |
|
elsif ($i < 1000) { |
|
$numStr = "0000$i"; |
|
} |
|
elsif ($i < 10000) { |
|
$numStr = "000$i"; |
|
} |
|
elsif ($i < 100000) { |
|
$numStr = "00$i"; |
|
} |
|
elsif ($i < 1000000) { |
|
$numStr = "0$i"; |
|
} |
|
else { |
|
$numStr = $i; |
|
} |
|
return $numStr; |
|
} |
|
|
|
sub CharStr($) |
|
{ |
|
my $i = shift; |
|
my $charStr; |
|
my @bit=(); |
|
|
|
while ($i>0){ |
|
push @bit, $i%26; |
|
$i=int($i/26); |
|
} |
|
my $offset=scalar(@bit); |
|
my $h; |
|
for ($h=6;$h>=$offset;--$h) { $charStr.="a"; } |
|
for ($h=$offset-1;$h>=0;--$h) { $charStr.="$alph[$bit[$h]]"; } |
|
return $charStr; |
|
} |
|
|
|
sub NumStr($) |
|
{ |
|
my $i = shift; |
|
if ($isBSDSplit){ |
|
return CharStr($i); |
|
}else{ |
|
return DigitStr($i); |
|
} |
|
} |
|
|
|
sub GetSplitVersion($) |
|
{ |
|
my $splitCmd = shift; |
|
my $retVal = system("$splitCmd --help > /dev/null"); |
|
if ($retVal != 0) { |
|
return 1; |
|
} |
|
else { |
|
return 0; |
|
} |
|
} |
|
|
|
|