sakharamg's picture
Uploading all files
158b61b
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
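# Usage:
#   extract-parallel.perl NUM_PARALLEL SPLIT_CMD SORT_CMD EXTRACT_CMD TARGET SOURCE ALIGN EXTRACT [EXTRACT_OPTIONS...]
# Example:
# ./extract-parallel.perl 8 ./coreutils-8.9/src/split "./coreutils-8.9/src/sort --batch-size=253" ./extract ./corpus.5.en ./corpus.5.ar ./align.ar-en.grow-diag-final-and ./extracted 7 --NoFileLimit orientation --GZOutput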
use warnings;
use strict;
use File::Basename;
sub RunFork($);
sub systemCheck($);
sub NumStr($);
sub DigitStr($);
sub CharStr($);
sub GetSplitVersion($);
# Lower-case alphabet; CharStr() indexes into @alph to build letter suffixes.
my $alph = "abcdefghijklmnopqrstuvwxyz";
my @alph = (split(//,$alph));

print "Started ".localtime() ."\n";

# The eight positional arguments are all required. Without this check the
# script would carry on and assemble shell commands from undefined values.
if (@ARGV < 8) {
    die "Usage: $0 num-parallel split-cmd sort-cmd extract-cmd target source align extract [extract-options]\n";
}

# Number of parallel jobs; anything below 1 is treated as a single job.
my $numParallel= $ARGV[0];
$numParallel = 1 if $numParallel < 1;

my $splitCmd= $ARGV[1];
my $sortCmd= $ARGV[2];
my $extractCmd= $ARGV[3];

my $target = $ARGV[4]; # 1st arg of extract argument
my $source = $ARGV[5]; # 2nd arg of extract argument
my $align = $ARGV[6]; # 3rd arg of extract argument
my $extract = $ARGV[7]; # 4th arg of extract argument

# Defaults for the optional switches; the option scan further down may
# overwrite them.
my $makeTTable = 1; # whether to build the ttable extract files
my $otherExtractArgs= "";   # options forwarded verbatim to the extract command
my $weights = "";           # value of --InstanceWeights, if given
my $baselineExtract;        # value of --BaselineExtract, if given
my $glueFile;               # value of --GlueGrammar, if given
my $phraseOrientation = 0;  # set by --PhraseOrientation
my $phraseOrientationPriorsFile; # value of --PhraseOrientationPriors, if given

my $splitCmdOption = "";

# Use pigz for compression when `which` can find it, otherwise gzip.
my $GZIP_EXEC;
if(`which pigz 2> /dev/null`) {
    $GZIP_EXEC = 'pigz';
}
else {
    $GZIP_EXEC = 'gzip';
}
print STDERR "using $GZIP_EXEC \n";

# GetSplitVersion() returns 1 when "$splitCmd --help" fails, which this
# script takes to mean a BSD split. Only the other kind is given -d;
# NumStr() picks digit or letter suffixes from the same flag so that the
# chunk names it generates match what split writes.
my $isBSDSplit = GetSplitVersion($splitCmd);
print STDERR "isBSDSplit=$isBSDSplit \n";

if ($isBSDSplit == 0) {
    $splitCmdOption .= "-d";
}
# Scan everything after the eight positional arguments.
# Options listed in %valueOptionTarget are consumed here together with the
# argument that follows them; every other argument is appended to
# $otherExtractArgs (a few of them also flip a local flag on the way).
my $gzOut = 0;
{
    my %valueOptionTarget = (
        '--BaselineExtract'         => \$baselineExtract,
        '--InstanceWeights'         => \$weights,
        '--GlueGrammar'             => \$glueFile,
        '--PhraseOrientationPriors' => \$phraseOrientationPriorsFile,
    );

    my @pendingArgs = @ARGV[8 .. $#ARGV];
    while (@pendingArgs) {
        my $arg = shift @pendingArgs;

        # Option with a value: store the next argument, forward nothing.
        if (exists $valueOptionTarget{$arg}) {
            ${ $valueOptionTarget{$arg} } = shift @pendingArgs;
            next;
        }

        if ($arg eq "--NoTTable") {
            $makeTTable = 0;
        }
        elsif ($arg eq "--PhraseOrientation") {
            $phraseOrientation = 1;
        }
        elsif ($arg eq '--GZOutput') {
            $gzOut = 1;
        }

        $otherExtractArgs .= $arg ." ";
    }
}

# The merge step only reads gzipped extract output, so --GZOutput is mandatory.
unless ($gzOut) {
    die("Need to specify --GZOutput for parallel extract");
}
# File::Spec is only needed here, to turn the symlink targets below into
# absolute paths.
use File::Spec;

my $cmd;

# Per-run scratch directory beside the requested output; the process id ($$)
# is part of the name.
my $TMPDIR=dirname($extract) ."/tmp.$$";
$cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR";
print STDERR "Executing: $cmd \n";
`$cmd`;

# The chunk size is derived from the line count of the alignment file; the
# +1 makes sure $numParallel chunks are enough to hold every line.
my $totalLines = int(`cat $align | wc -l`);
my $linesPerSplit = int($totalLines / $numParallel) + 1;
print "total=$totalLines line-per-split=$linesPerSplit \n";

my @children;
my $pid;

# Files handed to the extract jobs, as [path, chunk base name] pairs.
my @splitInputs = ([$target, "target"], [$source, "source"], [$align, "align"]);
push(@splitInputs, [$weights, "weights"]) if $weights;

if ($numParallel > 1)
{
    # Split every input in its own child process.
    foreach my $input (@splitInputs) {
        my ($path, $name) = @$input;
        $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $path $TMPDIR/$name.";
        $pid = RunFork($cmd);
        push(@children, $pid);
    }

    # Wait until every split has finished. A failed split leaves the chunks
    # incomplete, so stop here instead of extracting from partial input.
    my $failedSplits = 0;
    foreach my $child (@children) {
        waitpid($child, 0);
        ++$failedSplits if $? != 0;
    }
    die "$failedSplits split job(s) failed; chunks in $TMPDIR are incomplete\n"
        if $failedSplits;
}
else
{
    # Single job: no splitting, just link the inputs under the name of chunk 0.
    # The link target is made absolute because a relative symlink target is
    # resolved against the directory holding the link ($TMPDIR), not against
    # the current working directory.
    my $numStr = NumStr(0);
    foreach my $input (@splitInputs) {
        my ($path, $name) = @$input;
        my $linkName = "$TMPDIR/$name.$numStr";
        symlink(File::Spec->rel2abs($path), $linkName)
            or die "cannot symlink $linkName to $path: $!\n";
    }
}
# run extract: one child process per chunk
@children = ();
my %chunkOfChild;   # pid => chunk index, used for diagnostics only
for (my $i = 0; $i < $numParallel; ++$i)
{
    my $pid = fork();

    # fork() returns undef on failure. undef == 0 is true, so without this
    # check the parent itself would run the child branch and then exit.
    die "cannot fork extract job $i: $!\n" unless defined $pid;

    if ($pid == 0)
    { # child
        my $numStr = NumStr($i);

        my $weightsCmd = "";
        if ($weights) {
            $weightsCmd = "--InstanceWeights $TMPDIR/weights.$numStr";
        }

        # Each job writes its own glue file; they are merged later.
        my $glueArg = "";
        if (defined($glueFile)) {
            $glueArg = "--GlueGrammar $TMPDIR/glue.$numStr";
        }
        #print STDERR "glueArg=$glueArg \n";

        # --SentenceOffset tells the extract command how many lines precede
        # this chunk.
        my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n";
        `$cmd`;

        # Hand the outcome of the command back to the parent.
        exit($? == 0 ? 0 : 1);
    }

    # parent
    push(@children, $pid);
    $chunkOfChild{$pid} = $i;
}

# Wait until every extract job has finished.
# A failing job is reported but does not abort the run: because of the
# rounding in $linesPerSplit there can be fewer chunks than jobs, and the
# jobs for the missing trailing chunks have no input to work on.
foreach my $child (@children) {
    waitpid($child, 0);
    if ($? != 0) {
        print STDERR "WARNING: extract job for chunk $chunkOfChild{$child} failed; its output may be missing from the merge\n";
    }
}
# merge
# For every kind of extract output, build one shell pipeline of the form
#   gunzip -c <chunk files> | sort | [uniq |] gzip > <final file>
# and run the applicable pipelines concurrently.

# Collect the per-chunk input file names for each kind.
my (@plainInputs, @invInputs, @orientationInputs, @contextInputs, @contextInvInputs);
for (my $chunk = 0; $chunk < $numParallel; ++$chunk)
{
    my $chunkPrefix = "$TMPDIR/extract." . NumStr($chunk);
    push(@plainInputs,       "$chunkPrefix.gz ");
    push(@invInputs,         "$chunkPrefix.inv.gz ");
    push(@orientationInputs, "$chunkPrefix.o.gz ");
    push(@contextInputs,     "$chunkPrefix.context ");
    push(@contextInvInputs,  "$chunkPrefix.context.inv ");
}

my $decompress = "gunzip -c ";
my $catCmd           = $decompress . join("", @plainInputs);
my $catInvCmd        = $decompress . join("", @invInputs);
my $catOCmd          = $decompress . join("", @orientationInputs);
my $catContextCmd    = $decompress . join("", @contextInputs);
my $catContextInvCmd = $decompress . join("", @contextInvInputs);

# A baseline extract, when given, is merged in as well; its ".sorted"
# variant is used if that file exists.
if (defined($baselineExtract)) {
    my $sortedTag = "";
    $sortedTag = ".sorted" if -e "$baselineExtract.sorted.gz";
    $catCmd    .= $baselineExtract . $sortedTag . ".gz ";
    $catInvCmd .= $baselineExtract . ".inv" . $sortedTag . ".gz ";
    $catOCmd   .= $baselineExtract . ".o" . $sortedTag . ".gz ";
}

# Shared pipeline stages.
my $sortStage = " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr";
my $compressTo = sub {
    my ($outputKind) = @_;
    return " | $GZIP_EXEC -c > $extract$outputKind.sorted.gz 2>> /dev/stderr \n";
};

$catCmd           .= $sortStage . $compressTo->("");
$catInvCmd        .= $sortStage . $compressTo->(".inv");
$catOCmd          .= $sortStage . $compressTo->(".o");
$catContextCmd    .= $sortStage . " | uniq" . $compressTo->(".context");
$catContextInvCmd .= $sortStage . " | uniq" . $compressTo->(".context.inv");

@children = ();
if (!$makeTTable) {
    print STDERR "skipping extract, doing only extract.o\n";
}
else {
    print STDERR "merging extract / extract.inv\n";
    foreach my $mergeCmd ($catCmd, $catInvCmd) {
        $pid = RunFork($mergeCmd);
        push(@children, $pid);
    }
}

# Context files are merged only when --FlexibilityScore was forwarded.
if ($otherExtractArgs =~ /--FlexibilityScore/) {
    foreach my $mergeCmd ($catContextCmd, $catContextInvCmd) {
        $pid = RunFork($mergeCmd);
        push(@children, $pid);
    }
}

# The .o output is merged only if the first chunk produced one.
my $numStr = NumStr(0);
if (-e "$TMPDIR/extract.$numStr.o.gz")
{
    $pid = RunFork($catOCmd);
    push(@children, $pid);
}

# wait for all sorting to finish
waitpid($_, 0) foreach @children;
# merge glue rules
# Concatenate the glue files written by the individual jobs, sort them,
# drop duplicate lines and write the result to the requested glue file.
if (defined $glueFile) {
    my $cmd = join(" | ", "cat $TMPDIR/glue.*", "LC_ALL=C sort", "uniq > $glueFile");
    print STDERR "Merging glue rules: $cmd \n";
    # Whatever the pipeline prints on stdout is passed on to STDERR.
    my @pipelineOutput = `$cmd`;
    print STDERR @pipelineOutput;
}
# merge phrase orientation priors (GHKM extraction)
# Every "*.phraseOrientationPriors" file in the scratch directory holds
# "key value" lines. The values are summed per key over all files and the
# totals are written, sorted by key, to $phraseOrientationPriorsFile.
if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
    print STDERR "Merging phrase orientation priors\n";

    my @orientationPriorsCountFiles = glob("$TMPDIR/*.phraseOrientationPriors");
    my %priorCounts;

    foreach my $filenamePhraseOrientationPriors (@orientationPriorsCountFiles) {
        next unless -f $filenamePhraseOrientationPriors;

        open my $infilePhraseOrientationPriors, '<', $filenamePhraseOrientationPriors or die "cannot open $filenamePhraseOrientationPriors: $!";
        while (my $line = <$infilePhraseOrientationPriors>) {
            # Each input line is echoed to STDOUT unchanged.
            print $line;

            # Strip the line ending before splitting so the value is a clean
            # number, and skip lines that lack a key or a value: they would
            # otherwise create junk entries in the merged file.
            chomp(my $record = $line);
            my ($key, $value) = split / /, $record;
            next unless defined($key) && length($key) && defined($value);

            $priorCounts{$key} += $value;
        }
        close $infilePhraseOrientationPriors;
    }

    open my $outPhraseOrientationPriors, '>', $phraseOrientationPriorsFile or die "cannot open $phraseOrientationPriorsFile: $!";
    foreach my $key (sort keys %priorCounts) {
        print $outPhraseOrientationPriors $key." ".$priorCounts{$key}."\n";
    }
    # Buffered write errors only show up at close time, so check it.
    close($outPhraseOrientationPriors) or die "cannot close $phraseOrientationPriorsFile: $!";
}
# delete temporary files
# The scratch directory is removed through systemCheck(), so a failing rm
# ends the script with exit status 1 before "Finished" is printed.
$cmd = join(" ", "rm", "-rf", $TMPDIR, "\n");
systemCheck($cmd);

my $finishedAt = localtime();
print STDERR "Finished $finishedAt\n";
# -----------------------------------------
# -----------------------------------------
# Run a shell command in a forked child process.
#
# Parameter: $cmd - the command line; the child echoes it to STDERR and
#                   runs it through systemCheck().
# Returns:   the pid of the child (in the parent only). The child never
#            returns: it exits once the command is done, with status 1 if
#            systemCheck() saw a failure.
# Dies when the process cannot be forked.
sub RunFork($)
{
    my $cmd = shift;

    my $pid = fork();

    # fork() returns undef on failure. undef == 0 is true, so without this
    # check the parent would take the child branch, run the command itself
    # and exit.
    die "cannot fork to run: $cmd: $!\n" unless defined $pid;

    if ($pid == 0)
    { # child
        print STDERR $cmd;
        systemCheck($cmd);
        exit();
    }
    return $pid;
}
# Run a shell command and terminate the current process if it fails.
#
# Parameter: $cmd - command line handed to system().
# Returns:   nothing; it only returns when the command succeeded.
# On failure a one-line reason is written to STDERR and the process exits
# with status 1.
sub systemCheck($)
{
    my $cmd = shift;
    my $retVal = system($cmd);
    if ($retVal != 0)
    {
        # Commands built in this script may end in " \n"; trim that for the message.
        (my $shownCmd = $cmd) =~ s/\s+\z//;

        # system() returns -1 when the command could not be started; otherwise
        # the low 7 bits hold a signal number and the high byte the exit status.
        my $reason;
        if ($retVal == -1) {
            $reason = "could not be started: $!";
        }
        elsif ($retVal & 127) {
            $reason = "died with signal " . ($retVal & 127);
        }
        else {
            $reason = "exited with status " . ($retVal >> 8);
        }
        print STDERR "ERROR: command $reason: $shownCmd\n";
        exit(1);
    }
    return;
}
# Format a chunk index as a decimal suffix, zero-padded to at least seven
# digits (the split commands in this script are run with "-a 7").
#
# Parameter: $i - non-negative integer chunk index.
# Returns:   the padded string, e.g. 42 -> "0000042"; numbers that already
#            have seven or more digits are returned unpadded.
sub DigitStr($)
{
    my $i = shift;
    return sprintf("%07d", $i);
}
# Format a chunk index as a letter suffix: the index is written in base 26
# with the letters of @alph as digits and left-padded with "a" to at least
# seven characters, e.g. 0 -> "aaaaaaa", 27 -> "aaaaabb".
#
# Parameter: $i - non-negative integer chunk index.
# Returns:   the suffix string.
sub CharStr($)
{
    my $i = shift;

    # Base-26 digits of $i, least significant first.
    my @digits;
    for (my $rest = $i; $rest > 0; $rest = int($rest / 26)) {
        push @digits, $rest % 26;
    }

    my $charStr;
    # Pad up to seven positions, then emit the digits most significant first.
    $charStr .= "a" for (scalar(@digits) .. 6);
    $charStr .= $alph[$_] for reverse @digits;
    return $charStr;
}
# Build the file-name suffix of chunk $i in the style selected by
# $isBSDSplit: letters (CharStr) when it is set, digits (DigitStr) otherwise.
#
# Parameter: $i - chunk index.
# Returns:   the suffix string.
sub NumStr($)
{
    my $i = shift;
    return $isBSDSplit ? CharStr($i) : DigitStr($i);
}
# Probe the split command by running it with --help (stdout discarded).
#
# Parameter: $splitCmd - the split command to probe.
# Returns:   0 when the probe exits successfully, 1 otherwise. The caller
#            stores the result as its "is BSD split" flag.
sub GetSplitVersion($)
{
    my $splitCmd = shift;
    my $helpStatus = system("$splitCmd --help > /dev/null");
    return $helpStatus == 0 ? 0 : 1;
}