| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
use warnings;
use strict;
use File::Basename;

sub NumStr($);

# Splits a parallel corpus into chunks, runs steps 1-3 of the training
# script on every chunk in its own child process, and concatenates the
# per-chunk alignment files into a single output file.
#
# Positional arguments:
#   0  numParallel  requested number of chunks / child processes
#   1  splitCmd     split program (called with -d -l LINES -a WIDTH)
#   2  trainCmd     training script; its parent directory's parent is passed
#                   on as -scripts-root-dir
#   3  inputExt     extension of the source side of the corpus
#   4  outputExt    extension of the target side of the corpus
#   5  corpus       corpus path without extension
#   6  align        file receiving the concatenated alignments; the working
#                   directory tmp.<pid> is created next to it

print "Started ".localtime() ."\n";

die "Usage: $0 numParallel splitCmd trainCmd inputExt outputExt corpus align\n"
    unless @ARGV >= 7;

my ($numParallel, $splitCmd, $trainCmd, $inputExt, $outputExt, $corpus, $align) = @ARGV;

# Guards the division below against zero / non-numeric input.
die "numParallel must be a positive integer, got '$numParallel'\n"
    unless $numParallel =~ /\A\d+\z/ && $numParallel > 0;

# Runs a shell command the same way the script always did (backticks, output
# discarded) but aborts when the command reports failure.
my $runOrDie = sub
{
    my ($command) = @_;
    `$command`;
    die "Command failed (status $?): $command" if $? != 0;
};

my $TMPDIR=dirname($align) ."/tmp.$$";
-d $TMPDIR or mkdir($TMPDIR) or die "Cannot create $TMPDIR: $!\n";

my $scriptDir=dirname($trainCmd) ."/..";

# wc prints "<count> <filename>"; take only the leading number instead of
# feeding the whole string to int(), which warns about a non-numeric value.
my $wcOutput = `wc -l $corpus.$inputExt`;
die "Cannot count the lines of $corpus.$inputExt\n"
    unless defined $wcOutput && $wcOutput =~ /^\s*(\d+)/;
my $totalLines = $1;
my $linesPerSplit = int($totalLines / $numParallel) + 1;

# The numeric suffix split appends must be exactly as wide as the strings
# NumStr() produces, otherwise none of the chunk files below are found.
my $suffixLen = length(NumStr(0));

my $cmd = "$splitCmd -d -l $linesPerSplit -a $suffixLen $corpus.$inputExt $TMPDIR/source.";
$runOrDie->($cmd);

$cmd = "$splitCmd -d -l $linesPerSplit -a $suffixLen $corpus.$outputExt $TMPDIR/target.";
$runOrDie->($cmd);

# Because of the "+ 1" above, a small corpus can yield fewer chunks than
# requested; only work on the chunks that really exist.
my $numChunks = 0;
while ($numChunks < $numParallel && -e "$TMPDIR/source." . NumStr($numChunks))
{
  ++$numChunks;
}
die "$splitCmd produced no chunks from $corpus.$inputExt\n" if $numChunks == 0;

# Rename <side>.<n> to <n>.<side> so that "$TMPDIR/<n>" can be handed to the
# training script as a corpus stem with extensions "source" and "target".
for (my $i = 0; $i < $numChunks; ++$i)
{
  my $numStr = NumStr($i);
  rename("$TMPDIR/source.$numStr", "$TMPDIR/$numStr.source")
      or die "Cannot rename $TMPDIR/source.$numStr: $!\n";
  rename("$TMPDIR/target.$numStr", "$TMPDIR/$numStr.target")
      or die "Cannot rename $TMPDIR/target.$numStr: $!\n";
}

# One child process per chunk.
my @childs;
for (my $i = 0; $i < $numChunks; ++$i)
{
  my $pid = fork();

  # fork returns undef on failure; without this check undef == 0 would make
  # the parent behave like a child and exit.
  die "Cannot fork worker $i: $!\n" unless defined $pid;

  if ($pid == 0)
  { # child
    my $numStr = NumStr($i);
    my $common = "-scripts-root-dir $scriptDir -f source -e target -alignment grow-diag-final-and -parts 3 -reordering msd-bidirectional-fe";

    my @steps = (
      # step 1: prepare the chunk
      "$trainCmd -dont-zip -last-step 1 $common -corpus $TMPDIR/$numStr -corpus-dir $TMPDIR/prepared.$numStr \n",
      # step 2, once per direction
      "$trainCmd -dont-zip -first-step 2 -last-step 2 $common -corpus-dir $TMPDIR/prepared.$numStr -giza-e2f $TMPDIR/giza.$numStr -direction 2 \n",
      "$trainCmd -dont-zip -first-step 2 -last-step 2 $common -corpus-dir $TMPDIR/prepared.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -direction 1 \n",
      # step 3: writes $TMPDIR/aligned.<n>.grow-diag-final-and
      "$trainCmd -dont-zip -first-step 3 -last-step 3 $common -giza-e2f $TMPDIR/giza.$numStr -giza-f2e $TMPDIR/giza-inverse.$numStr -alignment-file $TMPDIR/aligned.$numStr -alignment grow-diag-final-and \n",
    );

    foreach my $step (@steps)
    {
      print $step;
      $runOrDie->($step);
    }

    exit();
  }

  push(@childs, $pid);
}

# Only the parent gets here (every child exits above). Wait for all workers
# before judging the outcome so that none is left running unattended.
my $numFailed = 0;
foreach my $child (@childs)
{
  waitpid($child, 0);
  ++$numFailed if $? != 0;
}
die "$numFailed of $numChunks worker processes failed\n" if $numFailed > 0;

# Concatenate the per-chunk alignments in chunk order.
$cmd = "cat ";
for (my $i = 0; $i < $numChunks; ++$i)
{
  my $numStr = NumStr($i);
  $cmd .= "$TMPDIR/aligned.$numStr.grow-diag-final-and ";
}
$cmd .= " > $align \n";
print $cmd;
$runOrDie->($cmd);
| |
|
# Formats a chunk index as a decimal string, zero-padded on the left to at
# least seven digits (3 -> "0000003"). Values that already need seven or
# more digits are returned without padding.
#
# The ($) prototype is kept because the file forward-declares the sub with it.
#
# NOTE: callers use the result as the numeric file-name suffix of chunk
# files, so the suffix width requested from the split program has to be
# this same width (7).
sub NumStr($)
{
    my $i = shift;
    return sprintf("%07d", $i);
}
| |
|
| |
|