#!/usr/bin/env perl use warnings; #sed replacement for -w perl parameter use Cwd; use File::Basename; # This program is like run.pl except rather than just running on a local # machine, it can be configured to run on remote machines via ssh. # It requires that you have set up passwordless access to those machines, # and that Kaldi is running from a location that is accessible via the # same path on those machines (presumably via an NFS mount). # # It looks for a file .queue/machines that should have, on each line, the name # of a machine that you can ssh to (which may include this machine). It doesn't # have to be a fully qualified name. # # Later we may extend this so that on each line of .queue/machines you # can specify various resources that each machine has, such as how # many slots and how much memory, and make it wait if machines are # busy. But for now it simply ssh's to a machine from those in the list. # The command-line interface of this program is the same as run.pl; # see run.pl for more information about the usage. @ARGV < 2 && die "usage: ssh.pl log-file command-line arguments..."; $jobstart = 1; $jobend = 1; $qsub_opts=""; # These will be ignored. # First parse an option like JOB=1:4, and any # options that would normally be given to # ssh.pl, which we will just discard. if (@ARGV > 0) { while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options # that would normally go to qsub, but which will be ignored here. $switch = shift @ARGV; if ($switch eq "-V") { $qsub_opts .= "-V "; } else { $option = shift @ARGV; if ($switch eq "-sync" && $option =~ m/^[yY]/) { $qsub_opts .= "-sync "; # Note: in the # corresponding code in queue.pl it says instead, just "$sync = 1;". } $qsub_opts .= "$switch $option "; if ($switch eq "-pe") { # e.g. -pe smp 5 $option2 = shift @ARGV; $qsub_opts .= "$option2 "; } } } if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10 $jobname = $1; $jobstart = $2; $jobend = $3; shift; if ($jobstart > $jobend) { die "run.pl: invalid job range $ARGV[0]"; } if ($jobstart <= 0) { die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility)"; } } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. $jobname = $1; $jobstart = $2; $jobend = $2; shift; } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { print STDERR "Warning: suspicious first argument to run.pl: $ARGV[0]\n"; } } if ($qsub_opts ne "") { print STDERR "Warning: ssh.pl ignoring options \"$qsub_opts\"\n"; } { # Read .queue/machines if (!open(Q, "<.queue/machines")) { print STDERR "ssh.pl: expected the file .queue/machines to exist.\n"; exit(1); } @machines = (); while () { chop; if ($_ ne "") { @A = split; if (@A != 1) { die "ssh.pl: bad line '$_' in .queue/machines."; } if ($A[0] !~ m/^[a-z0-9\.\-]+/) { die "ssh.pl: invalid machine name '$A[0]'"; } push @machines, $A[0]; } } if (@machines == 0) { die "ssh.pl: no machines listed in .queue/machines"; } } $logfile = shift @ARGV; if (defined $jobname && $logfile !~ m/$jobname/ && $jobend > $jobstart) { print STDERR "ssh.pl: you are trying to run a parallel job but " . "you are putting the output into just one log file ($logfile)\n"; exit(1); } { $offset = 0; # $offset will be an offset added to any index from the job-id # specified if the user does JOB=1:10. The main point of this is # that there are instances where a script will manually submit a # number of jobs to the queue, e.g. with log files foo.1.log, # foo.2.log and so on, and we don't want all of these to go # to the first machine. @A = split(".", basename($logfile)); # if $logfile looks like foo.9.log, add 9 to $offset. foreach $a (@A) { if ($a =~ m/^\d+$/) { $offset += $a; } } } $cmd = ""; foreach $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } elsif ($x =~ m:\":) { $cmd .= "'$x' "; } else { $cmd .= "\"$x\" "; } } for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { $childpid = fork(); if (!defined $childpid) { die "Error forking in ssh.pl (writing to $logfile)"; } if ($childpid == 0) { # We're in the child... this branch executes the job and returns (possibly # with an error status). if (defined $jobname) { $cmd =~ s/$jobname/$jobid/g; $logfile =~ s/$jobname/$jobid/g; } { # work out the machine to ssh to. $local_offset = $offset + $jobid - 1; # subtract 1 since jobs never start # from 0; we'd like the first job # to normally run on the first # machine. $num_machines = scalar @machines; # in the next line, the "+ $num_machines" is in case $local_offset is # negative, to ensure the modulus is calculated in the mathematical way, not # in the C way where (negative number % positive number) is negative. $machines_index = ($local_offset + $num_machines) % $num_machines; $machine = $machines[$machines_index]; } if (!open(S, "|ssh $machine bash")) { print STDERR "ssh.pl failed to ssh to $machine"; exit(1); # exits from the forked process within ssh.pl. } $cwd = getcwd(); $logdir = dirname($logfile); # Below, we're printing into ssh which has opened a bash session; these are # bash commands. print S "set -e\n"; # if any of the later commands fails, we want it to exit. print S "cd $cwd\n"; print S ". ./path.sh\n"; print S "mkdir -p $logdir\n"; print S "time1=\`date +\"%s\"\`\n"; print S "( echo '#' Running on \`hostname\`\n"; print S " echo '#' Started at \`date\`\n"; print S " echo -n '# '; cat <$logfile\n"; print S "set +e\n"; # we don't want bash to exit if the next line fails. # in the next line, || true means allow this one to fail and not have bash exit immediately. print S " ( $cmd ) 2>>$logfile >>$logfile\n"; print S "ret=\$?\n"; print S "set -e\n"; # back into mode where it will exit on error. print S "time2=\`date +\"%s\"\`\n"; print S "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=1 >>$logfile\n"; print S "echo '#' Finished at \`date\` with status \$ret >>$logfile\n"; print S "exit \$ret"; # return with the status the command exited with. $ret = close(S); $ssh_return_status = $?; # see http://perldoc.perl.org/functions/close.html for explanation of return # status of close() and the variables it sets. if (! $ret && $! != 0) { die "ssh.pl: unexpected problem ssh'ing to machine $machine"; } if ($ssh_return_status != 0) { exit(1); } # exit with error status from this forked process. else { exit(0); } # else exit with non-error status. } } $ret = 0; $numfail = 0; for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { $r = wait(); if ($r == -1) { die "Error waiting for child process"; } # should never happen. if ($? != 0) { $numfail++; $ret = 1; } # The child process failed. } if ($ret != 0) { $njobs = $jobend - $jobstart + 1; if ($njobs == 1) { if (defined $jobname) { $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with # that job. } print STDERR "ssh.pl: job failed, log is in $logfile\n"; if ($logfile =~ m/JOB/) { print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script."; } } else { $logfile =~ s/$jobname/*/g; print STDERR "ssh.pl: $numfail / $njobs failed, log is in $logfile\n"; } } exit ($ret);