blob: 72109278f0acedc88d3172e33b4b02604247bf9d [file] [log] [blame]
#!/usr/bin/perl -w
#*******************************************************************************
#* Copyright (c) 2011 IBM Corporation.
#* All rights reserved. This program and the accompanying materials
#* are made available under the terms of the Eclipse Public License v1.0
#* which accompanies this distribution, and is available at
#* http://www.eclipse.org/legal/epl-v10.html
#*
#* Contributors:
#* IBM Corporation - Initial Implementation
#*******************************************************************************/
use strict;
use File::Temp qw/tempfile/;
use Text::ParseWords;
use Cwd;
# Regular-expression fragments; each one contributes capture groups to the
# pattern it is interpolated into.
my $patint  = '([\+\-\d]+)';         # Pattern for Integer number
my $patnode = '([^\s]+(\.[^\s]*)*)'; # Pattern for domain name (a.b.c)
# Debugger ports are picked as $portbase + int(rand($portrange)).
my ($portbase, $portrange) = (50000, 10000);
my $verbose = 0;      # non-zero turns on the "found ..." progress messages
my $TOTAL_PROCS = 0;  # sum of "Num procs" over every node seen in the job map
my @JOB;              # host name of each process, indexed by rank
my ($line, $pid);
my ($ROUTING_FILE, $debuggerId, $debuggerPath);
my (@debuggerArgs, @child_pids);
#####################################################################
#
# Script to start the SDM and generate a routing table. Used when the
# ompi-ps command can't be used to obtain job information, such as
# interactive launch via job scheduler.
#
# The routing table is called 'routes_<PTP_JOBID>' and it is generated in
# the current working directory; its full path is passed to the sdm with
# the --routing_file option. The sdm's working directory must be
# the same location if they are to find the table. Also, any old
# routing tables should be removed before starting the sdm.
#
# Routing table format is:
#
# num_tasks
# task_num host_name port_num
# ...
#
# where:
# num_tasks is the total number of tasks in the MPI job
# task_num is the task number for a process (e.g. 0, 1, 2, etc.)
# host_name is the hostname of the node the process is running on
# port_num is a semi-random port number that the debugger will listen on
#
#####################################################################
#
# get_node_map($node)
#
# Consume the per-process lines that follow a node header in the job map.
# Lines are read from the already-open IN handle. Every "Process rank: N"
# line records $node as the host of rank N in @JOB. Reading stops at the
# first empty line, or at end of input.
#
sub get_node_map {
    my ($node) = @_;
    while (my $entry = <IN>) {
        # An empty line terminates the list of processes for this node.
        if ($entry =~ /^$/) {
            print "found end of node map\n" if ($verbose);
            return;
        }
        next unless ($entry =~ /.*Process rank: $patint/);
        my $rank = $1;
        $JOB[$rank] = $node;
        print "found proc $rank\n" if ($verbose);
    }
}
#
# get_job_map()
#
# Parse the body of the job map from the already-open IN handle. Each
# "Data for node: ... Num procs: N" header adds N to $TOTAL_PROCS and hands
# the node name to get_node_map(), which reads that node's process lines.
# Parsing stops at a line made up of a space followed by '=' characters,
# or at end of input.
#
sub get_job_map {
    while (my $row = <IN>) {
        # A ruler of '=' characters closes the table.
        if ($row =~ /^ =+$/) {
            print "found end of table\n" if ($verbose);
            return;
        }
        # Captures: (optional "Name: ", node name, inner name part, proc count)
        my @field = ($row =~ /^ Data for node: (Name: )?$patnode\s*Num procs: $patint/);
        next unless (@field);
        my ($host, $count) = @field[1, 3];
        print "found node $host, procs $count\n" if ($verbose);
        $TOTAL_PROCS += $count;
        get_node_map($host);
    }
}
#
# generate_routing_file($file)
#
# Write the routing table to $file, replacing any existing content. The
# first line holds $TOTAL_PROCS; it is followed by one line per rank with
# "rank host port", where host comes from @JOB and port is
# $portbase + int(rand($portrange)).
#
# Dies if the file cannot be opened or if closing it fails; the message
# includes the system error text.
#
sub generate_routing_file {
    my ($file) = @_;
    # Three-argument open with a lexical handle: the file name is taken
    # literally and the handle cannot clash with other bareword handles.
    open(my $out, '>', $file) || die "cannot open file $file: $!";
    printf {$out} "%d\n", $TOTAL_PROCS;
    for (my $count=0; $count < $TOTAL_PROCS; $count++) {
        printf {$out} "%d %s %d\n", $count, $JOB[$count], $portbase+int(rand($portrange));
    }
    # Buffered write errors only surface at close, so check it.
    close($out) || die "cannot close file $file: $!";
}
# At least two command-line words are required: the MPI launch command and
# one further argument.
if ($#ARGV < 1) {
    die " Usage: $0 mpi_cmd [mpi_args ...]\n";
}
# Treat an unset PTP_LAUNCH_MODE as "not debug" without triggering
# uninitialized-value warnings in the string comparisons.
my $launchMode = defined($ENV{'PTP_LAUNCH_MODE'}) ? $ENV{'PTP_LAUNCH_MODE'} : '';
my $launchCommand = shift(@ARGV);
if ($launchMode eq 'debug') {
    $debuggerId = $ENV{'PTP_DEBUGGER_ID'};
    $debuggerPath = $ENV{'PTP_DEBUG_EXEC_PATH'};
    # Missing variables behave as empty strings, as before, but silently.
    my $execArgs = defined($ENV{'PTP_DEBUG_EXEC_ARGS'}) ? $ENV{'PTP_DEBUG_EXEC_ARGS'} : '';
    my $jobId = defined($ENV{'PTP_JOBID'}) ? $ENV{'PTP_JOBID'} : '';
    @debuggerArgs = shellwords($execArgs);
    $ROUTING_FILE = getcwd() . "/routes_" . $jobId;
    # Ask the launcher to print the job map that is parsed later to build
    # the routing table.
    push(@ARGV, "-mca", "orte_show_resolved_nodenames", "1", "-display-map");
    push(@debuggerArgs, "--routing_file=$ROUTING_FILE");
    #
    # If PTP_DEBUG_START_MASTER is set then the debugger is asking us to start the master SDM.
    # Otherwise we assume the master SDM is started elsewhere...
    #
    if (exists $ENV{'PTP_DEBUG_START_MASTER'}) {
        $pid = fork();
        # fork() returns undef on failure; undef == 0 is true, so without
        # this check the parent would exec the SDM in place of itself.
        die "$0: cannot fork master SDM: $!\n" unless (defined($pid));
        if ($pid == 0) {
            exec($debuggerPath, "--master", @debuggerArgs)
                or print STDERR "$0: cannot exec master SDM: $!\n";
            exit(1);
        }
        push(@child_pids, $pid);
    }
}
# Set autoflush to pass output as soon as possible. It is enabled before any
# fork() below so that no buffered output is duplicated into a child.
$|=1;
$pid = fork();
# fork() returns undef on failure; undef == 0 is true, so without this check
# the parent would run the child branch.
die "$0: cannot fork: $!\n" unless (defined($pid));
if ( $pid == 0 ) {
    printf("#PTP job_id=%d\n", $$);
    if (defined($launchMode) && $launchMode eq 'debug') {
        # Build the command as a list and run it without a shell, like the
        # non-debug branch does, so arguments containing spaces or shell
        # metacharacters reach the launcher unchanged.
        my @launchCmd = ($launchCommand, @ARGV);
        push(@launchCmd, $debuggerPath) if (defined($debuggerPath) && length($debuggerPath));
        push(@launchCmd, @debuggerArgs);
        # Forking open: the parent reads the new child's STDOUT through IN.
        my $reader = open(IN, "-|");
        if (!defined($reader)) {
            # Do not fall through into the parent's wait/exit code below.
            print STDERR "$0: cannot start $launchCommand: $!\n";
            exit(1);
        }
        if ($reader == 0) {
            # Send stderr down the same pipe (the former "2>&1").
            open(STDERR, ">&STDOUT") || exit(1);
            exec { $launchCmd[0] } @launchCmd
                or print STDERR "$0: cannot exec $launchCommand: $!\n";
            exit(1);
        }
        while ($line=<IN>) {
            chomp($line);
            if ($line=~/=*\s*JOB MAP\s*=*/) {
                # The job map is consumed here and not echoed; it is turned
                # into the routing table instead.
                print "found job map\n" if ($verbose);
                get_job_map();
                generate_routing_file($ROUTING_FILE);
            } else {
                print "$line\n";
            }
        }
        close(IN);
        unlink($ROUTING_FILE);
        exit(0);
    } else {
        exec($launchCommand, @ARGV);
        exit(1);
    }
}
# Parent: wait for every child. The launcher is pushed last, so the exit
# status reported below is the launcher's.
push(@child_pids, $pid);
foreach (@child_pids) {
    waitpid($_, 0);
}
exit($? >> 8);