qdo revision 11320
12292SN/A#! /usr/bin/env python
22329SN/A
32292SN/A# Copyright (c) 2004-2005, 2007 The Regents of The University of Michigan
42292SN/A# All rights reserved.
52292SN/A#
62292SN/A# Redistribution and use in source and binary forms, with or without
72292SN/A# modification, are permitted provided that the following conditions are
82292SN/A# met: redistributions of source code must retain the above copyright
92292SN/A# notice, this list of conditions and the following disclaimer;
102292SN/A# redistributions in binary form must reproduce the above copyright
112292SN/A# notice, this list of conditions and the following disclaimer in the
122292SN/A# documentation and/or other materials provided with the distribution;
132292SN/A# neither the name of the copyright holders nor the names of its
142292SN/A# contributors may be used to endorse or promote products derived from
152292SN/A# this software without specific prior written permission.
162292SN/A#
172292SN/A# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
182292SN/A# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
192292SN/A# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
202292SN/A# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
212292SN/A# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
222292SN/A# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
232292SN/A# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
242292SN/A# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
252292SN/A# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
262292SN/A# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
272689Sktlim@umich.edu# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
282689Sktlim@umich.edu#
292689Sktlim@umich.edu# Authors: Steve Reinhardt
302292SN/A#          Ali Saidi
312292SN/A
# Important!
# This script expects a simple "$ " prompt.  If your login shell is not sh
# (which uses that prompt by default), you'll need to add something like the
# following to your bashrc/bash_profile script:
#if [ "$OAR_USER" = "xxxx" ]; then
#   PS1='$ '
#fi
382292SN/A
392329SN/A
402292SN/Aimport sys
412292SN/Aimport os
422292SN/Aimport re
432669Sktlim@umich.eduimport time
442669Sktlim@umich.eduimport optparse
452292SN/A
462292SN/Aimport pexpect
472329SN/A
482329SN/Aprogname = os.path.basename(sys.argv[0])
492329SN/A
502329SN/Ausage = "%prog [options] command [command arguments]"
512329SN/Aoptparser = optparse.OptionParser(usage=usage)
522329SN/Aoptparser.allow_interspersed_args=False
532329SN/Aoptparser.add_option('-e', dest='stderr_file',
542329SN/A                     help='command stderr output file')
552329SN/Aoptparser.add_option('-o', dest='stdout_file',
562329SN/A                     help='command stdout output file')
572292SN/Aoptparser.add_option('-l', dest='save_log', action='store_true',
582292SN/A                     help='save oarsub output log file')
592292SN/Aoptparser.add_option('-N', dest='job_name',
602292SN/A                     help='oarsub job name')
612292SN/Aoptparser.add_option('-q', dest='dest_queue',
622292SN/A                     help='oarsub destination queue')
632292SN/Aoptparser.add_option('--qwait', dest='oarsub_timeout', type='int',
642733Sktlim@umich.edu                     help='oarsub queue wait timeout', default=30*60)
652292SN/Aoptparser.add_option('-t', dest='cmd_timeout', type='int',
662292SN/A                     help='command execution timeout', default=600*60)
672292SN/A
682292SN/A(options, cmd) = optparser.parse_args()
692292SN/A
702292SN/Aif cmd == []:
712292SN/A    print >>sys.stderr, "%s: missing command" % progname
722292SN/A    sys.exit(1)
732292SN/A
742292SN/A# If we want to do this, need to add check here to make sure cmd[0] is
752292SN/A# a valid PBS job name, else oarsub will die on us.
762292SN/A#
772292SN/A#if not options.job_name:
782292SN/A#    options.job_name = cmd[0]
792292SN/A
802727Sktlim@umich.educwd = os.getcwd()
812727Sktlim@umich.edu
822727Sktlim@umich.edu# Deal with systems where /n is a symlink to /.automount
832292SN/Aif cwd.startswith('/.automount/'):
842733Sktlim@umich.edu    cwd = cwd.replace('/.automount/', '/n/', 1)
852292SN/A
862292SN/Aif not cwd.startswith('/n/poolfs/'):
872292SN/A    print >>sys.stderr, "Error: current directory must be under /n/poolfs."
882292SN/A    sys.exit(1)
892292SN/A
902348SN/A# The Shell class wraps pexpect.spawn with some handy functions that
912307SN/A# assume the thing on the other end is a Bourne/bash shell.
922307SN/Aclass Shell(pexpect.spawn):
932348SN/A    # Regexp to match the shell prompt.  We change the prompt to
942307SN/A    # something fixed and distinctive to make it easier to match
952307SN/A    # reliably.
962348SN/A    prompt_re = re.compile('qdo\$ ')
972307SN/A
982307SN/A    def __init__(self, cmd):
992292SN/A        # initialize base pexpect.spawn object
1002292SN/A        try:
1012292SN/A            pexpect.spawn.__init__(self, cmd)
1022292SN/A        except pexpect.ExceptionPexpect, exc:
1032292SN/A            print "%s:" % progname, exc
1042292SN/A            sys.exit(1)
1052292SN/A        # full_output accumulates the full output of the session
1062292SN/A        self.full_output = ""
1072292SN/A        self.quick_timeout = 15
1082292SN/A        # wait for a prompt, then change it
1092292SN/A        try:
1102292SN/A            self.expect('\$ ', options.oarsub_timeout)
1112292SN/A        except pexpect.TIMEOUT:
1122292SN/A            print >>sys.stderr, "%s: oarsub timed out." % progname
1132292SN/A            self.kill(9)
1142292SN/A            self.safe_close()
1152292SN/A            sys.exit(1)
1162329SN/A        self.do_command('unset PROMPT_COMMAND; PS1="qdo$ "')
1172292SN/A
1182292SN/A    # version of expect that updates full_output too
1192292SN/A    def expect(self, regexp, timeout = -1):
1202292SN/A        pexpect.spawn.expect(self, regexp, timeout)
1212292SN/A        self.full_output += self.before + self.after
1222292SN/A
1232292SN/A    # Just issue a command and wait for the next prompt.
1242292SN/A    # Returns a string containing the output of the command.
1252292SN/A    def do_bare_command(self, cmd, timeout = -1):
1262292SN/A        global full_output
1272292SN/A        self.sendline(cmd)
1282292SN/A        # read back the echo of the command
1292292SN/A        self.readline()
1302292SN/A        # wait for the next prompt
1312790Sktlim@umich.edu        self.expect(self.prompt_re, timeout)
1322790Sktlim@umich.edu        output = self.before.rstrip()
1332669Sktlim@umich.edu        return output
1342669Sktlim@umich.edu
1352292SN/A    # Issue a command, then query its exit status.
1362292SN/A    # Returns a (string, int) tuple with the command output and the status.
1372292SN/A    def do_command(self, cmd, timeout = -1):
1382292SN/A        # do the command itself
1392292SN/A        output = self.do_bare_command(cmd, timeout)
1402292SN/A        # collect status
1412292SN/A        status = int(self.do_bare_command("echo $?", self.quick_timeout))
1422292SN/A        return (output, status)
1432292SN/A
1442292SN/A    # Check to see if the given directory exists.
1452292SN/A    def dir_exists(self, dirname):
1462292SN/A        (output, status) = shell.do_command('[ -d %s ]' % dirname,
1472292SN/A                                            self.quick_timeout)
1482292SN/A        return status == 0
1492292SN/A
1502292SN/A    # Don't actually try to close it.. just wait until it closes by itself
1512292SN/A    # We can't actually kill the pid which is what it's trying to do, and if
1522292SN/A    # we call wait we could be in an unfortunate situation of it printing input
1532292SN/A    # right as we call wait, so the input is never read and the process never ends
1542292SN/A    def safe_close(self):
1552292SN/A        count = 0
1562292SN/A        while self.isalive() and count < 10:
1572292SN/A            time.sleep(1)
1582329SN/A        self.close(force=False)
1592292SN/A
1602292SN/A# Spawn the interactive pool job.
1612292SN/A
1622348SN/A# Hack to do link on poolfs... disabled for now since
1632292SN/A# compiler/linker/library versioning problems between poolfs and
1642292SN/A# nodes.  May never work since poolfs is x86-64 and nodes are 32-bit.
1652292SN/Aif False and len(cmd) > 50:
1662348SN/A    shell_cmd = 'ssh -t poolfs /bin/sh -l'
1672292SN/A    print "%s: running %s on poolfs" % (progname, cmd[0])
1682292SN/Aelse:
1692292SN/A    shell_cmd = 'oarsub -I'
1702348SN/A    if options.job_name:
1712292SN/A        shell_cmd += ' -n "%s"' % options.job_name
1722292SN/A    if options.dest_queue:
1732292SN/A        shell_cmd += ' -q ' + options.dest_queue
1742292SN/A    shell_cmd += ' -d %s' % cwd
1752292SN/A
1762292SN/Ashell = Shell(shell_cmd)
1772292SN/A
1782292SN/Atry:
1792292SN/A    # chdir to cwd
1802292SN/A    (output, status) = shell.do_command('cd ' + cwd)
1812292SN/A
1822292SN/A    if status != 0:
1832292SN/A        raise OSError, "Can't chdir to %s" % cwd
1842292SN/A
1852292SN/A    # wacky hack: sometimes scons will create an output directory then
1862292SN/A    # fork a job to generate files in that directory, and the job will
1872292SN/A    # get run before the directory creation propagates through NFS.
1882292SN/A    # This hack looks for a '-o' option indicating an output file and
1892292SN/A    # waits for the corresponding directory to appear if necessary.
1902292SN/A    try:
1912292SN/A        if 'cc' in cmd[0] or 'g++' in cmd[0]:
1922292SN/A            output_dir = os.path.dirname(cmd[cmd.index('-o')+1])
1932292SN/A        elif 'm5' in cmd[0]:
1942292SN/A            output_dir = cmd[cmd.index('-d')+1]
1952292SN/A        else:
1962292SN/A            output_dir = None
1972292SN/A    except (ValueError, IndexError):
1982292SN/A        # no big deal if there's no '-o'/'-d' or if it's the final argument
1992292SN/A        output_dir = None
2002292SN/A
2012292SN/A    if output_dir:
2022292SN/A        secs_waited = 0
2032292SN/A        while not shell.dir_exists(output_dir) and secs_waited < 90:
2042292SN/A            time.sleep(5)
2052292SN/A            secs_waited += 5
2062678Sktlim@umich.edu        if secs_waited > 30:
2072678Sktlim@umich.edu            print "waited", secs_waited, "seconds for", output_dir
2082292SN/A
2092292SN/A    # run command
2102698Sktlim@umich.edu    if options.stdout_file:
2112678Sktlim@umich.edu        cmd += ['>', options.stdout_file]
2122678Sktlim@umich.edu    if options.stderr_file:
2132698Sktlim@umich.edu        cmd += ['2>', options.stderr_file]
2142693Sktlim@umich.edu    try:
2152693Sktlim@umich.edu        (output, status) = shell.do_command(' '.join(cmd), options.cmd_timeout)
2162292SN/A    except pexpect.TIMEOUT:
2172292SN/A            print >>sys.stderr, "%s: command timed out after %d seconds." \
2182292SN/A                  % (progname, options.cmd_timeout)
2192693Sktlim@umich.edu            shell.sendline('~.') # oarsub/ssh termination escape sequence
2202693Sktlim@umich.edu            shell.safe_close()
2212693Sktlim@umich.edu            status = 3
2222292SN/A    if output:
2232292SN/A        print output
2242292SN/Afinally:
2252292SN/A    # end job
2262292SN/A    if shell.isalive():
2272292SN/A        shell.sendline('exit')
2282292SN/A        shell.expect('Disconnected from OAR job .*')
2292292SN/A        shell.safe_close()
2302292SN/A
2312329SN/A    # if there was an error, log the output even if not requested
2322329SN/A    if status != 0 or options.save_log:
2332329SN/A        log = file('qdo-log.' + str(os.getpid()), 'w')
2342329SN/A        log.write(shell.full_output)
2352292SN/A        log.close()
2362292SN/Adel shell
2372733Sktlim@umich.edu
2382292SN/Asys.exit(status)
2392292SN/A