#! /usr/bin/env python
# qdo revision 11320

# Copyright (c) 2004-2005, 2007 The Regents of The University of Michigan
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Steve Reinhardt
#          Ali Saidi

# Important!
# This script expects a simple $ prompt, if you are using a shell other than
# sh which defaults to this you'll need to add something like the following
# to your bashrc/bash_profile script:
#if [ "$OAR_USER" = "xxxx" ]; then
#   PS1='$ '
#fi


import sys
import os
import re
import time
import optparse

import pexpect

# Name this script was invoked as; used as the prefix of diagnostics.
progname = os.path.basename(sys.argv[0])

usage = "%prog [options] command [command arguments]"
optparser = optparse.OptionParser(usage=usage)
# Stop option parsing at the first positional argument so that any options
# following the command name stay in `cmd` instead of being consumed here.
optparser.disable_interspersed_args()
optparser.add_option('-e', dest='stderr_file',
                     help='command stderr output file')
optparser.add_option('-o', dest='stdout_file',
                     help='command stdout output file')
optparser.add_option('-l', dest='save_log', action='store_true',
                     help='save oarsub output log file')
optparser.add_option('-N', dest='job_name',
                     help='oarsub job name')
optparser.add_option('-q', dest='dest_queue',
                     help='oarsub destination queue')
# Both timeouts are integers that are handed to pexpect as timeout values.
optparser.add_option('--qwait', dest='oarsub_timeout', type='int',
                     help='oarsub queue wait timeout', default=30*60)
optparser.add_option('-t', dest='cmd_timeout', type='int',
                     help='command execution timeout', default=600*60)

# `options` holds the parsed flags; `cmd` is the list of remaining arguments
# (the command to run and its own arguments).
(options, cmd) = optparser.parse_args()

if not cmd:
    sys.stderr.write("%s: missing command\n" % progname)
    sys.exit(1)

# If we want to do this, need to add check here to make sure cmd[0] is
# a valid PBS job name, else oarsub will die on us.
#
#if not options.job_name:
#    options.job_name = cmd[0]

cwd = os.getcwd()

# Deal with systems where /n is a symlink to /.automount
if cwd.startswith('/.automount/'):
    cwd = cwd.replace('/.automount/', '/n/', 1)

if not cwd.startswith('/n/poolfs/'):
    sys.stderr.write("Error: current directory must be under /n/poolfs.\n")
    sys.exit(1)


# The Shell class wraps pexpect.spawn with some handy functions that
# assume the thing on the other end is a Bourne/bash shell.
class Shell(pexpect.spawn):
    """pexpect.spawn subclass for driving an interactive Bourne/bash shell.

    The constructor spawns the given command, waits for a '$ ' prompt and
    then switches the prompt to the fixed string matched by `prompt_re`.
    Everything matched through `expect` is accumulated in `full_output`.
    """

    # Regexp to match the shell prompt.  We change the prompt to
    # something fixed and distinctive to make it easier to match
    # reliably.
    prompt_re = re.compile(r'qdo\$ ')

    def __init__(self, cmd):
        """Spawn `cmd` and prepare the remote shell for scripted use.

        Exits the process with status 1 if the spawn fails or if no
        '$ ' prompt shows up within options.oarsub_timeout.
        """
        # initialize base pexpect.spawn object
        try:
            pexpect.spawn.__init__(self, cmd)
        except pexpect.ExceptionPexpect as exc:
            print("%s: %s" % (progname, exc))
            sys.exit(1)
        # full_output accumulates the full output of the session
        self.full_output = ""
        # timeout used for commands that should return almost immediately
        self.quick_timeout = 15
        # wait for a prompt, then change it
        try:
            self.expect(r'\$ ', options.oarsub_timeout)
        except pexpect.TIMEOUT:
            sys.stderr.write("%s: oarsub timed out.\n" % progname)
            self.kill(9)
            self.safe_close()
            sys.exit(1)
        self.do_command('unset PROMPT_COMMAND; PS1="qdo$ "')

    # version of expect that updates full_output too
    def expect(self, regexp, timeout=-1):
        """Wait for `regexp`, appending the consumed text to full_output.

        Returns whatever pexpect.spawn.expect returns (the index of the
        matched pattern); exceptions raised by it propagate unchanged.
        """
        index = pexpect.spawn.expect(self, regexp, timeout)
        self.full_output += self.before + self.after
        return index

    # Just issue a command and wait for the next prompt.
    # Returns a string containing the output of the command.
    def do_bare_command(self, cmd, timeout=-1):
        """Send `cmd`, wait for the next prompt and return its output.

        The returned string is the text seen before the prompt with
        trailing whitespace stripped.
        """
        self.sendline(cmd)
        # read back the echo of the command
        self.readline()
        # wait for the next prompt
        self.expect(self.prompt_re, timeout)
        return self.before.rstrip()

    # Issue a command, then query its exit status.
    # Returns a (string, int) tuple with the command output and the status.
    def do_command(self, cmd, timeout=-1):
        """Run `cmd` and return an (output, exit_status) tuple.

        The status is obtained by running "echo $?" right after the
        command and converting its output with int().
        """
        # do the command itself
        output = self.do_bare_command(cmd, timeout)
        # collect status
        status = int(self.do_bare_command("echo $?", self.quick_timeout))
        return (output, status)

    # Check to see if the given directory exists.
    def dir_exists(self, dirname):
        """Return True if `[ -d dirname ]` succeeds in this shell."""
        # Use this instance rather than the module-level `shell` global so
        # the method works on whichever Shell object it is called on.
        (output, status) = self.do_command('[ -d %s ]' % dirname,
                                           self.quick_timeout)
        return status == 0

    # Don't actually try to close it.. just wait until it closes by itself
    # We can't actually kill the pid which is what it's trying to do, and if
    # we call wait we could be in an unfortunate situation of it printing input
    # right as we call wait, so the input is never read and the process never
    # ends
    def safe_close(self):
        """Give the child up to ten one-second polls to exit, then close."""
        count = 0
        while self.isalive() and count < 10:
            time.sleep(1)
            # Without this increment the bound above would never trigger and
            # the loop would spin for as long as the child stays alive.
            count += 1
        self.close(force=False)


# Spawn the interactive pool job.

# Hack to do link on poolfs... disabled for now since
# compiler/linker/library versioning problems between poolfs and
# nodes.  May never work since poolfs is x86-64 and nodes are 32-bit.
if False and len(cmd) > 50:
    shell_cmd = 'ssh -t poolfs /bin/sh -l'
    print("%s: running %s on poolfs" % (progname, cmd[0]))
else:
    shell_cmd = 'oarsub -I'
    if options.job_name:
        shell_cmd += ' -n "%s"' % options.job_name
    if options.dest_queue:
        shell_cmd += ' -q ' + options.dest_queue
    shell_cmd += ' -d %s' % cwd

shell = Shell(shell_cmd)

# Pessimistic default: if an exception escapes before any command has
# reported a status, the cleanup code below still has a value to test (and
# treats the run as failed, so the session log is saved).
status = 1

try:
    # chdir to cwd
    (output, status) = shell.do_command('cd ' + cwd)

    if status != 0:
        raise OSError("Can't chdir to %s" % cwd)

    # wacky hack: sometimes scons will create an output directory then
    # fork a job to generate files in that directory, and the job will
    # get run before the directory creation propagates through NFS.
    # This hack looks for a '-o' option indicating an output file and
    # waits for the corresponding directory to appear if necessary.
    try:
        if 'cc' in cmd[0] or 'g++' in cmd[0]:
            output_dir = os.path.dirname(cmd[cmd.index('-o')+1])
        elif 'm5' in cmd[0]:
            output_dir = cmd[cmd.index('-d')+1]
        else:
            output_dir = None
    except (ValueError, IndexError):
        # no big deal if there's no '-o'/'-d' or if it's the final argument
        output_dir = None

    if output_dir:
        secs_waited = 0
        while not shell.dir_exists(output_dir) and secs_waited < 90:
            time.sleep(5)
            secs_waited += 5
        if secs_waited > 30:
            print("waited %d seconds for %s" % (secs_waited, output_dir))

    # run command
    if options.stdout_file:
        cmd += ['>', options.stdout_file]
    if options.stderr_file:
        cmd += ['2>', options.stderr_file]
    try:
        (output, status) = shell.do_command(' '.join(cmd), options.cmd_timeout)
    except pexpect.TIMEOUT:
        sys.stderr.write("%s: command timed out after %d seconds.\n"
                         % (progname, options.cmd_timeout))
        shell.sendline('~.') # oarsub/ssh termination escape sequence
        shell.safe_close()
        status = 3
        # `output` still holds the result of the earlier 'cd'; do not print
        # it as if it came from the command that timed out.
        output = ''
    if output:
        print(output)
finally:
    # end job
    if shell.isalive():
        shell.sendline('exit')
        shell.expect('Disconnected from OAR job .*')
        shell.safe_close()

    # if there was an error, log the output even if not requested
    if status != 0 or options.save_log:
        with open('qdo-log.' + str(os.getpid()), 'w') as log:
            log.write(shell.full_output)
del shell

sys.exit(status)