qdo revision 11320
12101SN/A#! /usr/bin/env python
22084SN/A
35268Sksewell@umich.edu# Copyright (c) 2004-2005, 2007 The Regents of The University of Michigan
45268Sksewell@umich.edu# All rights reserved.
55268Sksewell@umich.edu#
65268Sksewell@umich.edu# Redistribution and use in source and binary forms, with or without
75268Sksewell@umich.edu# modification, are permitted provided that the following conditions are
85268Sksewell@umich.edu# met: redistributions of source code must retain the above copyright
95268Sksewell@umich.edu# notice, this list of conditions and the following disclaimer;
105268Sksewell@umich.edu# redistributions in binary form must reproduce the above copyright
115268Sksewell@umich.edu# notice, this list of conditions and the following disclaimer in the
125268Sksewell@umich.edu# documentation and/or other materials provided with the distribution;
135268Sksewell@umich.edu# neither the name of the copyright holders nor the names of its
145268Sksewell@umich.edu# contributors may be used to endorse or promote products derived from
155268Sksewell@umich.edu# this software without specific prior written permission.
165268Sksewell@umich.edu#
175268Sksewell@umich.edu# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
185268Sksewell@umich.edu# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
195268Sksewell@umich.edu# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
205268Sksewell@umich.edu# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
215268Sksewell@umich.edu# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
225268Sksewell@umich.edu# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
235268Sksewell@umich.edu# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
245268Sksewell@umich.edu# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
255268Sksewell@umich.edu# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
265268Sksewell@umich.edu# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
275268Sksewell@umich.edu# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
285268Sksewell@umich.edu#
295268Sksewell@umich.edu# Authors: Steve Reinhardt
302754Sksewell@umich.edu#          Ali Saidi
312084SN/A
# Important!
# This script expects a simple "$ " prompt.  If your login shell is anything
# other than sh (which uses that prompt by default), you'll need to add
# something like the following to your bashrc/bash_profile script:
#if [ "$OAR_USER" = "xxxx" ]; then
#   PS1='$ '
#fi
382084SN/A
392084SN/A
402101SN/Aimport sys
412084SN/Aimport os
422084SN/Aimport re
432084SN/Aimport time
442084SN/Aimport optparse
452084SN/A
462084SN/Aimport pexpect
472084SN/A
# Name this script was invoked as; used as the prefix of diagnostics.
progname = os.path.basename(sys.argv[0])

usage = "%prog [options] command [command arguments]"
optparser = optparse.OptionParser(usage=usage)
# Stop option parsing at the first positional argument so that the
# wrapped command's own options are left untouched in 'cmd'.
optparser.disable_interspersed_args()

# (flag, add_option keyword arguments), registered in help-output order.
for flag, settings in (
    ('-e', dict(dest='stderr_file',
                help='command stderr output file')),
    ('-o', dict(dest='stdout_file',
                help='command stdout output file')),
    ('-l', dict(dest='save_log', action='store_true',
                help='save oarsub output log file')),
    ('-N', dict(dest='job_name',
                help='oarsub job name')),
    ('-q', dict(dest='dest_queue',
                help='oarsub destination queue')),
    ('--qwait', dict(dest='oarsub_timeout', type='int',
                     help='oarsub queue wait timeout', default=30*60)),
    ('-t', dict(dest='cmd_timeout', type='int',
                help='command execution timeout', default=600*60)),
):
    optparser.add_option(flag, **settings)

# 'cmd' is the list of leftover arguments: the command to run remotely.
options, cmd = optparser.parse_args()

if not cmd:
    sys.stderr.write("%s: missing command\n" % progname)
    sys.exit(1)
732101SN/A
742101SN/A# If we want to do this, need to add check here to make sure cmd[0] is
752084SN/A# a valid PBS job name, else oarsub will die on us.
762750Sksewell@umich.edu#
772750Sksewell@umich.edu#if not options.job_name:
782084SN/A#    options.job_name = cmd[0]
792084SN/A
# Working directory the remote shell will be asked to change into.
cwd = os.getcwd()

# Deal with systems where /n is a symlink to /.automount: rewrite the
# leading '/.automount/' component to '/n/'.
if cwd.startswith('/.automount/'):
    cwd = '/n/' + cwd[len('/.automount/'):]

# Refuse to run from anywhere outside /n/poolfs.
if not cwd.startswith('/n/poolfs/'):
    sys.stderr.write("Error: current directory must be under /n/poolfs.\n")
    sys.exit(1)
892084SN/A
902084SN/A# The Shell class wraps pexpect.spawn with some handy functions that
912750Sksewell@umich.edu# assume the thing on the other end is a Bourne/bash shell.
class Shell(pexpect.spawn):
    """pexpect.spawn wrapper that drives a Bourne/bash shell session.

    On construction the spawned command is expected to present a '$ '
    prompt; the prompt is then changed to the fixed string 'qdo$ ' so
    that the end of each command's output can be matched reliably.

    Attributes:
        full_output: everything matched by expect() so far, concatenated.
        quick_timeout: timeout in seconds used for short housekeeping
            commands such as 'echo $?'.
    """

    # Regexp to match the shell prompt.  We change the prompt to
    # something fixed and distinctive to make it easier to match
    # reliably.
    prompt_re = re.compile(r'qdo\$ ')

    def __init__(self, cmd):
        """Spawn 'cmd' and wait for its first shell prompt.

        Exits the process with status 1 if the command cannot be spawned
        or if no prompt shows up within options.oarsub_timeout seconds.
        """
        # initialize base pexpect.spawn object
        try:
            pexpect.spawn.__init__(self, cmd)
        except pexpect.ExceptionPexpect as exc:
            # Reported on stderr, like the timeout error below.
            sys.stderr.write("%s: %s\n" % (progname, exc))
            sys.exit(1)
        # full_output accumulates the full output of the session
        self.full_output = ""
        self.quick_timeout = 15
        # wait for a prompt, then change it
        try:
            self.expect(r'\$ ', options.oarsub_timeout)
        except pexpect.TIMEOUT:
            sys.stderr.write("%s: oarsub timed out.\n" % progname)
            self.kill(9)
            self.safe_close()
            sys.exit(1)
        self.do_command('unset PROMPT_COMMAND; PS1="qdo$ "')

    def expect(self, regexp, timeout=-1):
        """Version of expect that updates full_output too.

        Returns whatever pexpect.spawn.expect returns (the index of the
        matched pattern).  Exceptions from the base class propagate
        unchanged, in which case full_output is not updated.
        """
        index = pexpect.spawn.expect(self, regexp, timeout)
        self.full_output += self.before + self.after
        return index

    def do_bare_command(self, cmd, timeout=-1):
        """Issue a command and wait for the next prompt.

        Returns a string containing the output of the command with
        trailing whitespace removed.
        """
        self.sendline(cmd)
        # read back the echo of the command
        self.readline()
        # wait for the next prompt
        self.expect(self.prompt_re, timeout)
        return self.before.rstrip()

    def do_command(self, cmd, timeout=-1):
        """Issue a command, then query its exit status.

        Returns a (string, int) tuple with the command output and the
        status.  Raises ValueError if the reply to 'echo $?' is not an
        integer.
        """
        # do the command itself
        output = self.do_bare_command(cmd, timeout)
        # collect status
        status = int(self.do_bare_command("echo $?", self.quick_timeout))
        return (output, status)

    def dir_exists(self, dirname):
        """Check to see if the given directory exists on the remote side.

        NOTE: dirname is interpolated into the shell command unquoted.
        """
        # Use this session rather than the module-level 'shell' global.
        (output, status) = self.do_command('[ -d %s ]' % dirname,
                                           self.quick_timeout)
        return status == 0

    def safe_close(self):
        """Give the child up to ten seconds to exit, then close.

        Don't actually try to close it right away; wait for it to close
        by itself.  We can't actually kill the pid, which is what close
        would try to do, and if we call wait we could be in the
        unfortunate situation of it printing output right as we call
        wait, so the output is never read and the process never ends.
        """
        # Bounded wait: poll once a second, at most ten times.
        for _ in range(10):
            if not self.isalive():
                break
            time.sleep(1)
        self.close(force=False)
159
160# Spawn the interactive pool job.
161
162# Hack to do link on poolfs... disabled for now since
163# compiler/linker/library versioning problems between poolfs and
164# nodes.  May never work since poolfs is x86-64 and nodes are 32-bit.
165if False and len(cmd) > 50:
166    shell_cmd = 'ssh -t poolfs /bin/sh -l'
167    print "%s: running %s on poolfs" % (progname, cmd[0])
168else:
169    shell_cmd = 'oarsub -I'
170    if options.job_name:
171        shell_cmd += ' -n "%s"' % options.job_name
172    if options.dest_queue:
173        shell_cmd += ' -q ' + options.dest_queue
174    shell_cmd += ' -d %s' % cwd
175
176shell = Shell(shell_cmd)
177
# Defaults so that the cleanup code in 'finally' always has values to
# inspect, even when an exception escapes before the command has run.
# A non-zero status also makes sure the session log is kept in that case.
status = 1
output = None

try:
    # chdir to cwd
    (output, status) = shell.do_command('cd ' + cwd)

    if status != 0:
        raise OSError("Can't chdir to %s" % cwd)

    # wacky hack: sometimes scons will create an output directory then
    # fork a job to generate files in that directory, and the job will
    # get run before the directory creation propagates through NFS.
    # This hack looks for a '-o' option indicating an output file and
    # waits for the corresponding directory to appear if necessary.
    try:
        if 'cc' in cmd[0] or 'g++' in cmd[0]:
            output_dir = os.path.dirname(cmd[cmd.index('-o')+1])
        elif 'm5' in cmd[0]:
            output_dir = cmd[cmd.index('-d')+1]
        else:
            output_dir = None
    except (ValueError, IndexError):
        # no big deal if there's no '-o'/'-d' or if it's the final argument
        output_dir = None

    if output_dir:
        # Poll every 5 seconds, giving up after 90 seconds.
        secs_waited = 0
        while not shell.dir_exists(output_dir) and secs_waited < 90:
            time.sleep(5)
            secs_waited += 5
        if secs_waited > 30:
            print("waited %d seconds for %s" % (secs_waited, output_dir))

    # run command
    if options.stdout_file:
        cmd += ['>', options.stdout_file]
    if options.stderr_file:
        cmd += ['2>', options.stderr_file]
    try:
        (output, status) = shell.do_command(' '.join(cmd), options.cmd_timeout)
    except pexpect.TIMEOUT:
        sys.stderr.write("%s: command timed out after %d seconds.\n"
                         % (progname, options.cmd_timeout))
        shell.sendline('~.') # oarsub/ssh termination escape sequence
        shell.safe_close()
        # Don't print the stale output left over from the 'cd' above.
        output = None
        status = 3
    if output:
        print(output)
finally:
    # end job
    if shell.isalive():
        shell.sendline('exit')
        try:
            shell.expect('Disconnected from OAR job .*')
        except (pexpect.TIMEOUT, pexpect.EOF):
            # Failing to see the disconnect banner must not prevent the
            # log from being written or hide an exception in flight.
            sys.stderr.write("%s: did not see OAR disconnect message.\n"
                             % progname)
        shell.safe_close()

    # if there was an error, log the output even if not requested
    if status != 0 or options.save_log:
        with open('qdo-log.' + str(os.getpid()), 'w') as log:
            log.write(shell.full_output)
del shell

sys.exit(status)
239