job.py revision 1881:fc205a7edd58
#!/usr/bin/env python
# Copyright (c) 2005 The Regents of The University of Michigan
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Nathan Binkert
#          Steve Reinhardt
#          Ali Saidi

import errno
import os, os.path, shutil, signal, socket, sys
from os import environ as env
from os.path import join as joinpath, expanduser


def date():
    """Return the current local time as a '%a %b %e %H:%M:%S %Z %Y' string."""
    import time
    return time.strftime('%a %b %e %H:%M:%S %Z %Y', time.localtime())


def cleandir(dir):
    """Delete everything inside *dir*, leaving *dir* itself in place.

    The tree is walked bottom-up so that every directory is already empty
    by the time it is removed.
    """
    for root, dirs, files in os.walk(dir, False):
        for name in files:
            os.remove(joinpath(root, name))
        for name in dirs:
            path = joinpath(root, name)
            # os.walk reports symlinks that point at directories in `dirs`
            # without descending into them; os.rmdir fails on a symlink, so
            # unlink the link itself and leave its target untouched.
            if os.path.islink(path):
                os.remove(path)
            else:
                os.rmdir(path)


class rsync:
    """Small wrapper that builds and runs an rsync command line.

    Attributes (all may be changed after construction):
      sudo     -- prefix the command with 'sudo'
      rsync    -- name of the rsync executable
      compress -- pass '-z'
      archive  -- pass '-a'
      delete   -- pass '--delete'
      options  -- extra option string, passed as ONE argument if non-empty
    """

    def __init__(self):
        self.sudo = False
        self.rsync = 'rsync'
        self.compress = False
        self.archive = True
        self.delete = False
        self.options = ''

    def do(self, src, dst):
        """Run rsync from *src* to *dst* and return os.spawnvp's result."""
        args = []
        if self.sudo:
            args.append('sudo')

        args.append(self.rsync)
        if self.archive:
            args.append('-a')
        if self.compress:
            args.append('-z')
        if self.delete:
            args.append('--delete')
        if len(self.options):
            args.append(self.options)
        args.append(src)
        args.append(dst)

        return os.spawnvp(os.P_WAIT, args[0], args)


class JobDir(object):
    """A job's output directory and the small marker files kept inside it."""

    def __init__(self, dir):
        self.dir = dir

    def file(self, filename):
        """Return the path of *filename* inside the job directory."""
        return joinpath(self.dir, filename)

    def create(self):
        """Create the job directory; exit if the path exists but is not a
        directory."""
        if os.path.exists(self.dir):
            if not os.path.isdir(self.dir):
                sys.exit('%s is not a directory. Cannot build job' % self.dir)
        else:
            os.mkdir(self.dir)

    def exists(self):
        """Return True if the job directory exists and is a directory."""
        return os.path.isdir(self.dir)

    def clean(self):
        """Remove everything inside the job directory."""
        cleandir(self.dir)

    def hasfile(self, filename):
        """Return True if *filename* is a regular file in the job directory."""
        return os.path.isfile(self.file(filename))

    def echofile(self, filename, string):
        """Overwrite *filename* with *string* followed by a newline.

        Exits the process (via sys.exit) if the file cannot be written.
        """
        filename = self.file(filename)
        try:
            # 'with' guarantees the handle is closed even if the write fails
            with open(filename, 'w') as f:
                f.write('%s\n' % (string,))
        except IOError as e:
            sys.exit(e)

    def rmfile(self, filename):
        """Delete *filename* from the job directory if it is a regular file."""
        filename = self.file(filename)
        if os.path.isfile(filename):
            os.unlink(filename)

    def readval(self, filename):
        """Return the first line of *filename*, stripped of whitespace."""
        filename = self.file(filename)
        with open(filename, 'r') as f:
            return f.readline().strip()

    def setstatus(self, string):
        """Append *string* as a new line to the '.status' file.

        Exits the process (via sys.exit) if the file cannot be written.
        """
        filename = self.file('.status')
        try:
            with open(filename, 'a') as f:
                f.write('%s\n' % (string,))
        except IOError as e:
            sys.exit(e)

    def getstatus(self):
        """Return the first word of the last line of the '.status' file.

        Returns 'none' when the file cannot be opened or is empty.
        """
        filename = self.file('.status')
        try:
            f = open(filename, 'r')
        except IOError:
            return 'none'

        # fast forward to the end
        last = None
        try:
            for last in f:
                pass
        finally:
            f.close()

        # an empty file has no status yet
        if last is None:
            return 'none'

        # the first word on the last line is the status
        return last.strip().split(' ')[0]

    def __str__(self):
        return self.dir


if __name__ == '__main__':
    # Job parameters come from the environment. setdefault() also writes the
    # fallback back into os.environ, so the child process sees the same values.
    rootdir = env.setdefault('ROOTDIR', os.getcwd())
    pbs_jobid = env['PBS_JOBID']
    pbs_jobname = env['PBS_JOBNAME']
    basedir = joinpath(rootdir, 'Base')
    jobname = env.setdefault('JOBNAME', pbs_jobname)
    # not used below; kept because it sets JOBFILE in the environment
    jobfile = env.setdefault('JOBFILE', joinpath(basedir, 'test.py'))
    outdir = env.setdefault('OUTPUT_DIR', joinpath(rootdir, jobname))
    env['POOLJOB'] = 'True'

    if os.path.isdir("/work"):
        workbase = "/work"
    else:
        workbase = "/tmp/"

    workdir = joinpath(workbase, '%s.%s' % (env['USER'], pbs_jobid))
    host = socket.gethostname()

    os.umask(0o022)

    jobdir = JobDir(outdir)

    # Record that the job has left the queue and where it is running.
    started = date()
    jobdir.echofile('.running', started)
    jobdir.rmfile('.queued')
    jobdir.echofile('.pbs_jobid', pbs_jobid)
    jobdir.echofile('.pbs_jobname', pbs_jobname)
    jobdir.echofile('.host', host)

    jobdir.setstatus('running on %s on %s' % (host, started))

    # Start from an empty scratch directory.
    if os.path.isdir(workdir):
        cleandir(workdir)
    else:
        os.mkdir(workdir)

    # NOTE(review): this sync is switched off by the literal `False`;
    # left exactly as found.
    if False and os.path.isdir('/z/dist'):
        sync = rsync()
        sync.delete = True
        sync.sudo = True
        sync.do('poolfs::dist/m5/', '/z/dist/m5/')

    try:
        os.chdir(workdir)
    except OSError as e:
        sys.exit(e)

    os.symlink(jobdir.file('output'), 'status.out')

    args = [ joinpath(basedir, 'm5'), joinpath(basedir, 'run.py') ]

    print('starting job... %s' % started)
    print(' '.join(args))
    print('')
    sys.stdout.flush()

    childpid = os.fork()
    if not childpid:
        # Child: redirect stdout/stderr to the job's output file and exec
        # the command.  If anything fails, leave via os._exit so the child
        # never falls through into the parent's bookkeeping below.
        try:
            sys.stdin.close()
            fd = os.open(jobdir.file("output"),
                         os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
            os.dup2(fd, sys.stdout.fileno())
            os.dup2(fd, sys.stderr.fileno())
            os.execvp(args[0], args)
        except OSError as e:
            sys.stderr.write('%s\n' % (e,))
        finally:
            os._exit(127)

    def handler(signum, frame):
        """Forward a signal received by this wrapper to the child."""
        if childpid != 0:
            try:
                os.kill(childpid, signum)
            except OSError:
                # child is already gone; nothing to forward to
                pass

    # SIGSTOP (like SIGKILL) cannot be caught, and asking for a handler
    # raises an error, so it is not in this list.
    for forwarded in (signal.SIGHUP, signal.SIGINT, signal.SIGQUIT,
                      signal.SIGTERM, signal.SIGCONT, signal.SIGUSR1,
                      signal.SIGUSR2):
        signal.signal(forwarded, handler)

    status = None
    while status is None:
        try:
            thepid, ec = os.waitpid(childpid, 0)
        except OSError as e:
            # A forwarded signal interrupts the wait; just wait again.
            if e.errno == errno.EINTR:
                continue
            # Any other error will not go away by retrying.
            print('waitpid failed: %s' % (e,))
            status = 'failure'
        else:
            if ec:
                print('Exit code  %s' % (ec,))
                status = 'failure'
            else:
                status = 'success'

    complete = date()
    print('\njob complete... %s' % complete)
    jobdir.echofile('.%s' % status, complete)
    jobdir.rmfile('.running')
    jobdir.setstatus('%s on %s' % (status, complete))