job.py revision 1916:fe8d4e92c0a7
#!/usr/bin/env python
# Copyright (c) 2005 The Regents of The University of Michigan
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Nathan Binkert
#          Steve Reinhardt
#          Ali Saidi

import errno
import os
import os.path
import shutil
import signal
import socket
import stat
import sys
from os import environ as env
from os.path import join as joinpath, expanduser


def date():
    """Return the current local time as a string.

    The fields are weekday, month, day, time, time zone and year.
    """
    import time
    return time.strftime('%a %b %e %H:%M:%S %Z %Y', time.localtime())


def cleandir(dir):
    """Delete everything inside `dir`, leaving `dir` itself in place.

    The tree is walked bottom-up so that each directory is already
    empty by the time it is removed.
    """
    for root, dirs, files in os.walk(dir, False):
        for name in files:
            os.remove(joinpath(root, name))
        for name in dirs:
            path = joinpath(root, name)
            # os.walk reports symlinks to directories in `dirs` but does
            # not descend into them; os.rmdir fails on a symlink, so
            # unlink the link itself and leave its target alone.
            if os.path.islink(path):
                os.remove(path)
            else:
                os.rmdir(path)


class rsync:
    """Small wrapper that builds and runs an rsync command line."""

    def __init__(self):
        self.sudo = False       # prefix the command with 'sudo'
        self.rsync = 'rsync'    # name or path of the rsync executable
        self.compress = False   # pass -z
        self.archive = True     # pass -a
        self.delete = False     # pass --delete
        self.options = ''       # one extra argument, passed verbatim

    def _command(self, src, dst):
        """Return the argument list that do() would execute."""
        args = []
        if self.sudo:
            args.append('sudo')
        args.append(self.rsync)

        switches = (
            (self.archive, '-a'),
            (self.compress, '-z'),
            (self.delete, '--delete'),
        )
        for enabled, flag in switches:
            if enabled:
                args.append(flag)

        if self.options:
            args.append(self.options)
        args.extend((src, dst))
        return args

    def do(self, src, dst):
        """Run rsync from `src` to `dst` and wait for it to finish.

        Returns whatever os.spawnvp(os.P_WAIT, ...) returns for the
        command.
        """
        args = self._command(src, dst)
        return os.spawnvp(os.P_WAIT, args[0], args)


class JobDir(object):
    """Accessors for a job directory and the small files kept in it."""

    def __init__(self, dir):
        self.dir = dir

    def file(self, filename):
        """Return the path of `filename` inside the job directory."""
        return joinpath(self.dir, filename)

    def create(self):
        """Create the job directory if it does not exist yet.

        Exits the program if the path exists but is not a directory.
        """
        if not os.path.exists(self.dir):
            os.mkdir(self.dir)
        elif not os.path.isdir(self.dir):
            sys.exit('%s is not a directory. Cannot build job' % self.dir)

    def exists(self):
        """Return True if the job directory exists and is a directory."""
        return os.path.isdir(self.dir)

    def clean(self):
        """Remove everything inside the job directory."""
        cleandir(self.dir)

    def hasfile(self, filename):
        """Return True if `filename` is a regular file in the job directory."""
        return os.path.isfile(self.file(filename))

    def _writeline(self, filename, mode, string):
        """Write `string` plus a newline to `filename` opened with `mode`.

        Exits the program on IOError.
        """
        try:
            f = open(self.file(filename), mode)
            try:
                f.write(str(string) + '\n')
                f.flush()
            finally:
                # Release the handle even when the write itself fails.
                f.close()
        except IOError:
            sys.exit(sys.exc_info()[1])

    def echofile(self, filename, string):
        """Replace the contents of `filename` with `string` and a newline.

        Exits the program on IOError.
        """
        self._writeline(filename, 'w', string)

    def rmfile(self, filename):
        """Delete `filename` from the job directory if it is a regular file."""
        filename = self.file(filename)
        if os.path.isfile(filename):
            os.unlink(filename)

    def readval(self, filename):
        """Return the first line of `filename` with surrounding whitespace removed."""
        f = open(self.file(filename), 'r')
        try:
            return f.readline().strip()
        finally:
            f.close()

    def setstatus(self, string):
        """Append `string` as a new line of the '.status' file.

        Exits the program on IOError.
        """
        self._writeline('.status', 'a', string)

    def getstatus(self):
        """Return the first word of the last line of the '.status' file.

        Returns 'none' when the file cannot be opened or is empty.
        """
        try:
            f = open(self.file('.status'), 'r')
        except IOError:
            return 'none'

        # fast forward to the end
        last = None
        try:
            for line in f:
                last = line
        finally:
            f.close()

        if last is None:
            # An empty file carries no status yet.
            return 'none'

        # the first word on the last line is the status; drop the line
        # terminator so a one-word status does not come back with '\n'
        return last.rstrip('\n').split(' ')[0]

    def __str__(self):
        return self.dir


def _spawn(args, output):
    """Fork and exec `args` with stdout and stderr sent to `output`.

    Returns the child's pid in the parent.  The child never returns: it
    either becomes the new program or exits with status 127.
    """
    childpid = os.fork()
    if childpid:
        return childpid

    try:
        sys.stdin.close()
        fd = os.open(output, os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
        os.dup2(fd, sys.stdout.fileno())
        os.dup2(fd, sys.stderr.fileno())
        os.execvp(args[0], args)
    except Exception:
        import traceback
        traceback.print_exc()
        sys.stderr.flush()
    # Leave without unwinding through the parent's code in this copy of
    # the interpreter.
    os._exit(127)


def _wait(childpid):
    """Wait for `childpid` and return its status as given by os.waitpid."""
    while True:
        try:
            thepid, ec = os.waitpid(childpid, 0)
        except OSError:
            # A signal handler interrupting the wait is expected; any
            # other error would repeat forever, so let it propagate.
            if sys.exc_info()[1].errno != errno.EINTR:
                raise
        else:
            return ec


def main():
    """Run one job and record its progress in the job directory.

    Reads PBS_JOBID, PBS_JOBNAME and USER from the environment, and
    fills in ROOTDIR, JOBNAME, JOBFILE, OUTPUT_DIR and POOLJOB for the
    child process when they are not already set.
    """
    rootdir = env.setdefault('ROOTDIR', os.getcwd())
    pbs_jobid = env['PBS_JOBID']
    pbs_jobname = env['PBS_JOBNAME']
    basedir = joinpath(rootdir, 'Base')
    jobname = env.setdefault('JOBNAME', pbs_jobname)
    # Not used here, but the default is placed in the environment.
    env.setdefault('JOBFILE', joinpath(rootdir, 'Test.py'))
    outdir = env.setdefault('OUTPUT_DIR', joinpath(rootdir, jobname))
    env['POOLJOB'] = 'True'

    if os.path.isdir("/work"):
        workbase = "/work"
    else:
        workbase = "/tmp/"

    workdir = joinpath(workbase, '%s.%s' % (env['USER'], pbs_jobid))
    host = socket.gethostname()

    # Equivalent to umask 022: clear group and other write permission.
    os.umask(stat.S_IWGRP | stat.S_IWOTH)

    jobdir = JobDir(outdir)

    started = date()
    jobdir.echofile('.running', started)
    jobdir.rmfile('.queued')
    jobdir.echofile('.host', host)

    jobdir.setstatus('running on %s on %s' % (host, started))

    if os.path.isdir(workdir):
        cleandir(workdir)
    else:
        os.mkdir(workdir)

    # Disabled on purpose by the leading False.
    if False and os.path.isdir('/z/dist'):
        sync = rsync()
        sync.delete = True
        sync.sudo = True
        sync.do('poolfs::dist/m5/', '/z/dist/m5/')

    try:
        os.chdir(workdir)
    except OSError:
        sys.exit(sys.exc_info()[1])

    os.symlink(jobdir.file('output'), 'status.out')

    args = [joinpath(basedir, 'm5'), joinpath(basedir, 'run.py')]
    if not args:
        sys.exit("no arguments")

    sys.stdout.write('starting job... %s\n' % started)
    sys.stdout.write(' '.join(args) + '\n')
    sys.stdout.write('\n')
    # Flush before forking so the child does not inherit buffered text.
    sys.stdout.flush()

    childpid = _spawn(args, jobdir.file("output"))

    def handler(signum, frame):
        # Relay the signal to the child process.
        try:
            os.kill(childpid, signum)
        except OSError:
            # The child may already be gone; that is not an error here.
            if sys.exc_info()[1].errno != errno.ESRCH:
                raise

    relayed = (
        signal.SIGHUP,
        signal.SIGINT,
        signal.SIGQUIT,
        signal.SIGTERM,
        signal.SIGCONT,
        signal.SIGUSR1,
        signal.SIGUSR2,
    )
    for signum in relayed:
        signal.signal(signum, handler)

    # `ec` is the raw status from os.waitpid; any non-zero value counts
    # as a failure.
    ec = _wait(childpid)
    if ec:
        sys.stdout.write('Exit code  %s\n' % ec)
        status = 'failure'
    else:
        status = 'success'

    complete = date()
    sys.stdout.write('\njob complete... %s\n' % complete)
    jobdir.echofile('.%s' % status, complete)
    jobdir.rmfile('.running')
    jobdir.setstatus('%s on %s' % (status, complete))


if __name__ == '__main__':
    main()