job.py revision 1881:fc205a7edd58
1#!/usr/bin/env python
2# Copyright (c) 2005 The Regents of The University of Michigan
3# All rights reserved.
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions are
7# met: redistributions of source code must retain the above copyright
8# notice, this list of conditions and the following disclaimer;
9# redistributions in binary form must reproduce the above copyright
10# notice, this list of conditions and the following disclaimer in the
11# documentation and/or other materials provided with the distribution;
12# neither the name of the copyright holders nor the names of its
13# contributors may be used to endorse or promote products derived from
14# this software without specific prior written permission.
15#
16# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27#
28# Authors: Nathan Binkert
29#          Steve Reinhardt
30#          Ali Saidi
31
32import os, os.path, shutil, signal, socket, sys
33from os import environ as env
34from os.path import join as joinpath, expanduser
35
def date():
    """Return the current local time formatted as a timestamp string.

    The string is produced by time.strftime with the format
    '%a %b %e %H:%M:%S %Z %Y'.
    """
    from time import localtime, strftime

    now = localtime()
    return strftime('%a %b %e %H:%M:%S %Z %Y', now)
39
def cleandir(dir):
    """Remove everything inside *dir*, leaving *dir* itself in place.

    The tree is walked bottom-up so that every directory has already been
    emptied by the time it is removed.

    dir -- path of the directory whose contents are deleted
    """
    for root, dirs, files in os.walk(dir, topdown=False):
        for name in files:
            os.remove(joinpath(root, name))
        for name in dirs:
            path = joinpath(root, name)
            # os.walk reports symlinks that point at directories in `dirs`,
            # but os.rmdir refuses to remove a symlink.  Unlink it instead:
            # the link goes away and whatever it points to is left untouched.
            if os.path.islink(path):
                os.remove(path)
            else:
                os.rmdir(path)
46
class rsync:
    """Small wrapper that assembles and runs an rsync command line.

    Behaviour is selected by setting instance attributes after
    construction; do() then builds the argument list and executes it.
    """

    def __init__(self):
        self.sudo = False       # prefix the command with 'sudo'
        self.rsync = 'rsync'    # program to execute
        self.compress = False   # add -z
        self.archive = True     # add -a
        self.delete = False     # add --delete
        self.options = ''       # extra argument, added as a single word

    def do(self, src, dst):
        """Run the configured command for *src* -> *dst* and wait for it.

        Returns the value of os.spawnvp(os.P_WAIT, ...) for the command.
        """
        command = []
        if self.sudo:
            command.append('sudo')
        command.append(self.rsync)

        # Each switch is emitted, in this order, when its attribute is true.
        switches = (
            (self.archive, '-a'),
            (self.compress, '-z'),
            (self.delete, '--delete'),
        )
        command.extend([flag for enabled, flag in switches if enabled])

        if len(self.options):
            command.append(self.options)
        command.extend([src, dst])

        program = command[0]
        return os.spawnvp(os.P_WAIT, program, command)
74
class JobDir(object):
    """Helper around a job's directory and the small files kept inside it.

    Besides generic file helpers it maintains the append-only '.status'
    file: setstatus() adds a line and getstatus() reports the first word of
    the most recent line.
    """

    def __init__(self, dir):
        """Remember *dir* as the job directory; nothing is created here."""
        self.dir = dir

    def file(self, filename):
        """Return the path of *filename* inside the job directory."""
        return joinpath(self.dir, filename)

    def create(self):
        """Create the job directory unless it already exists.

        Exits the program if the path exists but is not a directory.
        """
        if os.path.exists(self.dir):
            if not os.path.isdir(self.dir):
                sys.exit('%s is not a directory.  Cannot build job' % self.dir)
        else:
            os.mkdir(self.dir)

    def exists(self):
        """Return True if the job directory exists and is a directory."""
        return os.path.isdir(self.dir)

    def clean(self):
        """Delete the contents of the job directory via cleandir()."""
        cleandir(self.dir)

    def hasfile(self, filename):
        """Return True if *filename* is a regular file in the job directory."""
        return os.path.isfile(self.file(filename))

    def _write(self, filename, mode, string):
        """Write *string* plus a newline to *filename* opened with *mode*.

        The file is always closed, even if the write fails.  Any IOError
        terminates the program with the error as the exit message.
        """
        try:
            with open(self.file(filename), mode) as f:
                f.write('%s\n' % (string,))
        except IOError as e:
            sys.exit(e)

    def echofile(self, filename, string):
        """Replace the contents of *filename* with *string* and a newline.

        Exits the program if the file cannot be written.
        """
        self._write(filename, 'w', string)

    def rmfile(self, filename):
        """Delete *filename* from the job directory if it is a regular file."""
        filename = self.file(filename)
        if os.path.isfile(filename):
            os.unlink(filename)

    def readval(self, filename):
        """Return the first line of *filename* with whitespace stripped.

        Errors opening or reading the file propagate to the caller.
        """
        with open(self.file(filename), 'r') as f:
            return f.readline().strip()

    def setstatus(self, string):
        """Append *string* as a new line of the '.status' file.

        Exits the program if the file cannot be written.
        """
        self._write('.status', 'a', string)

    def getstatus(self):
        """Return the first word of the last line of the '.status' file.

        Returns 'none' when the file cannot be opened, is empty, or its
        last line contains no word.
        """
        try:
            f = open(self.file('.status'), 'r')
        except IOError:
            return 'none'

        # fast forward to the end, remembering the most recent line
        last = ''
        with f:
            for line in f:
                last = line

        # split() without arguments also discards the trailing newline, so
        # a one-word status line is returned without it
        words = last.split()
        if not words:
            return 'none'
        return words[0]

    def __str__(self):
        return self.dir
145
if __name__ == '__main__':
    # Script entry point: run the m5 binary for one PBS job inside a
    # per-job scratch directory, forward signals to it, and record the
    # job's progress in marker files in the job's output directory.

    # Only this section needs errno, so it is imported here.
    import errno

    rootdir = env.setdefault('ROOTDIR', os.getcwd())
    pbs_jobid = env['PBS_JOBID']
    pbs_jobname = env['PBS_JOBNAME']
    basedir = joinpath(rootdir, 'Base')
    jobname = env.setdefault('JOBNAME', pbs_jobname)
    # JOBFILE and POOLJOB are not read again below; they are put into the
    # environment, which the process started with execvp inherits.
    jobfile = env.setdefault('JOBFILE', joinpath(basedir, 'test.py'))
    outdir = env.setdefault('OUTPUT_DIR', joinpath(rootdir, jobname))
    env['POOLJOB'] = 'True'

    # Use /work for scratch space when it exists, otherwise /tmp.
    if os.path.isdir("/work"):
        workbase = "/work"
    else:
        workbase = "/tmp/"

    workdir = joinpath(workbase, '%s.%s' % (env['USER'], pbs_jobid))
    host = socket.gethostname()

    os.umask(0o022)

    jobdir = JobDir(outdir)

    # Marker files: the job is now running rather than queued.
    started = date()
    jobdir.echofile('.running', started)
    jobdir.rmfile('.queued')
    jobdir.echofile('.pbs_jobid', pbs_jobid)
    # Record the job *name* here (this used to write the job id again).
    jobdir.echofile('.pbs_jobname', pbs_jobname)
    jobdir.echofile('.host', host)

    jobdir.setstatus('running on %s on %s' % (host, started))

    # Start from an empty scratch directory.
    if os.path.isdir(workdir):
        cleandir(workdir)
    else:
        os.mkdir(workdir)

    # Disabled by the leading `False`; kept as in the original.
    if False and os.path.isdir('/z/dist'):
        sync = rsync()
        sync.delete = True
        sync.sudo = True
        sync.do('poolfs::dist/m5/', '/z/dist/m5/')

    try:
        os.chdir(workdir)
    except OSError as e:
        sys.exit(e)

    # status.out in the scratch directory points at the job's output file.
    os.symlink(jobdir.file('output'), 'status.out')

    args = [ joinpath(basedir, 'm5'), joinpath(basedir, 'run.py') ]

    print('starting job... %s' % started)
    print(' '.join(args))
    print('')
    sys.stdout.flush()

    childpid = os.fork()
    if not childpid:
        # Child: send stdout/stderr to the job's output file and execute
        # the command.  os.execvp only returns by raising, so whatever
        # goes wrong the child must exit here instead of falling through
        # into the parent's bookkeeping code below.
        try:
            sys.stdin.close()
            fd = os.open(jobdir.file("output"),
                         os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
            os.dup2(fd, sys.stdout.fileno())
            os.dup2(fd, sys.stderr.fileno())
            os.execvp(args[0], args)
        except Exception as e:
            sys.stderr.write('cannot run %s: %s\n' % (args[0], e))
            sys.stderr.flush()
        finally:
            os._exit(127)

    def handler(signum, frame):
        # Pass any signal received by this wrapper on to the child.
        if childpid != 0:
            os.kill(childpid, signum)

    # SIGSTOP is deliberately not in this list: it cannot be caught, and
    # asking signal.signal() to install a handler for it raises an error.
    for signum in (signal.SIGHUP, signal.SIGINT, signal.SIGQUIT,
                   signal.SIGTERM, signal.SIGCONT, signal.SIGUSR1,
                   signal.SIGUSR2):
        signal.signal(signum, handler)

    # Wait for the child.  A forwarded signal can interrupt waitpid with
    # EINTR, in which case the wait is simply retried; any other error
    # cannot be fixed by retrying, so it is reported as a failed job.
    while True:
        try:
            thepid, ec = os.waitpid(childpid, 0)
        except OSError as e:
            if e.errno == errno.EINTR:
                continue
            print('waitpid failed: %s' % e)
            status = 'failure'
            break

        # ec is the wait status reported by os.waitpid; zero means the
        # child exited normally with exit code 0.
        if ec:
            print('Exit code  %s' % ec)
            status = 'failure'
        else:
            status = 'success'
        break

    complete = date()
    print('\njob complete... %s' % complete)
    jobdir.echofile('.%s' % status, complete)
    jobdir.rmfile('.running')
    jobdir.setstatus('%s on %s' % (status, complete))
245