#! /bin/bash
# gem5-dist.sh revision 11291:9d2364203316

#
# Copyright (c) 2015 ARM Limited
# All rights reserved
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder.  You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Copyright (c) 2015 University of Illinois Urbana Champaign
# All rights reserved
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Gabor Dozsa
#          Mohammad Alian


# This is a wrapper script to run dist gem5 simulations.
# See usage_func() below for hints on how to use it. Also,
# there are some examples in the util/dist directory (e.g.
# see util/dist/test-2nodes-AArch64.sh).
#
#
# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
# environment variable (which is what LSF does by default).
# E.g. LSB_MCPU_HOSTS="hname1 2 hname2 4" means we have altogether 6 slots
# allocated to launch the gem5 processes, 2 of them are on host hname1
# and 4 of them are on host hname2.
# If the LSB_MCPU_HOSTS environment variable is not defined then we launch
# all processes on the localhost.
#
# Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
# boot params. The total number of gem5 processes is also passed in.
# These values can be used in the boot script to configure the MAC/IP
# addresses - among other things (see util/dist/bootscript.rcS).
#
# Each gem5 process will create an m5out.$GEM5_RANK directory for
# the usual output files. Furthermore, there will be a separate log file
# for each ssh session (we use ssh to start gem5 processes) and one for
# the switch. These are called log.$GEM5_RANK and log.switch.
#


# Print the command line help text to stdout.
#
# Globals:   none ($0 is used for the script name)
# Arguments: none
#
# The short option names shown here are the ones the option parser really
# accepts (-s for the switch config, -f for the fullsystem config).
usage_func ()
{
    cat <<EOF
Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch]  [--sw-args sw_args] [-f fullsystem]  [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe
     -debug    : debug mode (start gem5 in gdb)
     nnodes    : number of gem5 processes
     rundir    : run simulation under this path. If not specified, current dir will be used
     ckptdir   : dump/restore checkpoints to/from this path. If not specified, current dir will be used
     fullsystem: fullsystem config file
     fs_args   : fullsystem config specific argument list: arg1 arg2 ...
     port      : switch listen port
     switch    : switch config file
     sw_args   : switch config specific argument list: arg1 arg2 ...
     conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ...
     gem5_exe  : gem5 executable (full path required)
     m5_args   : common m5 argument list (e.g. debug flags): arg1 arg2 ...
Note: if no LSF slots allocation is found all processes are launched on the localhost.
EOF
}

# Process (optional) command line options.
#
# parse_cmdline ARG...
#
# Globals written: GEM5_DEBUG, NNODES, RUN_DIR, CKPT_DIR, SW_PORT, SW_CONFIG,
#                  FS_CONFIG, GEM5_EXE, FS_ARGS, SW_ARGS, CF_ARGS, M5_ARGS,
#                  CUR_ARGS
# Exits:   1 on an unknown argument or on an option that lacks its value,
#          -1 on a pass-through option seen before any --xx-args selector.
#
# The four *_ARGS strings start out as a single space; pass-through options
# are appended to whichever list was selected last by --sw-args, --fs-args,
# --cf-args or --m5-args.
parse_cmdline ()
{
    FS_ARGS=" "
    SW_ARGS=" "
    CF_ARGS=" "
    M5_ARGS=" "
    # Never inherit the target list name from the environment.
    CUR_ARGS=""
    while (($# > 0))
    do
        case "x$1" in
            x-debug)
                GEM5_DEBUG="-debug"
                shift 1
                ;;
            x-n|x-nodes|x-r|x-rundir|x-c|x-ckptdir|x-p|x-port|x-s|x-switch|x-f|x-fullsystem|x-x)
                # "shift 2" with a single remaining argument fails without
                # shifting anything, which would loop forever: reject the
                # dangling option up front.
                if (($# < 2))
                then
                    echo "Missing value for option: $1"
                    usage_func
                    exit 1
                fi
                case "$1" in
                    -n|-nodes)      NNODES=$2 ;;
                    -r|-rundir)     RUN_DIR=$2 ;;
                    -c|-ckptdir)    CKPT_DIR=$2 ;;
                    -p|-port)       SW_PORT=$2 ;;
                    -s|-switch)     SW_CONFIG=$2 ;;
                    -f|-fullsystem) FS_CONFIG=$2 ;;
                    -x)             GEM5_EXE=$2 ;;
                esac
                shift 2
                ;;
            x--sw-args)
                CUR_ARGS="SW_ARGS"
                shift 1
                ;;
            x--fs-args)
                CUR_ARGS="FS_ARGS"
                shift 1
                ;;
            x--cf-args)
                CUR_ARGS="CF_ARGS"
                shift 1
                ;;
            x--m5-args)
                CUR_ARGS="M5_ARGS"
                shift 1
                ;;
            x-*)
                [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit -1; }
                # printf -v appends the words literally; unlike eval it never
                # re-evaluates quotes or substitutions found in the arguments.
                case "x$2" in
                    x-*|x)
                        # flag without a value
                        printf -v "$CUR_ARGS" '%s %s' "${!CUR_ARGS}" "$1"
                        shift 1
                        ;;
                    *)
                        # flag followed by its value
                        printf -v "$CUR_ARGS" '%s %s %s' "${!CUR_ARGS}" "$1" "$2"
                        shift 2
                        ;;
                esac
                ;;
            *)
                echo "Unknown arg: $1"
                usage_func
                exit 1
                ;;
        esac
    done
}

parse_cmdline "$@"

# Default values to use (in case they are not defined as command line options)
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
DEFAULT_SW_CONFIG=$M5_PATH/configs/example/sw.py
DEFAULT_SW_PORT=2200

# ${VAR:=value} assigns only when VAR is unset or empty.
: "${FS_CONFIG:=$DEFAULT_FS_CONFIG}"
: "${SW_CONFIG:=$DEFAULT_SW_CONFIG}"
: "${SW_PORT:=$DEFAULT_SW_PORT}"
: "${NNODES:=2}"
: "${RUN_DIR:=$(pwd)}"
: "${CKPT_DIR:=$(pwd)}"

# Check that the config files and the gem5 executable we need exist
[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; }
[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; }
[ -x "$GEM5_EXE" ]  || { echo "Executable ${GEM5_EXE} not found"; exit 1; }
# Make sure that RUN_DIR exists. Every log file is written below it, so stop
# right away if it cannot be created instead of failing later on.
mkdir -p -- "$RUN_DIR" || { echo "Cannot create run directory ${RUN_DIR}"; exit 1; }

declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
#
# Globals read:    NNODES, LSB_MCPU_HOSTS
# Globals written: HOSTS (host names), NCORES (slots per host, same index as
#                  HOSTS), NH (total number of slots), LSB_MCPU_HOSTS (set to
#                  the localhost fallback when it was empty)
# Exits:           -1 if the list is malformed or if the total number of
#                  slots differs from NNODES
discover_hosts ()
{
    local host=""
    local hc
    HOSTS=()
    NCORES=()
    NH=0
    [ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="127.0.0.1 ${NNODES:-0}"
    # The list is deliberately left unquoted: it is a whitespace separated
    # sequence of alternating host names and slot counts.
    for hc in $LSB_MCPU_HOSTS
    do
        if [ "x$host" == "x" ]
        then
            host=$hc
            HOSTS+=("$hc")
        else
            [[ $hc =~ ^[0-9]+$ ]] || { echo "(E) Invalid slot count '$hc' for host $host in LSB_MCPU_HOSTS"; exit -1; }
            # 10# forces base 10 so that a count such as "08" is not
            # rejected as an invalid octal number.
            NCORES+=("$((10#$hc))")
            ((NH+=10#$hc))
            host=""
        fi
    done
    # A trailing host name without a slot count would leave NCORES shorter
    # than HOSTS.
    [ "x$host" == "x" ] || { echo "(E) Missing slot count for host $host in LSB_MCPU_HOSTS"; exit -1; }
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit -1; }
}

discover_hosts

# Clean up and abort if something goes wrong.
#
# Globals read: SW_PID, GEM5_EXE, HOSTS, WATCHDOG_PID
# Arguments:    none
# Exits:        always, with status -1
abort_func ()
{
    local bname
    local h
    echo
    echo "KILLED $(date)"
    # Try to kill the switch first. That should trigger an exit for all
    # connected gem5 processes.
    [ "x$SW_PID" != "x" ] && kill "$SW_PID" 2>/dev/null
    sleep 20
    # (try to) kill gem5 processes - just in case something went wrong with
    # the switch triggered exit
    bname=$(basename -- "$GEM5_EXE")
    # Without a process name there is nothing killall could match.
    if [ -n "$bname" ]
    then
        killall -q -s SIGKILL "$bname"
        for h in "${HOSTS[@]}"
        do
            ssh "$h" killall -q -s SIGKILL "$bname"
        done
    fi
    sleep 5
    # kill the watchdog
    [ "x$WATCHDOG_PID" != "x" ] && kill "$WATCHDOG_PID" 2>/dev/null
    exit -1
}

# We need a watchdog to trigger full clean up if a gem5 process dies.
#
# Polls every 30 seconds whether all ssh sessions and the switch process are
# still alive; as soon as one of them is gone it waits for a grace period and
# then calls abort_func. Meant to be started in the background.
#
# Globals read: SSH_PIDS, SW_PID
# Arguments:    none
watchdog_func ()
{
    local ndead
    local p
    while true
    do
        sleep 30
        ndead=0
        for p in "${SSH_PIDS[@]}"
        do
            kill -0 "$p" 2>/dev/null || ((ndead+=1))
        done
        # kill -0 only probes for existence; its complaint about a missing
        # process is expected here, so it is silenced like the probes above.
        kill -0 "$SW_PID" 2>/dev/null || ((ndead+=1))
        if ((ndead>0))
        then
            # we may be in the middle of an orderly termination,
            # give it some time to complete before reporting abort
            sleep 60
            echo -n "(I) (some) gem5 process(es) exited"
            abort_func
        fi
    done
}

# This function launches the gem5 processes. The only purpose is to enable
# launching gem5 processes under gdb control for debugging.
#
# start_func N HOST ENV_ARGS CMD [ARG...]
#   N        : suffix of the log file ($RUN_DIR/log.N)
#   HOST     : host to start the command on via ssh
#   ENV_ARGS : environment assignments placed in front of the remote command
#   CMD ARG  : the gem5 command line
#
# Globals read: GEM5_DEBUG, RUN_DIR
# The command is started in the background, so the caller can pick up its
# PID from $! right after the call.
start_func ()
{
    local N=$1
    local HOST=$2
    local ENV_ARGS=$3
    shift 3
    if [ "x$GEM5_DEBUG" != "x" ]
    then
        echo "DEBUG starting terminal..."
        # xterm -e takes the whole gdb command line as a single string
        local MY_ARGS="$*"
        xterm -e "gdb --args $MY_ARGS" &
    else
        # ssh joins all of its command words with spaces before handing them
        # to the remote shell. The log path is quoted so that a RUN_DIR with
        # spaces does not end up as an ambiguous redirect.
        ssh "$HOST" "$ENV_ARGS" "$@" &> "$RUN_DIR/log.$N" &
    fi
}

# Block till the gem5 process starts.
#
# connected FILE STRING NAME PID
#   FILE   : log file of the process
#   STRING : pattern that shows up in FILE once the process is up
#   NAME   : name used in the progress messages
#   PID    : PID that has to stay alive while we are waiting
#
# Polls FILE every 2 seconds. Exits the script with status -1 if PID
# disappears before STRING is found.
connected ()
{
    local log_file=$1
    local pattern=$2
    local name=$3
    local pid=$4
    echo -n "waiting for $name to start "
    while :
    do
        kill -0 "$pid" 2>/dev/null || { echo "Failed to start $name"; exit -1; }
        # "--" keeps a pattern that starts with a dash from being taken for
        # a grep option
        if [[ -f "$log_file" ]] && grep -q -- "$pattern" "$log_file"
        then
            printf '\nnode #%s started\n' "$name"
            break
        fi
        sleep 2
        echo -n "."
    done
}

# Trigger full clean up in case we are being killed by external signal
trap 'abort_func' INT TERM

# env args to be passed explicitly to gem5 processes started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

# cleanup log files before starting gem5 processes
rm -f -- "$RUN_DIR/log.switch" > /dev/null 2>&1

# make sure that the checkpoint directory of the switch exists
mkdir -p -- "$CKPT_DIR/m5out.switch" > /dev/null 2>&1
# launch switch gem5
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
# M5_ARGS, SW_ARGS and CF_ARGS are deliberately left unquoted: each of them
# holds a whitespace separated list of options.
start_func "switch" "$SW_HOST" "$ENV_ARGS" "$GEM5_EXE"                        \
          -d "$RUN_DIR/m5out.switch"                                          \
          $M5_ARGS                                                            \
          "$SW_CONFIG"                                                        \
          $SW_ARGS                                                            \
          $CF_ARGS                                                            \
          --checkpoint-dir="$CKPT_DIR/m5out.switch"                           \
          --is-switch                                                         \
          --dist-size="$NNODES"                                               \
          --dist-server-port="$SW_PORT"
# must directly follow start_func: $! is the process it put in the background
SW_PID=$!

# block here till switch process starts
connected "$RUN_DIR/log.switch" "tcp_iface listening on port" "switch" "$SW_PID"
LINE=$(grep "tcp_iface listening on port" "$RUN_DIR/log.switch")

IFS=' ' read -ra ADDR <<< "$LINE"
# actual port that switch is listening on may be different
# from what we specified if the port was busy
# NOTE(review): the port is taken from the 6th word of the log line; this
# relies on the wording of the gem5 message - confirm when that changes.
if [[ ${ADDR[5]} =~ ^[0-9]+$ ]]
then
    SW_PORT=${ADDR[5]}
else
    # keep the requested port rather than handing garbage to the nodes
    echo "(W) could not read the switch port from log.switch, using $SW_PORT"
fi

# Now launch all the gem5 processes with ssh.
# HOSTS[i] gets NCORES[i] processes; ranks are handed out consecutively
# starting from 0 and the PID of each ssh session is kept in SSH_PIDS[rank].
echo "START $(date)"
n=0
for ((i=0; i < ${#HOSTS[@]}; i++))
do
    h=${HOSTS[$i]}
    # NCORES[i] is evaluated by the arithmetic context itself, so a missing
    # entry counts as 0 instead of producing a syntax error.
    for ((j=0; j < NCORES[i]; j++))
    do
        # cleanup log files before starting gem5 processes
        rm -f -- "$RUN_DIR/log.$n" > /dev/null 2>&1
        # make sure that the checkpoint directory of this rank exists
        mkdir -p -- "$CKPT_DIR/m5out.$n" > /dev/null 2>&1
        echo "starting gem5 on $h ..."
        # M5_ARGS, FS_ARGS and CF_ARGS are deliberately left unquoted: each
        # of them holds a whitespace separated list of options.
        start_func "$n" "$h" "$ENV_ARGS" "$GEM5_EXE"                          \
                   -d "$RUN_DIR/m5out.$n"                                     \
                   $M5_ARGS                                                   \
                   "$FS_CONFIG"                                               \
                   $FS_ARGS                                                   \
                   $CF_ARGS                                                   \
                   --checkpoint-dir="$CKPT_DIR/m5out.$n"                      \
                   --dist                                                     \
                   --dist-rank="$n"                                           \
                   --dist-size="$NNODES"                                      \
                   --dist-server-name="${HOSTS[0]}"                           \
                   --dist-server-port="$SW_PORT"
        # must directly follow start_func: $! is the session it just started
        SSH_PIDS[$n]=$!
        ((n+=1))
    done
done

# Wait here if it is a debug session
[ "x$GEM5_DEBUG" == "x" ] || {  echo "DEBUG session"; wait "$SW_PID"; exit -1; }

# start watchdog to trigger complete abort (after a grace period) if any
# gem5 process dies
watchdog_func &
WATCHDOG_PID=$!

# wait for exit statuses; NFAIL counts the processes that did not exit
# with status 0
NFAIL=0
for p in "${SSH_PIDS[@]}"
do
    wait "$p" || ((NFAIL+=1))
done
wait "$SW_PID" || ((NFAIL+=1))

# all done, let's terminate the watchdog
kill "$WATCHDOG_PID" 2>/dev/null

if ((NFAIL==0))
then
    echo "EXIT $(date)"
else
    echo "ABORT $(date)"
    # let the caller see the failure in the exit status as well
    exit 1
fi