gem5-dist.sh (11291:9d2364203316) gem5-dist.sh (11444:219c5fe8fa0e)
1#! /bin/bash
2
3#
4# Copyright (c) 2015 ARM Limited
5# All rights reserved
6#
7# The license below extends only to copyright in the software and shall
8# not be construed as granting a license to any other intellectual
9# property including but not limited to intellectual property relating
10# to a hardware implementation of the functionality of the software
11# licensed hereunder. You may use the software subject to the license
12# terms below provided that you ensure that this notice is replicated
13# unmodified and in its entirety in all distributions of the software,
14# modified or unmodified, in source code or in binary form.
15#
16# Copyright (c) 2015 University of Illinois Urbana Champaign
17# All rights reserved
18#
19# Redistribution and use in source and binary forms, with or without
20# modification, are permitted provided that the following conditions are
21# met: redistributions of source code must retain the above copyright
22# notice, this list of conditions and the following disclaimer;
23# redistributions in binary form must reproduce the above copyright
24# notice, this list of conditions and the following disclaimer in the
25# documentation and/or other materials provided with the distribution;
26# neither the name of the copyright holders nor the names of its
27# contributors may be used to endorse or promote products derived from
28# this software without specific prior written permission.
29#
30# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
31# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
33# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
34# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
36# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
37# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
38# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41#
42# Authors: Gabor Dozsa
43# Mohammad Alian
44
45
46# This is a wrapper script to run a dist gem5 simulation.
47# See the usage_func() below for hints on how to use it. Also,
48# there are some examples in the util/dist directory (e.g.
49# see util/dist/test-2nodes-AArch64.sh)
50#
51#
52# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
53# environment variable (which is what LSF does by default).
54# E.g. LSB_MCPU_HOSTS=\"hname1 2 hname2 4\" means we have altogether 6 slots
55# allocated to launch the gem5 processes, 2 of them are on host hname1
56# and 4 of them are on host hname2.
57# If LSB_MCPU_HOSTS environment variable is not defined then we launch all
58# processes on the localhost.
59#
60# Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
61# boot params. The total number of gem5 processes is also passed in.
62# These values can be used in the boot script to configure the MAC/IP
63# addresses - among other things (see util/dist/bootscript.rcS).
64#
65# Each gem5 process will create an m5out.$GEM5_RANK directory for
66# the usual output files. Furthermore, there will be a separate log file
67# for each ssh session (we use ssh to start gem5 processes) and one for
68# the server. These are called log.$GEM5_RANK and log.switch.
69#
70
71
# Print the command line help text to stdout.
# Globals:   none
# Arguments: none ($0 is used as the script name in the synopsis)
# Outputs:   the synopsis followed by one line per option
# Note: the synopsis lists the short option spellings the parser accepts
#       (-s for the switch config, -f for the fullsystem config).
usage_func ()
{
    echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch] [--sw-args sw_args] [-f fullsystem] [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe "
    echo " -debug : debug mode (start gem5 in gdb)"
    echo " nnodes : number of gem5 processes"
    echo " rundir : run simulation under this path. If not specified, current dir will be used"
    echo " ckptdir : dump/restore checkpoints to/from this path. If not specified, current dir will be used"

    echo " fullsystem: fullsystem config file"
    echo " fs_args : fullsystem config specific argument list: arg1 arg2 ..."
    echo " port : switch listen port"
    echo " switch : switch config file"
    echo " sw_args : switch config specific argument list: arg1 arg2 ..."
    echo " conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..."
    echo " gem5_exe : gem5 executable (full path required)"
    echo " m5_args : common m5 argument list (e.g. debug flags): arg1 arg2 ..."
    echo "Note: if no LSF slots allocation is found all processes are launched on the localhost."
}
91
# Process (optional) command line options.
#
# The pass-through argument lists are accumulated as space separated
# strings; they are word-split again when the gem5 command lines are built.
FS_ARGS=" "
SW_ARGS=" "
CF_ARGS=" "
M5_ARGS=" "

# Abort with the usage text unless an option is followed by a value.
# Arguments: $1 - the option being parsed
#            $2 - number of command line words left (option included)
# Without this check "shift 2" fails on a trailing option, nothing is
# consumed and the parsing loop never terminates.
_require_value ()
{
    (( $2 >= 2 )) && return 0
    echo "Missing value for option: $1"
    usage_func
    exit 1
}

# Parse the command line words given as arguments.
# Globals written: GEM5_DEBUG NNODES RUN_DIR CKPT_DIR SW_PORT SW_CONFIG
#                  FS_CONFIG GEM5_EXE FS_ARGS SW_ARGS CF_ARGS M5_ARGS CUR_ARGS
# Exits (after printing the usage) on unknown or incomplete options.
parse_args ()
{
    # Name of the list that currently collects pass-through options. Reset
    # here so that a value inherited from the environment is never used.
    CUR_ARGS=""
    while (($# > 0))
    do
        case "x$1" in
            x-debug)
                GEM5_DEBUG="-debug"
                shift 1
                ;;
            x-n|x-nodes)
                _require_value "$1" "$#"
                NNODES=$2
                shift 2
                ;;
            x-r|x-rundir)
                _require_value "$1" "$#"
                RUN_DIR=$2
                shift 2
                ;;
            x-c|x-ckptdir)
                _require_value "$1" "$#"
                CKPT_DIR=$2
                shift 2
                ;;
            x-p|x-port)
                _require_value "$1" "$#"
                SW_PORT=$2
                shift 2
                ;;
            x-s|x-switch)
                _require_value "$1" "$#"
                SW_CONFIG=$2
                shift 2
                ;;
            x--sw-args)
                CUR_ARGS="SW_ARGS"
                shift 1
                ;;
            x-f|x-fullsystem)
                _require_value "$1" "$#"
                FS_CONFIG=$2
                shift 2
                ;;
            x--fs-args)
                CUR_ARGS="FS_ARGS"
                shift 1
                ;;
            x--cf-args)
                CUR_ARGS="CF_ARGS"
                shift 1
                ;;
            x--m5-args)
                CUR_ARGS="M5_ARGS"
                shift 1
                ;;
            x-x)
                _require_value "$1" "$#"
                GEM5_EXE=$2
                shift 2
                ;;
            x-*)
                # Any other dash option belongs to the list selected by the
                # last --*-args switch.
                [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit 255; }
                case "x$2" in
                    x-*|x)
                        # flag without a value
                        printf -v "$CUR_ARGS" '%s %s' "${!CUR_ARGS}" "$1"
                        shift 1
                        ;;
                    *)
                        # flag followed by its value
                        printf -v "$CUR_ARGS" '%s %s %s' "${!CUR_ARGS}" "$1" "$2"
                        shift 2
                        ;;
                esac
                ;;
            *)
                echo "Unknown arg: $1"
                usage_func
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Default values to use (in case they are not defined as command line options)
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
1#! /bin/bash
2
3#
4# Copyright (c) 2015 ARM Limited
5# All rights reserved
6#
7# The license below extends only to copyright in the software and shall
8# not be construed as granting a license to any other intellectual
9# property including but not limited to intellectual property relating
10# to a hardware implementation of the functionality of the software
11# licensed hereunder. You may use the software subject to the license
12# terms below provided that you ensure that this notice is replicated
13# unmodified and in its entirety in all distributions of the software,
14# modified or unmodified, in source code or in binary form.
15#
16# Copyright (c) 2015 University of Illinois Urbana Champaign
17# All rights reserved
18#
19# Redistribution and use in source and binary forms, with or without
20# modification, are permitted provided that the following conditions are
21# met: redistributions of source code must retain the above copyright
22# notice, this list of conditions and the following disclaimer;
23# redistributions in binary form must reproduce the above copyright
24# notice, this list of conditions and the following disclaimer in the
25# documentation and/or other materials provided with the distribution;
26# neither the name of the copyright holders nor the names of its
27# contributors may be used to endorse or promote products derived from
28# this software without specific prior written permission.
29#
30# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
31# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
33# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
34# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
36# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
37# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
38# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41#
42# Authors: Gabor Dozsa
43# Mohammad Alian
44
45
46# This is a wrapper script to run a dist gem5 simulation.
47# See the usage_func() below for hints on how to use it. Also,
48# there are some examples in the util/dist directory (e.g.
49# see util/dist/test-2nodes-AArch64.sh)
50#
51#
52# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
53# environment variable (which is what LSF does by default).
54# E.g. LSB_MCPU_HOSTS=\"hname1 2 hname2 4\" means we have altogether 6 slots
55# allocated to launch the gem5 processes, 2 of them are on host hname1
56# and 4 of them are on host hname2.
57# If LSB_MCPU_HOSTS environment variable is not defined then we launch all
58# processes on the localhost.
59#
60# Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
61# boot params. The total number of gem5 processes is also passed in.
62# These values can be used in the boot script to configure the MAC/IP
63# addresses - among other things (see util/dist/bootscript.rcS).
64#
65# Each gem5 process will create an m5out.$GEM5_RANK directory for
66# the usual output files. Furthermore, there will be a separate log file
67# for each ssh session (we use ssh to start gem5 processes) and one for
68# the server. These are called log.$GEM5_RANK and log.switch.
69#
70
71
# Print the command line help text to stdout.
# Globals:   none
# Arguments: none ($0 is used as the script name in the synopsis)
# Outputs:   the synopsis followed by one line per option
# Note: the synopsis lists the short option spellings the parser accepts
#       (-s for the switch config, -f for the fullsystem config).
usage_func ()
{
    echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch] [--sw-args sw_args] [-f fullsystem] [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe "
    echo " -debug : debug mode (start gem5 in gdb)"
    echo " nnodes : number of gem5 processes"
    echo " rundir : run simulation under this path. If not specified, current dir will be used"
    echo " ckptdir : dump/restore checkpoints to/from this path. If not specified, current dir will be used"

    echo " fullsystem: fullsystem config file"
    echo " fs_args : fullsystem config specific argument list: arg1 arg2 ..."
    echo " port : switch listen port"
    echo " switch : switch config file"
    echo " sw_args : switch config specific argument list: arg1 arg2 ..."
    echo " conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..."
    echo " gem5_exe : gem5 executable (full path required)"
    echo " m5_args : common m5 argument list (e.g. debug flags): arg1 arg2 ..."
    echo "Note: if no LSF slots allocation is found all processes are launched on the localhost."
}
91
# Process (optional) command line options.
#
# The pass-through argument lists are accumulated as space separated
# strings; they are word-split again when the gem5 command lines are built.
FS_ARGS=" "
SW_ARGS=" "
CF_ARGS=" "
M5_ARGS=" "

# Abort with the usage text unless an option is followed by a value.
# Arguments: $1 - the option being parsed
#            $2 - number of command line words left (option included)
# Without this check "shift 2" fails on a trailing option, nothing is
# consumed and the parsing loop never terminates.
_require_value ()
{
    (( $2 >= 2 )) && return 0
    echo "Missing value for option: $1"
    usage_func
    exit 1
}

# Parse the command line words given as arguments.
# Globals written: GEM5_DEBUG NNODES RUN_DIR CKPT_DIR SW_PORT SW_CONFIG
#                  FS_CONFIG GEM5_EXE FS_ARGS SW_ARGS CF_ARGS M5_ARGS CUR_ARGS
# Exits (after printing the usage) on unknown or incomplete options.
parse_args ()
{
    # Name of the list that currently collects pass-through options. Reset
    # here so that a value inherited from the environment is never used.
    CUR_ARGS=""
    while (($# > 0))
    do
        case "x$1" in
            x-debug)
                GEM5_DEBUG="-debug"
                shift 1
                ;;
            x-n|x-nodes)
                _require_value "$1" "$#"
                NNODES=$2
                shift 2
                ;;
            x-r|x-rundir)
                _require_value "$1" "$#"
                RUN_DIR=$2
                shift 2
                ;;
            x-c|x-ckptdir)
                _require_value "$1" "$#"
                CKPT_DIR=$2
                shift 2
                ;;
            x-p|x-port)
                _require_value "$1" "$#"
                SW_PORT=$2
                shift 2
                ;;
            x-s|x-switch)
                _require_value "$1" "$#"
                SW_CONFIG=$2
                shift 2
                ;;
            x--sw-args)
                CUR_ARGS="SW_ARGS"
                shift 1
                ;;
            x-f|x-fullsystem)
                _require_value "$1" "$#"
                FS_CONFIG=$2
                shift 2
                ;;
            x--fs-args)
                CUR_ARGS="FS_ARGS"
                shift 1
                ;;
            x--cf-args)
                CUR_ARGS="CF_ARGS"
                shift 1
                ;;
            x--m5-args)
                CUR_ARGS="M5_ARGS"
                shift 1
                ;;
            x-x)
                _require_value "$1" "$#"
                GEM5_EXE=$2
                shift 2
                ;;
            x-*)
                # Any other dash option belongs to the list selected by the
                # last --*-args switch.
                [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit 255; }
                case "x$2" in
                    x-*|x)
                        # flag without a value
                        printf -v "$CUR_ARGS" '%s %s' "${!CUR_ARGS}" "$1"
                        shift 1
                        ;;
                    *)
                        # flag followed by its value
                        printf -v "$CUR_ARGS" '%s %s %s' "${!CUR_ARGS}" "$1" "$2"
                        shift 2
                        ;;
                esac
                ;;
            *)
                echo "Unknown arg: $1"
                usage_func
                exit 1
                ;;
        esac
    done
}

parse_args "$@"
168
# Default values to use (in case they are not defined as command line options)
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
DEFAULT_SW_CONFIG=$M5_PATH/configs/dist/sw.py
DEFAULT_SW_PORT=2200

# Fill in every setting that was left empty on the command line.
: "${FS_CONFIG:=$DEFAULT_FS_CONFIG}"
: "${SW_CONFIG:=$DEFAULT_SW_CONFIG}"
: "${SW_PORT:=$DEFAULT_SW_PORT}"
: "${NNODES:=2}"
: "${RUN_DIR:=$(pwd)}"
: "${CKPT_DIR:=$(pwd)}"

# Check if all the executables we need exist
[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; }
[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; }
[ -x "$GEM5_EXE" ] || { echo "Executable ${GEM5_EXE} not found"; exit 1; }
# make sure that RUN_DIR exists; all log files are written there, so stop
# right away if it cannot be created
mkdir -p -- "$RUN_DIR" > /dev/null 2>&1 || { echo "Cannot create run directory ${RUN_DIR}"; exit 1; }
187
declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
#
# Globals read:    NNODES, LSB_MCPU_HOSTS
# Globals written: HOSTS (host names), NCORES (slots per host, same order),
#                  NH (total number of slots), LSB_MCPU_HOSTS (when empty)
# Exits with status 255 if the slot total differs from NNODES.
collect_host_slots ()
{
    local word
    local cur_host=""
    HOSTS=()
    NCORES=()
    NH=0
    [ -n "$LSB_MCPU_HOSTS" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES"
    # The list is deliberately left unquoted: it is a whitespace separated
    # sequence of alternating host names and slot counts.
    for word in $LSB_MCPU_HOSTS
    do
        if [ -z "$cur_host" ]
        then
            cur_host=$word
            HOSTS+=("$word")
        else
            NCORES+=("$word")
            ((NH+=word))
            cur_host=""
        fi
    done
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit 255; }
}

collect_host_slots
214
# function to clean up and abort if something goes wrong
# Globals read: SW_PID, WATCHDOG_PID, GEM5_EXE, HOSTS
# Outputs:      a "KILLED <date>" line on stdout
# Never returns: always terminates the calling shell with status 255.
abort_func ()
{
    local bname h
    echo
    echo "KILLED $(date)"
    # Try to Kill the server first. That should trigger an exit for all connected
    # gem5 processes.
    [ -n "$SW_PID" ] && kill "$SW_PID" 2>/dev/null
    sleep 20
    # (try to) kill gem5 processes - just in case something went wrong with the
    # server triggered exit
    bname=$(basename -- "$GEM5_EXE")
    killall -q -s SIGKILL "$bname"
    for h in "${HOSTS[@]}"
    do
        ssh "$h" killall -q -s SIGKILL "$bname"
    done
    sleep 5
    # kill the watchdog
    [ -n "$WATCHDOG_PID" ] && kill "$WATCHDOG_PID" 2>/dev/null
    exit 255
}
237
# We need a watchdog to trigger full clean up if a gem5 process dies
# Globals read: SSH_PIDS (PIDs of the ssh sessions), SW_PID (switch PID)
# Polls every 30 seconds; once any of the processes is gone it waits another
# 60 seconds and then calls abort_func. Meant to be run in the background.
watchdog_func ()
{
    local ndead p
    while true
    do
        sleep 30
        ndead=0
        for p in "${SSH_PIDS[@]}"
        do
            kill -0 "$p" 2>/dev/null || ((ndead+=1))
        done
        # probe the switch the same (silent) way as the node processes
        kill -0 "$SW_PID" 2>/dev/null || ((ndead+=1))
        if ((ndead>0))
        then
            # we may be in the middle of an orderly termination,
            # give it some time to complete before reporting abort
            sleep 60
            echo -n "(I) (some) gem5 process(es) exited"
            abort_func
        fi
    done
}
260
# This function launches the gem5 processes. The only purpose is to enable
# launching gem5 processes under gdb control for debugging
# Arguments: $1 - log name suffix (output goes to $RUN_DIR/log.$1)
#            $2 - host to start the process on (via ssh)
#            $3 - environment assignments placed in front of the command
#            $4.. - the command line to run
# Globals read: GEM5_DEBUG, RUN_DIR
# The process is started in the background; the caller reads its PID from $!.
start_func ()
{
    local N=$1
    local HOST=$2
    local ENV_ARGS=$3
    shift 3
    if [ -n "$GEM5_DEBUG" ]
    then
        echo "DEBUG starting terminal..."
        # gdb gets the whole command as one string
        local MY_ARGS="$*"
        xterm -e "gdb --args $MY_ARGS" &
    else
        # ENV_ARGS stays unquoted on purpose: it holds several VAR=value
        # words that must be passed to ssh as separate arguments.
        ssh "$HOST" $ENV_ARGS "$@" &> "$RUN_DIR/log.$N" &
    fi
}
278
# block till the gem5 process starts
# Arguments: $1 - log file to watch
#            $2 - pattern (grep regular expression) that signals the start
#            $3 - name of the process, used in the progress messages
#            $4 - PID of the background job that launched the process
# Polls every 2 seconds and prints a dot per poll. Exits the script with
# status 255 if the launcher job is gone before the pattern shows up.
connected ()
{
    local log_file=$1
    local pattern=$2
    local label=$3
    local launcher_pid=$4
    echo -n "waiting for $label to start "
    while :
    do
        kill -0 "$launcher_pid" || { echo "Failed to start $label"; exit 255; }
        if [[ -f "$log_file" ]] && grep -q -- "$pattern" "$log_file"
        then
            echo -e "\nnode #$label started"
            break
        fi

        sleep 2
        echo -n "."
    done
}
297
# Trigger full clean up in case we are being killed by external signal
trap 'abort_func' INT TERM

# env args to be passed explicitly to gem5 processes started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

#cleanup log files before starting gem5 processes
rm -- "$RUN_DIR/log.switch" > /dev/null 2>&1

# make sure that the checkpoint directory of the switch exists
mkdir -p -- "$CKPT_DIR/m5out.switch" > /dev/null 2>&1
# launch switch gem5
# (M5_ARGS, SW_ARGS and CF_ARGS are space separated option lists and are
# left unquoted so that every option becomes a separate word)
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
start_func "switch" "$SW_HOST" "$ENV_ARGS" "$GEM5_EXE" -d "$RUN_DIR/m5out.switch" \
    $M5_ARGS \
    "$SW_CONFIG" \
    $SW_ARGS \
    $CF_ARGS \
    "--checkpoint-dir=$CKPT_DIR/m5out.switch" \
    --is-switch \
    "--dist-size=$NNODES" \
    "--dist-server-port=$SW_PORT"
SW_PID=$!

# block here till switch process starts
connected "$RUN_DIR/log.switch" "tcp_iface listening on port" "switch" "$SW_PID"
LINE=$(grep "tcp_iface listening on port" "$RUN_DIR/log.switch")

IFS=' ' read -ra ADDR <<< "$LINE"
# actual port that switch is listening on may be different
# from what we specified if the port was busy.
# The port is taken from the 6th word of the log line; keep the requested
# port if the line has fewer words than expected.
[ -n "${ADDR[5]}" ] && SW_PORT=${ADDR[5]}

# Now launch all the gem5 processes with ssh.
echo "START $(date)"
n=0
for ((i=0; i < ${#HOSTS[@]}; i++))
do
    h=${HOSTS[$i]}
    for ((j=0; j < ${NCORES[i]}; j++))
    do
        #cleanup log files before starting gem5 processes
        rm -- "$RUN_DIR/log.$n" > /dev/null 2>&1
        # make sure that the checkpoint directory of this rank exists
        mkdir -p -- "$CKPT_DIR/m5out.$n" > /dev/null 2>&1
        echo "starting gem5 on $h ..."
        start_func "$n" "$h" "$ENV_ARGS" "$GEM5_EXE" -d "$RUN_DIR/m5out.$n" \
            $M5_ARGS \
            "$FS_CONFIG" \
            $FS_ARGS \
            $CF_ARGS \
            "--checkpoint-dir=$CKPT_DIR/m5out.$n" \
            --dist \
            "--dist-rank=$n" \
            "--dist-size=$NNODES" \
            "--dist-server-name=${HOSTS[0]}" \
            "--dist-server-port=$SW_PORT"
        SSH_PIDS[$n]=$!
        ((n+=1))
    done
done

# Wait here if it is a debug session
[ -z "$GEM5_DEBUG" ] || { echo "DEBUG session"; wait "$SW_PID"; exit 255; }

# start watchdog to trigger complete abort (after a grace period) if any
# gem5 process dies
watchdog_func &
WATCHDOG_PID=$!

# wait for exit statuses
((NFAIL=0))
for p in "${SSH_PIDS[@]}"
do
    wait "$p" || ((NFAIL+=1))
done
wait "$SW_PID" || ((NFAIL+=1))

# all done, let's terminate the watchdog
kill "$WATCHDOG_PID" 2>/dev/null

if ((NFAIL==0))
then
    echo "EXIT $(date)"
else
    echo "ABORT $(date)"
fi
DEFAULT_SW_PORT=2200

# Fill in every setting that was left empty on the command line.
: "${FS_CONFIG:=$DEFAULT_FS_CONFIG}"
: "${SW_CONFIG:=$DEFAULT_SW_CONFIG}"
: "${SW_PORT:=$DEFAULT_SW_PORT}"
: "${NNODES:=2}"
: "${RUN_DIR:=$(pwd)}"
: "${CKPT_DIR:=$(pwd)}"

# Check if all the executables we need exist
[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; }
[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; }
[ -x "$GEM5_EXE" ] || { echo "Executable ${GEM5_EXE} not found"; exit 1; }
# make sure that RUN_DIR exists; all log files are written there, so stop
# right away if it cannot be created
mkdir -p -- "$RUN_DIR" > /dev/null 2>&1 || { echo "Cannot create run directory ${RUN_DIR}"; exit 1; }
187
declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
#
# Globals read:    NNODES, LSB_MCPU_HOSTS
# Globals written: HOSTS (host names), NCORES (slots per host, same order),
#                  NH (total number of slots), LSB_MCPU_HOSTS (when empty)
# Exits with status 255 if the slot total differs from NNODES.
collect_host_slots ()
{
    local word
    local cur_host=""
    HOSTS=()
    NCORES=()
    NH=0
    [ -n "$LSB_MCPU_HOSTS" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES"
    # The list is deliberately left unquoted: it is a whitespace separated
    # sequence of alternating host names and slot counts.
    for word in $LSB_MCPU_HOSTS
    do
        if [ -z "$cur_host" ]
        then
            cur_host=$word
            HOSTS+=("$word")
        else
            NCORES+=("$word")
            ((NH+=word))
            cur_host=""
        fi
    done
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit 255; }
}

collect_host_slots
214
# function to clean up and abort if something goes wrong
# Globals read: SW_PID, WATCHDOG_PID, GEM5_EXE, HOSTS
# Outputs:      a "KILLED <date>" line on stdout
# Never returns: always terminates the calling shell with status 255.
abort_func ()
{
    local bname h
    echo
    echo "KILLED $(date)"
    # Try to Kill the server first. That should trigger an exit for all connected
    # gem5 processes.
    [ -n "$SW_PID" ] && kill "$SW_PID" 2>/dev/null
    sleep 20
    # (try to) kill gem5 processes - just in case something went wrong with the
    # server triggered exit
    bname=$(basename -- "$GEM5_EXE")
    killall -q -s SIGKILL "$bname"
    for h in "${HOSTS[@]}"
    do
        ssh "$h" killall -q -s SIGKILL "$bname"
    done
    sleep 5
    # kill the watchdog
    [ -n "$WATCHDOG_PID" ] && kill "$WATCHDOG_PID" 2>/dev/null
    exit 255
}
237
# We need a watchdog to trigger full clean up if a gem5 process dies
# Globals read: SSH_PIDS (PIDs of the ssh sessions), SW_PID (switch PID)
# Polls every 30 seconds; once any of the processes is gone it waits another
# 60 seconds and then calls abort_func. Meant to be run in the background.
watchdog_func ()
{
    local ndead p
    while true
    do
        sleep 30
        ndead=0
        for p in "${SSH_PIDS[@]}"
        do
            kill -0 "$p" 2>/dev/null || ((ndead+=1))
        done
        # probe the switch the same (silent) way as the node processes
        kill -0 "$SW_PID" 2>/dev/null || ((ndead+=1))
        if ((ndead>0))
        then
            # we may be in the middle of an orderly termination,
            # give it some time to complete before reporting abort
            sleep 60
            echo -n "(I) (some) gem5 process(es) exited"
            abort_func
        fi
    done
}
260
# This function launches the gem5 processes. The only purpose is to enable
# launching gem5 processes under gdb control for debugging
# Arguments: $1 - log name suffix (output goes to $RUN_DIR/log.$1)
#            $2 - host to start the process on (via ssh)
#            $3 - environment assignments placed in front of the command
#            $4.. - the command line to run
# Globals read: GEM5_DEBUG, RUN_DIR
# The process is started in the background; the caller reads its PID from $!.
start_func ()
{
    local N=$1
    local HOST=$2
    local ENV_ARGS=$3
    shift 3
    if [ -n "$GEM5_DEBUG" ]
    then
        echo "DEBUG starting terminal..."
        # gdb gets the whole command as one string
        local MY_ARGS="$*"
        xterm -e "gdb --args $MY_ARGS" &
    else
        # ENV_ARGS stays unquoted on purpose: it holds several VAR=value
        # words that must be passed to ssh as separate arguments.
        ssh "$HOST" $ENV_ARGS "$@" &> "$RUN_DIR/log.$N" &
    fi
}
278
# block till the gem5 process starts
# Arguments: $1 - log file to watch
#            $2 - pattern (grep regular expression) that signals the start
#            $3 - name of the process, used in the progress messages
#            $4 - PID of the background job that launched the process
# Polls every 2 seconds and prints a dot per poll. Exits the script with
# status 255 if the launcher job is gone before the pattern shows up.
connected ()
{
    local log_file=$1
    local pattern=$2
    local label=$3
    local launcher_pid=$4
    echo -n "waiting for $label to start "
    while :
    do
        kill -0 "$launcher_pid" || { echo "Failed to start $label"; exit 255; }
        if [[ -f "$log_file" ]] && grep -q -- "$pattern" "$log_file"
        then
            echo -e "\nnode #$label started"
            break
        fi

        sleep 2
        echo -n "."
    done
}
297
# Trigger full clean up in case we are being killed by external signal
trap 'abort_func' INT TERM

# env args to be passed explicitly to gem5 processes started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

#cleanup log files before starting gem5 processes
rm -- "$RUN_DIR/log.switch" > /dev/null 2>&1

# make sure that the checkpoint directory of the switch exists
mkdir -p -- "$CKPT_DIR/m5out.switch" > /dev/null 2>&1
# launch switch gem5
# (M5_ARGS, SW_ARGS and CF_ARGS are space separated option lists and are
# left unquoted so that every option becomes a separate word)
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
start_func "switch" "$SW_HOST" "$ENV_ARGS" "$GEM5_EXE" -d "$RUN_DIR/m5out.switch" \
    $M5_ARGS \
    "$SW_CONFIG" \
    $SW_ARGS \
    $CF_ARGS \
    "--checkpoint-dir=$CKPT_DIR/m5out.switch" \
    --is-switch \
    "--dist-size=$NNODES" \
    "--dist-server-port=$SW_PORT"
SW_PID=$!

# block here till switch process starts
connected "$RUN_DIR/log.switch" "tcp_iface listening on port" "switch" "$SW_PID"
LINE=$(grep "tcp_iface listening on port" "$RUN_DIR/log.switch")

IFS=' ' read -ra ADDR <<< "$LINE"
# actual port that switch is listening on may be different
# from what we specified if the port was busy.
# The port is taken from the 6th word of the log line; keep the requested
# port if the line has fewer words than expected.
[ -n "${ADDR[5]}" ] && SW_PORT=${ADDR[5]}

# Now launch all the gem5 processes with ssh.
echo "START $(date)"
n=0
for ((i=0; i < ${#HOSTS[@]}; i++))
do
    h=${HOSTS[$i]}
    for ((j=0; j < ${NCORES[i]}; j++))
    do
        #cleanup log files before starting gem5 processes
        rm -- "$RUN_DIR/log.$n" > /dev/null 2>&1
        # make sure that the checkpoint directory of this rank exists
        mkdir -p -- "$CKPT_DIR/m5out.$n" > /dev/null 2>&1
        echo "starting gem5 on $h ..."
        start_func "$n" "$h" "$ENV_ARGS" "$GEM5_EXE" -d "$RUN_DIR/m5out.$n" \
            $M5_ARGS \
            "$FS_CONFIG" \
            $FS_ARGS \
            $CF_ARGS \
            "--checkpoint-dir=$CKPT_DIR/m5out.$n" \
            --dist \
            "--dist-rank=$n" \
            "--dist-size=$NNODES" \
            "--dist-server-name=${HOSTS[0]}" \
            "--dist-server-port=$SW_PORT"
        SSH_PIDS[$n]=$!
        ((n+=1))
    done
done

# Wait here if it is a debug session
[ -z "$GEM5_DEBUG" ] || { echo "DEBUG session"; wait "$SW_PID"; exit 255; }

# start watchdog to trigger complete abort (after a grace period) if any
# gem5 process dies
watchdog_func &
WATCHDOG_PID=$!

# wait for exit statuses
((NFAIL=0))
for p in "${SSH_PIDS[@]}"
do
    wait "$p" || ((NFAIL+=1))
done
wait "$SW_PID" || ((NFAIL+=1))

# all done, let's terminate the watchdog
kill "$WATCHDOG_PID" 2>/dev/null

if ((NFAIL==0))
then
    echo "EXIT $(date)"
else
    echo "ABORT $(date)"
fi