gem5-dist.sh revision 11291:9d2364203316
#! /bin/bash

#
# Copyright (c) 2015 ARM Limited
# All rights reserved
#
# The license below extends only to copyright in the software and shall
# not be construed as granting a license to any other intellectual
# property including but not limited to intellectual property relating
# to a hardware implementation of the functionality of the software
# licensed hereunder. You may use the software subject to the license
# terms below provided that you ensure that this notice is replicated
# unmodified and in its entirety in all distributions of the software,
# modified or unmodified, in source code or in binary form.
#
# Copyright (c) 2015 University of Illinois Urbana Champaign
# All rights reserved
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met: redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer;
# redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution;
# neither the name of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Authors: Gabor Dozsa
#          Mohammad Alian


# This is a wrapper script to run dist-gem5 simulations.
# See the usage_func() below for hints on how to use it. Also,
# there are some examples in the util/dist directory (e.g.
# see util/dist/test-2nodes-AArch64.sh)
#
#
# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
# environment variable (which is what LSF does by default).
# E.g. LSB_MCPU_HOSTS="hname1 2 hname2 4" means we have altogether 6 slots
# allocated to launch the gem5 processes, 2 of them are on host hname1
# and 4 of them are on host hname2.
# If the LSB_MCPU_HOSTS environment variable is not defined then we launch
# all processes on the localhost.
#
# Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
# boot params. The total number of gem5 processes is also passed in.
# These values can be used in the boot script to configure the MAC/IP
# addresses - among other things (see util/dist/bootscript.rcS).
#
# Each gem5 process will create an m5out.$GEM5_RANK directory for
# the usual output files. Furthermore, there will be a separate log file
# for each ssh session (we use ssh to start gem5 processes) and one for
# the server. These are called log.$GEM5_RANK and log.switch.
#


# print help
usage_func ()
{
    echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch] [--sw-args sw_args] [-f fullsystem] [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe "
    echo " -debug : debug mode (start gem5 in gdb)"
    echo " nnodes : number of gem5 processes"
    echo " rundir : run simulation under this path. If not specified, current dir will be used"
    echo " ckptdir : dump/restore checkpoints to/from this path. If not specified, current dir will be used"

    echo " fullsystem: fullsystem config file"
    echo " fs_args : fullsystem config specific argument list: arg1 arg2 ..."
    echo " port : switch listen port"
    echo " switch : switch config file"
    echo " sw_args : switch config specific argument list: arg1 arg2 ..."
    echo " conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..."
    echo " gem5_exe : gem5 executable (full path required)"
    echo " m5_args : common m5 argument list (e.g. debug flags): arg1 arg2 ..."
    echo "Note: if no LSF slots allocation is found all proceses are launched on the localhost."
}

# Abort with a usage message unless the option named in $1 is followed by a
# value. $2 is the number of command line words left (the option included).
# Without this check a trailing value-taking option (e.g. "-n" as the last
# word) makes "shift 2" fail without consuming anything, and the option loop
# below would spin forever.
require_value ()
{
    (($2 >= 2)) || { echo "Missing value for option: $1"; usage_func; exit 1; }
}

# Process (optional) command line options
#
# The four *_ARGS variables collect pass-through arguments. After one of the
# --sw-args / --fs-args / --cf-args / --m5-args markers, every following
# "-something [value]" pair that is not an option of this script is appended
# to the list selected by CUR_ARGS.
FS_ARGS=" "
SW_ARGS=" "
CF_ARGS=" "
M5_ARGS=" "
while (($# > 0))
do
    case "x$1" in
        x-debug)
            GEM5_DEBUG="-debug"
            shift 1
            ;;
        x-n|x-nodes)
            require_value "$1" $#
            NNODES=$2
            shift 2
            ;;
        x-r|x-rundir)
            require_value "$1" $#
            RUN_DIR=$2
            shift 2
            ;;
        x-c|x-ckptdir)
            require_value "$1" $#
            CKPT_DIR=$2
            shift 2
            ;;
        x-p|x-port)
            require_value "$1" $#
            SW_PORT=$2
            shift 2
            ;;
        x-s|x-switch)
            require_value "$1" $#
            SW_CONFIG=$2
            shift 2
            ;;
        x--sw-args)
            CUR_ARGS="SW_ARGS"
            shift 1
            ;;
        x-f|x-fullsystem)
            require_value "$1" $#
            FS_CONFIG=$2
            shift 2
            ;;
        x--fs-args)
            CUR_ARGS="FS_ARGS"
            shift 1
            ;;
        x--cf-args)
            CUR_ARGS="CF_ARGS"
            shift 1
            ;;
        x--m5-args)
            CUR_ARGS="M5_ARGS"
            shift 1
            ;;
        x-x)
            require_value "$1" $#
            GEM5_EXE=$2
            shift 2
            ;;
        x-*)
            [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit -1; }
            # Append to the list named by CUR_ARGS. printf -v assigns the
            # text verbatim; the previous eval re-interpreted quotes, '$'
            # and backticks contained in the user supplied words.
            case "x$2" in
                x-*|x)
                    # flag without a value (next word is an option or absent)
                    printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1"
                    shift 1
                    ;;
                *)
                    # flag followed by its value
                    printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1 $2"
                    shift 2
                    ;;
            esac
            ;;
        *)
            echo "Unknown arg: $1"
            usage_func
            exit 1
            ;;
    esac
done

# Default values to use (in case they are not defined as command line options)
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
DEFAULT_SW_CONFIG=$M5_PATH/configs/example/sw.py
DEFAULT_SW_PORT=2200

[ -z "$FS_CONFIG" ] && FS_CONFIG=$DEFAULT_FS_CONFIG
[ -z "$SW_CONFIG" ] && SW_CONFIG=$DEFAULT_SW_CONFIG
[ -z "$SW_PORT" ] && SW_PORT=$DEFAULT_SW_PORT
[ -z "$NNODES" ] && NNODES=2
[ -z "$RUN_DIR" ] && RUN_DIR=$(pwd)
[ -z "$CKPT_DIR" ] && CKPT_DIR=$(pwd)

# Check if all the executables we need exist
[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; }
[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; }
[ -x "$GEM5_EXE" ] || { echo "Executable ${GEM5_EXE} not found"; exit 1; }
# make sure that RUN_DIR exists
mkdir -p "$RUN_DIR" > /dev/null 2>&1

declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Verify that the number of allocated cluster slots (NH) equals the number
# of gem5 processes to launch (NNODES). Prints an error and terminates the
# shell otherwise.
check_slot_count ()
{
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit -1; }
}

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
NH=0
[ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES"
host=""
# The list is deliberately left unquoted: word splitting yields the
# alternating host / slot-count tokens. $host is non-empty exactly while
# the slot count belonging to the last seen host is still pending.
for hc in $LSB_MCPU_HOSTS
do
    if [ "x$host" == "x" ]
    then
        host=$hc
        HOSTS+=("$hc")
    else
        NCORES+=("$hc")
        ((NH+=hc))
        host=""
    fi
done
check_slot_count

# function to clean up and abort if something goes wrong
abort_func ()
{
    local bname h
    echo
    echo "KILLED $(date)"
    # Try to Kill the server first. That should trigger an exit for all connected
    # gem5 processes.
    [ "x$SW_PID" != "x" ] && kill $SW_PID 2>/dev/null
    sleep 20
    # (try to) kill gem5 processes - just in case something went wrong with the
    # server triggered exit
    bname=$(basename "$GEM5_EXE")
    killall -q -s SIGKILL "$bname"
    for h in "${HOSTS[@]}"
    do
        ssh "$h" killall -q -s SIGKILL "$bname"
    done
    sleep 5
    # kill the watchdog
    [ "x$WATCHDOG_PID" != "x" ] && kill $WATCHDOG_PID 2>/dev/null
    exit -1
}

# We need a watchdog to trigger full clean up if a gem5 process dies
watchdog_func ()
{
    local p NDEAD
    while true
    do
        sleep 30
        ((NDEAD=0))
        # kill -0 only probes whether the process still exists
        for p in ${SSH_PIDS[*]}
        do
            kill -0 $p 2>/dev/null || ((NDEAD+=1))
        done
        kill -0 $SW_PID 2>/dev/null || ((NDEAD+=1))
        if ((NDEAD>0))
        then
            # we may be in the middle of an orderly termination,
            # give it some time to complete before reporting abort
            sleep 60
            echo -n "(I) (some) gem5 process(es) exited"
            abort_func
        fi
    done
}

# This function launches the gem5 processes.
# The only purpose is to enable
# launching gem5 processes under gdb control for debugging
#
# Arguments: $1 - log file suffix (output goes to $RUN_DIR/log.$1)
#            $2 - host to start the process on (via ssh)
#            $3 - environment assignments prepended to the remote command
#            $4.. - the gem5 command line
# The process is started in the background, so the caller can pick up its
# PID from $! right after this function returns.
start_func ()
{
    local N=$1
    local HOST=$2
    local ENV_ARGS=$3
    local MY_ARGS
    shift 3
    if [ "x$GEM5_DEBUG" != "x" ]
    then
        echo "DEBUG starting terminal..."
        # debug sessions run locally in an xterm; HOST and ENV_ARGS are unused
        MY_ARGS="$*"
        xterm -e "gdb --args $MY_ARGS" &
    else
        ssh "$HOST" $ENV_ARGS "$@" &> "$RUN_DIR/log.$N" &
    fi
}

# block till the gem5 process starts
#
# Arguments: $1 - log file to watch
#            $2 - string whose appearance in the log signals a started process
#            $3 - process name used in the progress messages
#            $4 - PID that must stay alive while we are waiting
connected ()
{
    local FILE=$1
    local STRING=$2
    echo -n "waiting for $3 to start "
    while : ;
    do
        # give up as soon as the launched process is gone
        kill -0 $4 2>/dev/null || { echo "Failed to start $3"; exit -1; }
        [[ -f "$FILE" ]] && \
            grep -q "$STRING" "$FILE" && \
            echo -e "\nnode #$3 started" && \
            break

        sleep 2
        echo -n "."
    done
}

# Trigger full clean up in case we are being killed by external signal
trap 'abort_func' INT TERM

# env args to be passed explicitly to gem5 processes started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

#cleanup log files before starting gem5 processes
rm "$RUN_DIR/log.switch" > /dev/null 2>&1

# make sure that CKPT_DIR exists
mkdir -p "$CKPT_DIR/m5out.switch" > /dev/null 2>&1
# launch switch gem5
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
# M5_ARGS, SW_ARGS and CF_ARGS are space separated argument lists and are
# left unquoted on purpose so that they split into individual words.
start_func "switch" "$SW_HOST" "$ENV_ARGS" "$GEM5_EXE" -d "$RUN_DIR/m5out.switch" \
    $M5_ARGS \
    "$SW_CONFIG" \
    $SW_ARGS \
    $CF_ARGS \
    --checkpoint-dir="$CKPT_DIR/m5out.switch" \
    --is-switch \
    --dist-size=$NNODES \
    --dist-server-port=$SW_PORT
SW_PID=$!

# block here till switch process starts
connected "$RUN_DIR/log.switch" "tcp_iface listening on port" "switch" $SW_PID
LINE=$(grep "tcp_iface listening on port" "$RUN_DIR/log.switch")

IFS=' ' read -ra ADDR <<< "$LINE"
# actual port that switch is listening on may be different
# from what we specified if the port was busy.
# The port is taken from the sixth word of the log line; keep the requested
# port if that word does not look like a port number instead of handing an
# empty or garbage value to the gem5 nodes.
if [[ "${ADDR[5]}" =~ ^[0-9]+$ ]]
then
    SW_PORT=${ADDR[5]}
fi

# Now launch all the gem5 processes with ssh.
echo "START $(date)"
# n is the rank of the next gem5 process; ranks are handed out host by
# host, NCORES[i] of them on HOSTS[i].
n=0
for ((i=0; i < ${#HOSTS[@]}; i++))
do
    h=${HOSTS[$i]}
    for ((j=0; j < ${NCORES[i]}; j++))
    do
        #cleanup log files before starting gem5 processes
        rm "$RUN_DIR/log.$n" > /dev/null 2>&1
        # make sure that CKPT_DIR exists
        mkdir -p "$CKPT_DIR/m5out.$n" > /dev/null 2>&1
        echo "starting gem5 on $h ..."
        # M5_ARGS, FS_ARGS and CF_ARGS are space separated argument lists
        # and are left unquoted on purpose so that they split into words.
        start_func $n "$h" "$ENV_ARGS" "$GEM5_EXE" -d "$RUN_DIR/m5out.$n" \
            $M5_ARGS \
            "$FS_CONFIG" \
            $FS_ARGS \
            $CF_ARGS \
            --checkpoint-dir="$CKPT_DIR/m5out.$n" \
            --dist \
            --dist-rank=$n \
            --dist-size=$NNODES \
            --dist-server-name="${HOSTS[0]}" \
            --dist-server-port=$SW_PORT
        # start_func left the process in the background; remember its PID
        SSH_PIDS[$n]=$!
        ((n+=1))
    done
done

# Wait here if it is a debug session
[ "x$GEM5_DEBUG" == "x" ] || { echo "DEBUG session"; wait $SW_PID; exit -1; }

# start watchdog to trigger complete abort (after a grace period) if any
# gem5 process dies
watchdog_func &
WATCHDOG_PID=$!

# wait for exit statuses
((NFAIL=0))
for p in ${SSH_PIDS[*]}
do
    wait $p || ((NFAIL+=1))
done
wait $SW_PID || ((NFAIL+=1))

# all done, let's terminate the watchdog
kill $WATCHDOG_PID 2>/dev/null

if ((NFAIL==0))
then
    echo "EXIT $(date)"
else
    echo "ABORT $(date)"
    # report the failure to the caller instead of exiting with the status
    # of the echo above (which is always 0)
    exit 1
fi
