| #! /bin/bash |
| |
| # |
| # Copyright (c) 2015 ARM Limited |
| # All rights reserved |
| # |
| # The license below extends only to copyright in the software and shall |
| # not be construed as granting a license to any other intellectual |
| # property including but not limited to intellectual property relating |
| # to a hardware implementation of the functionality of the software |
| # licensed hereunder. You may use the software subject to the license |
| # terms below provided that you ensure that this notice is replicated |
| # unmodified and in its entirety in all distributions of the software, |
| # modified or unmodified, in source code or in binary form. |
| # |
| # Copyright (c) 2015 University of Illinois Urbana Champaign |
| # All rights reserved |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions are |
| # met: redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer; |
| # redistributions in binary form must reproduce the above copyright |
| # notice, this list of conditions and the following disclaimer in the |
| # documentation and/or other materials provided with the distribution; |
| # neither the name of the copyright holders nor the names of its |
| # contributors may be used to endorse or promote products derived from |
| # this software without specific prior written permission. |
| # |
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| |
| # This is a wrapper script to run dist gem5 simulations. |
| # See the usage_func() below for hints on how to use it. Also, |
| # there are some examples in the util/dist directory (e.g. |
| # see util/dist/test-2nodes-AArch64.sh) |
| # |
| # |
| # Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS |
| # environment variable (which is what LSF does by default). |
| # E.g. LSB_MCPU_HOSTS="hname1 2 hname2 4" means we have altogether 6 slots |
| # allocated to launch the gem5 processes, 2 of them are on host hname1 |
| # and 4 of them are on host hname2. |
| # If LSB_MCPU_HOSTS environment variable is not defined then we launch all |
| # processes on the localhost. |
| # |
| # Each gem5 process is passed a unique rank ID [0..N-1] via the kernel |
| # boot params. The total number of gem5 processes is also passed in. |
| # These values can be used in the boot script to configure the MAC/IP |
| # addresses - among other things (see util/dist/bootscript.rcS). |
| # |
| # Each gem5 process will create an m5out.$GEM5_RANK directory for |
| # the usual output files. Furthermore, there will be a separate log file |
| # for each ssh session (we use ssh to start gem5 processes) and one for |
| # the server. These are called log.$GEM5_RANK and log.switch. |
| # |
| |
| |
# Print the command line help to stdout.
#
# Arguments: none ($0 is used as the program name)
# Outputs:   one usage line followed by one line per option
#
# The option names shown here must match what the option parser accepts:
# the switch / fullsystem config files are selected with -s / -f.
usage_func ()
{
    echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch] [--sw-args sw_args] [-f fullsystem] [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe "
    echo " -debug : debug mode (start gem5 in gdb)"
    echo " nnodes : number of gem5 processes"
    echo " rundir : run simulation under this path. If not specified, current dir will be used"
    echo " ckptdir : dump/restore checkpoints to/from this path. If not specified, current dir will be used"
    echo " fullsystem: fullsystem config file"
    echo " fs_args : fullsystem config specific argument list: arg1 arg2 ..."
    echo " port : switch listen port"
    echo " switch : switch config file"
    echo " sw_args : switch config specific argument list: arg1 arg2 ..."
    echo " conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..."
    echo " gem5_exe : gem5 executable (full path required)"
    echo " m5_args : common m5 argument list (e.g. debug flags): arg1 arg2 ..."
    echo "Note: if no LSF slots allocation is found all processes are launched on the localhost."
}
| |
# Process (optional) command line options.
#
# FS_ARGS / SW_ARGS / CF_ARGS / M5_ARGS collect options that are forwarded
# verbatim to the fullsystem config, the switch config, both configs and
# gem5 itself. CUR_ARGS holds the NAME of the list that currently receives
# forwarded options; it is empty until one of the --*-args selectors is seen.
FS_ARGS=" "
SW_ARGS=" "
CF_ARGS=" "
M5_ARGS=" "
CUR_ARGS=""

# Parse the wrapper's command line.
#
# Globals written: GEM5_DEBUG, NNODES, RUN_DIR, CKPT_DIR, SW_PORT, SW_CONFIG,
#                  FS_CONFIG, GEM5_EXE, CUR_ARGS and the four *_ARGS lists.
# Arguments:       the words of the command line ("$@")
# Exits:           1   on an unknown word or an option that lacks its value,
#                  255 on a forwarded option seen before any --*-args selector
#                  (after printing the usage text via usage_func).
parse_args_func ()
{
    while (($# > 0))
    do
        # Options that consume the next word must actually have one:
        # "shift 2" with a single word left fails WITHOUT shifting, which
        # would otherwise leave this loop spinning forever.
        case "x$1" in
            x-n|x-nodes|x-r|x-rundir|x-c|x-ckptdir|x-p|x-port|\
            x-s|x-switch|x-f|x-fullsystem|x-x)
                (($# >= 2)) || {
                    echo "Missing value for option: $1"
                    usage_func
                    exit 1
                }
                ;;
        esac

        case "x$1" in
            x-debug)
                GEM5_DEBUG="-debug"
                shift 1
                ;;
            x-n|x-nodes)
                NNODES=$2
                shift 2
                ;;
            x-r|x-rundir)
                RUN_DIR=$2
                shift 2
                ;;
            x-c|x-ckptdir)
                CKPT_DIR=$2
                shift 2
                ;;
            x-p|x-port)
                SW_PORT=$2
                shift 2
                ;;
            x-s|x-switch)
                SW_CONFIG=$2
                shift 2
                ;;
            x--sw-args)
                CUR_ARGS="SW_ARGS"
                shift 1
                ;;
            x-f|x-fullsystem)
                FS_CONFIG=$2
                shift 2
                ;;
            x--fs-args)
                CUR_ARGS="FS_ARGS"
                shift 1
                ;;
            x--cf-args)
                CUR_ARGS="CF_ARGS"
                shift 1
                ;;
            x--m5-args)
                CUR_ARGS="M5_ARGS"
                shift 1
                ;;
            x-x)
                GEM5_EXE=$2
                shift 2
                ;;
            x-*)
                # Any other option is forwarded to the currently selected list.
                [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit 255; }
                # printf -v appends through the variable name without eval,
                # so '$', backticks and quotes in the option stay literal.
                case "x$2" in
                    x-*|x)
                        # next word is another option (or absent): flag only
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1"
                        shift 1
                        ;;
                    *)
                        # next word is this option's value
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1 $2"
                        shift 2
                        ;;
                esac
                ;;
            *)
                echo "Unknown arg: $1"
                usage_func
                exit 1
                ;;
        esac
    done
}

parse_args_func "$@"
| |
# Default values to use (in case they are not defined as command line options)
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
DEFAULT_SW_CONFIG=$M5_PATH/configs/dist/sw.py
DEFAULT_SW_PORT=2200

# ${VAR:=value} assigns only when VAR is unset or empty
: "${FS_CONFIG:=$DEFAULT_FS_CONFIG}"
: "${SW_CONFIG:=$DEFAULT_SW_CONFIG}"
: "${SW_PORT:=$DEFAULT_SW_PORT}"
: "${NNODES:=2}"
: "${RUN_DIR:=$(pwd)}"
: "${CKPT_DIR:=$(pwd)}"

# Check that the config scripts and the gem5 executable we need exist
[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; }
[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; }
[ -x "$GEM5_EXE" ] || { echo "Executable ${GEM5_EXE} not found"; exit 1; }
# make sure that RUN_DIR exists; every log file is written below it, so
# give up right away if it cannot be created
mkdir -p "$RUN_DIR" > /dev/null 2>&1 || { echo "Cannot create run directory ${RUN_DIR}"; exit 1; }
| |
declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
#
# Globals read:    LSB_MCPU_HOSTS, NNODES
# Globals written: HOSTS (host names), NCORES (slots per host, same index
#                  as HOSTS), NH (total number of slots)
# Exits:           255 if the slot total differs from NNODES
find_hosts_func ()
{
    # without an allocation, put all NNODES processes on the local host
    local alloc=${LSB_MCPU_HOSTS:-"127.0.0.1 $NNODES"}
    local host=""
    local hc

    HOSTS=()
    NCORES=()
    NH=0
    # $alloc is deliberately unquoted: it is a whitespace separated list of
    # alternating host names and slot counts
    for hc in $alloc
    do
        if [ -z "$host" ]
        then
            # first word of a pair: the host name
            host=$hc
            HOSTS+=("$hc")
        else
            # second word of a pair: the slot count of that host
            NCORES+=("$hc")
            ((NH+=hc))
            host=""
        fi
    done
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit 255; }
}

find_hosts_func
| |
# function to clean up and abort if something goes wrong
#
# Globals read: SW_PID, WATCHDOG_PID, GEM5_EXE, HOSTS
# Outputs:      a blank line and "KILLED <date>" on stdout
# Exits:        always, with status 255 (never returns to the caller)
abort_func ()
{
    # keep the scratch variables out of the global scope ("h" is also the
    # host variable of the main launch loop)
    local bname h

    echo
    echo "KILLED $(date)"
    # Try to Kill the server first. That should trigger an exit for all connected
    # gem5 processes.
    [ -n "$SW_PID" ] && kill "$SW_PID" 2>/dev/null
    sleep 20
    # (try to) kill gem5 processes - just in case something went wrong with the
    # server triggered exit
    # NOTE: killall matches by process name, so every process with the same
    # name as the gem5 executable is signalled on each host.
    bname=$(basename -- "$GEM5_EXE")
    killall -q -s SIGKILL "$bname"
    for h in "${HOSTS[@]}"
    do
        ssh "$h" killall -q -s SIGKILL "$bname"
    done
    sleep 5
    # kill the watchdog
    [ -n "$WATCHDOG_PID" ] && kill "$WATCHDOG_PID" 2>/dev/null
    exit 255
}
| |
# Watchdog: poll the launched processes and trigger the full clean up
# (abort_func) as soon as any of them is found dead.
#
# Globals read: SSH_PIDS (one PID per gem5 ssh session), SW_PID (switch)
# Meant to be started in the background; the loop has no exit of its own.
watchdog_func ()
{
    while :
    do
        sleep 30
        NDEAD=0
        # "kill -0" delivers no signal, it only reports whether the PID exists
        for pid in "${SSH_PIDS[@]}"
        do
            if ! kill -0 "$pid" 2>/dev/null
            then
                NDEAD=$((NDEAD + 1))
            fi
        done
        if ! kill -0 $SW_PID
        then
            NDEAD=$((NDEAD + 1))
        fi

        # everything is still alive: go back to sleep
        ((NDEAD > 0)) || continue

        # the simulation may simply be shutting down in an orderly fashion;
        # leave it some time to finish before declaring an abort
        sleep 60
        echo -n "(I) (some) gem5 process(es) exited"
        abort_func
    done
}
| |
# Launch one gem5 process in the background. Having a function for this
# makes it possible to start the process under gdb when debugging.
#
# Arguments: $1   - suffix of the log file ($RUN_DIR/log.$1)
#            $2   - host to ssh to
#            $3   - environment assignments placed in front of the command
#            $4.. - the command line to run
# Globals:   GEM5_DEBUG (non-empty selects the gdb path), RUN_DIR,
#            MY_ARGS (written in debug mode)
# The job is always left running in the background, so the caller can pick
# up its PID from $! right after the call.
start_func ()
{
    local N=$1 HOST=$2 ENV_ARGS=$3
    shift 3

    if [[ -z $GEM5_DEBUG ]]
    then
        # normal run: remote start through ssh, stdout and stderr go to the
        # per-process log file. $ENV_ARGS is left unquoted on purpose so that
        # each VAR=value pair becomes a separate word.
        ssh $HOST $ENV_ARGS "$@" &> $RUN_DIR/log.$N &
        return
    fi

    # debug run: gdb inside a local xterm; host, environment and log file
    # arguments are not used on this path
    echo "DEBUG starting terminal..."
    MY_ARGS="$*"
    xterm -e "gdb --args $MY_ARGS" &
}
| |
# Block until a freshly launched gem5 process reports that it is up.
#
# Arguments: $1 - log file to watch
#            $2 - pattern (grep) whose appearance in the log means "started"
#            $3 - name of the process, used in the progress messages
#            $4 - PID that has to stay alive while we are waiting
# Globals:   FILE and STRING are set to $1 and $2
# Outputs:   progress on stdout (one dot per 2 second poll)
# Exits:     with status 255 if the PID disappears before the pattern shows up
connected ()
{
    FILE=$1
    STRING=$2
    echo -n "waiting for $3 to start "

    # keep polling for as long as the launcher process exists
    while kill -0 "$4"
    do
        if [[ -f $FILE ]] && grep -q "$STRING" "$FILE"
        then
            echo -e "\nnode #$3 started"
            return 0
        fi
        sleep 2
        echo -n "."
    done

    # the process went away without ever printing the start-up message
    echo "Failed to start $3"
    exit 255
}
| |
# Trigger full clean up in case we are being killed by external signal
trap 'abort_func' INT TERM

# env args to be passed explicitly to gem5 processes started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

# cleanup log files before starting gem5 processes
rm -f -- "$RUN_DIR/log.switch" > /dev/null 2>&1

# make sure that CKPT_DIR exists
mkdir -p "$CKPT_DIR/m5out.switch" > /dev/null 2>&1
# launch switch gem5 (on the first allocated host)
# The *_ARGS lists are left unquoted on purpose: each of them holds several
# whitespace separated options.
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
start_func "switch" $SW_HOST "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.switch \
    $M5_ARGS \
    $SW_CONFIG \
    $SW_ARGS \
    $CF_ARGS \
    --checkpoint-dir=$CKPT_DIR/m5out.switch \
    --is-switch \
    --dist-size=$NNODES \
    --dist-server-port=$SW_PORT
SW_PID=$!

# block here till switch process starts
# NOTE(review): in -debug mode start_func does not redirect anything to
# log.switch, so this wait looks like it cannot succeed - confirm.
connected "$RUN_DIR/log.switch" "tcp_iface listening on port" "switch" $SW_PID
LINE=$(grep -m 1 "tcp_iface listening on port" "$RUN_DIR/log.switch")

# actual port that switch is listening on may be different
# from what we specified if the port was busy.
# Take the number that follows the message text rather than a fixed word
# position, so that any prefix in front of the message does not matter.
SW_PORT_RE='listening on port ([0-9]+)'
if [[ $LINE =~ $SW_PORT_RE ]]
then
    SW_PORT=${BASH_REMATCH[1]}
else
    echo "(W) cannot find the switch port in $RUN_DIR/log.switch, using $SW_PORT"
fi
| |
# Start the gem5 node processes through ssh: NCORES[i] processes on
# HOSTS[i], ranks are handed out consecutively starting from 0.
echo "START $(date)"
n=0
i=0
while ((i < ${#HOSTS[@]}))
do
    h=${HOSTS[i]}
    j=0
    while ((j < ${NCORES[i]}))
    do
        # remove the stale log of this rank, if there is one
        rm $RUN_DIR/log.$n > /dev/null 2>&1
        # the checkpoint directory of this rank has to exist
        mkdir -p $CKPT_DIR/m5out.$n > /dev/null 2>&1
        echo "starting gem5 on $h ..."
        # Command line of rank $n. The expansions stay unquoted on purpose:
        # the *_ARGS lists hold several whitespace separated options each.
        node_cmd=($GEM5_EXE -d $RUN_DIR/m5out.$n
                  $M5_ARGS
                  $FS_CONFIG
                  $FS_ARGS
                  $CF_ARGS
                  --checkpoint-dir=$CKPT_DIR/m5out.$n
                  --dist
                  --dist-rank=$n
                  --dist-size=$NNODES
                  --dist-server-name=${HOSTS[0]}
                  --dist-server-port=$SW_PORT)
        start_func $n $h "$ENV_ARGS" "${node_cmd[@]}"
        # start_func leaves the session in the background; remember its PID
        SSH_PIDS[n]=$!
        ((n += 1))
        ((j += 1))
    done
    ((i += 1))
done
| |
# Wait here if it is a debug session: no watchdog and no status collection,
# just block until the switch job ends and leave with status 255
if [ -n "$GEM5_DEBUG" ]
then
    echo "DEBUG session"
    wait "$SW_PID"
    exit 255
fi

# start watchdog to trigger complete abort (after a grace period) if any
# gem5 process dies
watchdog_func &
WATCHDOG_PID=$!

# wait for exit statuses; NFAIL counts the jobs that ended with a non-zero
# status
NFAIL=0
for p in "${SSH_PIDS[@]}"
do
    wait "$p" || ((NFAIL+=1))
done
wait "$SW_PID" || ((NFAIL+=1))

# all done, let's terminate the watchdog
kill "$WATCHDOG_PID" 2>/dev/null

if ((NFAIL==0))
then
    echo "EXIT $(date)"
else
    echo "ABORT $(date)"
    # report the failure through the exit status as well, so that callers
    # do not have to parse the output to notice it
    exit 1
fi