#!/bin/sh
#
# Copyright 2015, Daniel Axtens, IBM Corporation
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.


# Locate the getscom/putscom tools and record them in GETSCOM/PUTSCOM.
# Executables in the current directory win; otherwise both tools must be
# found on $PATH.  Both are required, so a PATH that provides only one of
# them is treated the same as finding neither.
if [ -x ./getscom ] && [ -x ./putscom ]; then
  GETSCOM=./getscom
  PUTSCOM=./putscom
elif command -v getscom > /dev/null 2>&1 &&
  command -v putscom > /dev/null 2>&1; then
  GETSCOM=$(command -v getscom)
  PUTSCOM=$(command -v putscom)
else
  cat <<EOF
Can't find getscom/putscom in . or \$PATH.
See https://github.com/open-power/skiboot.
The tool is in external/xscom-utils
EOF
  exit 1
fi

# We will get 8 HMI events per injection
# todo: deal with things being offline
expected_hmis=8

# Print the number of lines currently in the kernel log (dmesg) that
# contain the text 'Harmless Hypervisor Maintenance interrupt'.
# Outputs: the count on stdout ("0" when nothing matches).
# Returns: grep's exit status, so non-zero when the count is 0; callers
#          should use the printed number rather than the status.
COUNT_HMIS() {
  dmesg | grep -c 'Harmless Hypervisor Maintenance interrupt'
}

# massively expand snooze delay, allowing injection on all cores
ppc64_cpu --smt-snooze-delay=1000000000

# Put the snooze delay back to 100 once the test is over.
restore_snooze_delay() {
  ppc64_cpu --smt-snooze-delay=100
}

# when we exit, restore it
trap restore_snooze_delay EXIT

# Turn HUP/INT/TERM into a normal exit so the EXIT handler above runs
# exactly once and the script actually stops.  Not every /bin/sh runs the
# EXIT trap when it is killed by a signal, and a handler that only restores
# the delay would let the script carry on injecting afterwards.
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# Succeed if a FIR value printed by getscom is zero.
# Arguments: $1 - the text getscom wrote to stdout
# Any non-empty string made only of '0' characters counts as zero, so a
# zero-padded value is accepted as well as a bare "0".  Empty output
# (e.g. getscom failed and printed nothing on stdout) is not zero.
fir_is_clear() {
  case "$1" in
    '' | *[!0]*) return 1 ;;
    *) return 0 ;;
  esac
}

# Wait until the HMI count has grown by $expected_hmis.
# Arguments: $1 - HMI count taken before the injection
# Globals:   expected_hmis (read), COUNT_HMIS (called)
# Outputs:   one progress line on stdout per poll
# Returns:   0 once enough HMIs were seen, non-zero after 12 polls of 5s.
# The verdict is based on the final count, not on the number of polls, so
# HMIs that show up during the very last sleep still count as success.
wait_for_hmis() {
  hmi_target=$(($1 + expected_hmis))
  hmi_polls=0
  hmi_seen=$(COUNT_HMIS)
  while [ "$hmi_seen" -lt "$hmi_target" ] && [ "$hmi_polls" -lt 12 ]; do
    echo "Seen $((hmi_seen - $1)) HMI(s) out of $expected_hmis expected, sleeping"
    sleep 5
    hmi_polls=$((hmi_polls + 1))
    hmi_seen=$(COUNT_HMIS)
  done
  [ "$hmi_seen" -ge "$hmi_target" ]
}

# for each chip+core combination
# todo - less fragile parsing
# Each matched line looks like "OCC: Chip <chip> Core <core>"; read splits
# it so that the third and fifth words land in chip and core.
# The loop runs in the pipeline's subshell: "exit 1" below ends that
# subshell, and because this pipeline is the script's last command its
# status becomes the script's exit status.
grep -E -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog |
  while read -r _occ_label _chip_label chip _core_label core; do
    fir="0x1${core}013100"

    # verify that Core FIR is zero as expected
    if ! fir_is_clear "$("$GETSCOM" -c "0x${chip}" "$fir")"; then
      echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
      echo "Result of $GETSCOM -c 0x${chip} $fir:"
      "$GETSCOM" -c "0x${chip}" "$fir"
      echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
      echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
      exit 1
    fi

    # keep track of the number of HMIs handled
    old_hmis=$(COUNT_HMIS)

    # do injection, adding a marker to dmesg for clarity
    echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg
    # inject a RegFile recoverable error
    if ! "$PUTSCOM" -c "0x${chip}" "$fir" 2000000000000000 > /dev/null; then
      echo "Error injecting. Aborting!"
      exit 1
    fi

    # now we want to wait for all the HMIs to be processed
    # we expect one per thread on the core
    if ! wait_for_hmis "$old_hmis"; then
      echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting."
      exit 1
    fi
    echo "Processed $expected_hmis events; presumed success. Check dmesg."
    echo ""
  done