bin-there-done-that/miscellaneous/bash/genesis_healthcheck.sh

96 lines
2.4 KiB
Bash
Executable File

#!/bin/bash
# === CONFIG ===
# Everything the healthcheck persists lives under one directory in $HOME.
LOG_PATH="${HOME}/.genesis_healthcheck"
# Per-check failure counters, one "<key> <count>" row per line.
ERROR_CACHE="${LOG_PATH}/transient_errors.log"
# Human-readable report; rewritten at the start of every run.
STATUS_OUT="${LOG_PATH}/status_report.txt"

# SSH probe policy: attempts per host and seconds to wait between them.
MAX_RETRIES=3
RETRY_INTERVAL=3

# Number of cycles to tolerate transient failures
TRANSIENT_TTL=3

# Create the state directory and both files up front so later reads and
# appends never hit a missing path.
mkdir -p "$LOG_PATH"
touch "$ERROR_CACHE" "$STATUS_OUT"
# === FUNCTIONS ===
# Append one timestamped entry to the status report.
# Globals:   STATUS_OUT (appended to)
# Arguments: $1 - message text, written verbatim (no escape processing)
# Outputs:   nothing on stdout; one "[YYYY-MM-DD HH:MM:SS] message" line
#            is added to the report file
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" >> "$STATUS_OUT"
}
# Probe a host over SSH, retrying a bounded number of times.
# Globals:   MAX_RETRIES (read)    - maximum number of connection attempts
#            RETRY_INTERVAL (read) - seconds to wait between attempts
# Arguments: $1 - host name (or ssh alias) to probe
# Outputs:   the remote "OK" on stdout when an attempt succeeds; ssh's own
#            diagnostics are discarded
# Returns:   0 as soon as one attempt succeeds, 1 if every attempt fails
check_ssh() {
  local host=$1
  local attempt
  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    # BatchMode keeps ssh from prompting for a password/passphrase.
    # Note: UseDNS is an sshd (server) option; the ssh client rejects it as
    # an unknown keyword, so it must not be passed with -o here.
    if ssh -o BatchMode=yes -o ConnectTimeout=3 "$host" 'echo OK' 2>/dev/null; then
      return 0
    fi
    # Only pause when another attempt will follow.
    if ((attempt < MAX_RETRIES)); then
      sleep "$RETRY_INTERVAL"
    fi
  done
  return 1
}
# Fetch the most recent lines mentioning "error" from a log file on a host.
# Arguments: $1 - host name (or ssh alias)
#            $2 - path of the log file on the remote host
# Outputs:   up to the last 10 case-insensitive "error" matches on stdout;
#            ssh diagnostics are discarded
# Returns:   ssh's exit status (the status of the remote pipeline, i.e. of
#            tail, when the connection succeeds)
check_logs_for_errors() {
  local host=$1
  local file=$2
  local quoted_file
  # The command string is re-parsed by the remote shell, so the path has to
  # be shell-quoted here or spaces/globs in it would be split or expanded.
  printf -v quoted_file '%q' "$file"
  # UseDNS is a server-side (sshd) option and is not valid for the client.
  ssh -o BatchMode=yes -o ConnectTimeout=3 "$host" \
    "grep -i 'error' -- ${quoted_file} | tail -n 10" 2>/dev/null
}
# Bump the consecutive-failure counter stored for a check.
# The cache holds one "<key> <count>" row per check; a key that is not yet
# present is added with a count of 1.
# Globals:   ERROR_CACHE (read and rewritten)
# Arguments: $1 - counter key, e.g. "<host>-ssh" (no whitespace/backslashes)
# Returns:   0 on success, non-zero if the cache could not be rewritten
increment_error_state() {
  local key=$1
  local tmp="${ERROR_CACHE}.tmp"
  # Start from an empty cache if the file has gone missing.
  [[ -f "$ERROR_CACHE" ]] || : > "$ERROR_CACHE"
  # Match the key as an exact first field rather than as a regex/prefix, so
  # "web" never touches "web-ssh", and rewrite the row with the real key.
  if awk -v key="$key" '
      $1 == key { print key, $2 + 1; seen = 1; next }
      { print }
      END { if (!seen) print key, 1 }
    ' "$ERROR_CACHE" > "$tmp"; then
    mv -- "$tmp" "$ERROR_CACHE"
  else
    rm -f -- "$tmp"
    return 1
  fi
}
# Forget the failure counter for a check once it passes again.
# Globals:   ERROR_CACHE (read and rewritten)
# Arguments: $1 - counter key to remove, e.g. "<host>-ssh"
# Returns:   0 on success (including when there was nothing to remove),
#            non-zero if the cache could not be rewritten
reset_error_state() {
  local key=$1
  local tmp="${ERROR_CACHE}.tmp"
  [[ -f "$ERROR_CACHE" ]] || return 0
  # awk exits 0 even when it prints nothing, so removing the last remaining
  # row still replaces the cache (grep -v would exit 1 and skip the mv).
  # The key is compared as an exact first field, not as a regex prefix.
  if awk -v key="$key" '$1 != key' "$ERROR_CACHE" > "$tmp"; then
    mv -- "$tmp" "$ERROR_CACHE"
  else
    rm -f -- "$tmp"
    return 1
  fi
}
# Decide whether a check has failed for enough consecutive cycles to count
# as a real outage instead of a transient blip.
# Globals:   ERROR_CACHE (read), TRANSIENT_TTL (read)
# Arguments: $1 - counter key, e.g. "<host>-ssh"
# Returns:   0 if the stored count is >= TRANSIENT_TTL, 1 otherwise
#            (a missing key or missing cache file counts as 0 failures)
is_persistent_failure() {
  local key=$1
  local count
  [[ -f "$ERROR_CACHE" ]] || return 1
  # Exact first-field match guarantees a single numeric answer; a substring
  # grep could return several rows and break the comparison below.
  count=$(awk -v key="$key" '$1 == key { n = $2 } END { print n + 0 }' "$ERROR_CACHE") || return 1
  ((count >= TRANSIENT_TTL))
}
# === BEGIN ===
# Start a fresh report for this cycle (truncates the previous one).
echo "Genesis Radio Healthcheck - $(date)" > "$STATUS_OUT"
ALL_OK=true
HOSTS=(shredder mastodon db1 db2)
for host in "${HOSTS[@]}"; do
  echo -n "Checking $host... "
  if ! check_ssh "$host"; then
    # A failed probe prints nothing, so terminate the progress line here;
    # otherwise the next "Checking ..." would run onto the same line.
    echo "unreachable"
    increment_error_state "$host-ssh"
    if is_persistent_failure "$host-ssh"; then
      log "❌ [$host] SSH unreachable after $MAX_RETRIES attempts."
      ALL_OK=false
    else
      log "⚠️ [$host] TRANSIENT: SSH unreachable (will retry next cycle)."
    fi
    # Nothing else can be checked on a host we cannot reach.
    continue
  fi
  reset_error_state "$host-ssh"

  # Sample: check logs for real errors
  if output=$(check_logs_for_errors "$host" "/var/log/syslog"); then
    if [[ -n "$output" ]]; then
      # Use a real newline: echo does not expand "\n" inside the message.
      log "⚠️ [$host] Errors found in syslog:"$'\n'"$output"
      increment_error_state "$host-logs"
      if is_persistent_failure "$host-logs"; then
        log "❌ [$host] Persistent syslog errors."
        ALL_OK=false
      fi
    else
      reset_error_state "$host-logs"
    fi
  fi
done

# Compare the flag as a string instead of executing its value as a command.
if [[ "$ALL_OK" == true ]]; then
  log "✅ All systems nominal."
else
  log "⚠️ Some systems reporting persistent warnings or failures."
fi
# Optional: Send to Mastodon, Telegram, email, etc.
# cat "$STATUS_OUT"