#!/bin/bash
#
# Genesis Radio healthcheck.
#
# For every host in HOSTS:
#   1. Probe it over SSH (up to MAX_RETRIES attempts).
#   2. If reachable, grep /var/log/syslog on the host for "error"
#      (case-insensitive) and keep the last 10 matching lines.
#
# Each failing check bumps a per-check counter stored in ERROR_CACHE
# (one "<key> <count>" line per check, key = "<host>-ssh" / "<host>-logs").
# A check is only reported as persistent (❌) once its counter reaches
# TRANSIENT_TTL; a passing check removes its counter again.
#
# The report is rewritten from scratch into STATUS_OUT on every run.

set -uo pipefail

# === CONFIG ===
readonly LOG_PATH="$HOME/.genesis_healthcheck"
readonly ERROR_CACHE="$LOG_PATH/transient_errors.log"
readonly STATUS_OUT="$LOG_PATH/status_report.txt"
readonly MAX_RETRIES=3
readonly RETRY_INTERVAL=3
readonly TRANSIENT_TTL=3 # Number of cycles to tolerate transient failures
readonly -a HOSTS=(shredder mastodon db1 db2)

# Options shared by every ssh invocation.
# NOTE(review): the original also passed "-o UseDNS=no". UseDNS is an
# sshd_config (server) keyword and is not listed in ssh_config(5); because
# stderr is discarded, a client rejecting it would make every probe fail
# silently. It is dropped here - confirm against your OpenSSH version.
readonly -a SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=3)

# === FUNCTIONS ===

#######################################
# Append a timestamped line to the status report.
# Globals:   STATUS_OUT (appended to)
# Arguments: $1 - message text (written verbatim, no escape processing)
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$STATUS_OUT"
}

#######################################
# Probe a host over SSH, retrying on failure.
# Globals:   MAX_RETRIES, RETRY_INTERVAL, SSH_OPTS (read)
# Arguments: $1 - host to connect to
# Returns:   0 as soon as one attempt succeeds, 1 if all attempts fail
#######################################
check_ssh() {
  local host=$1
  local attempt
  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    # -n keeps ssh from consuming the script's stdin.
    if ssh -n "${SSH_OPTS[@]}" -- "$host" 'echo OK' > /dev/null 2>&1; then
      return 0
    fi
    # Sleeping after the final attempt would only delay the next host.
    if ((attempt < MAX_RETRIES)); then
      sleep "$RETRY_INTERVAL"
    fi
  done
  return 1
}

#######################################
# Print the last 10 lines containing "error" (any case) from a remote file.
# Globals:   SSH_OPTS (read)
# Arguments: $1 - host, $2 - path of the log file on that host
# Outputs:   matching lines on stdout (possibly none)
# Returns:   exit status of ssh, i.e. of the remote "grep | tail" pipeline
#            (tail's status) or non-zero if the connection failed
#######################################
check_logs_for_errors() {
  local host=$1
  local file=$2
  # The command string is re-parsed by the remote shell, so the path is
  # wrapped in single quotes with embedded single quotes escaped as '\''.
  local sq="'"
  local esc="'\\''"
  local quoted_file="'${file//$sq/$esc}'"
  ssh -n "${SSH_OPTS[@]}" -- "$host" \
    "grep -i -- 'error' ${quoted_file} | tail -n 10" 2> /dev/null
}

#######################################
# Rewrite ERROR_CACHE by filtering it through an awk program.
# The result is written to a temp file next to the cache and moved into
# place only if awk succeeded, so a failure never truncates the cache.
# Globals:   ERROR_CACHE (read, replaced)
# Arguments: $1 - key, exposed to the awk program as variable "key"
#            $2 - awk program text
# Returns:   0 on success, 1 if the temp file or the rewrite failed
#######################################
_rewrite_error_cache() {
  local key=$1
  local program=$2
  local tmp
  tmp=$(mktemp "${ERROR_CACHE}.XXXXXX") || return 1
  if awk -v key="$key" "$program" "$ERROR_CACHE" > "$tmp"; then
    mv -f -- "$tmp" "$ERROR_CACHE"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

#######################################
# Increase the failure counter of a check by one (creating it at 1).
# Keys are compared as exact strings against the first field, so one key
# being a substring of another cannot cause cross-talk. Duplicate lines
# for the same key are collapsed into a single line.
# Globals:   ERROR_CACHE (read, replaced)
# Arguments: $1 - check key, e.g. "db1-ssh"
#######################################
increment_error_state() {
  _rewrite_error_cache "$1" '
    $1 == key {
      if (!found) { print key, $2 + 1; found = 1 }
      next
    }
    { print }
    END { if (!found) print key, 1 }
  '
}

#######################################
# Remove the failure counter of a check.
# Globals:   ERROR_CACHE (read, replaced)
# Arguments: $1 - check key, e.g. "db1-ssh"
#######################################
reset_error_state() {
  # awk exits 0 even when it prints nothing, so removing the only entry
  # in the cache still replaces the file (grep -v would exit 1 there).
  _rewrite_error_cache "$1" '$1 != key'
}

#######################################
# Tell whether a check has failed often enough to count as persistent.
# Globals:   ERROR_CACHE, TRANSIENT_TTL (read)
# Arguments: $1 - check key
# Returns:   0 if the stored counter is >= TRANSIENT_TTL, 1 otherwise
#            (a missing counter counts as 0)
#######################################
is_persistent_failure() {
  local key=$1
  local count
  count=$(awk -v key="$key" \
    '$1 == key { n = $2 } END { print n + 0 }' "$ERROR_CACHE") || return 1
  ((count >= TRANSIENT_TTL))
}

#######################################
# Run all checks and write the status report.
# Globals:   LOG_PATH, ERROR_CACHE, STATUS_OUT, HOSTS, MAX_RETRIES (read)
# Outputs:   one progress line per host on stdout; report in STATUS_OUT
#######################################
main() {
  if ! mkdir -p -- "$LOG_PATH" ||
    ! touch -- "$ERROR_CACHE" "$STATUS_OUT"; then
    printf 'cannot prepare %s\n' "$LOG_PATH" >&2
    exit 1
  fi

  # === BEGIN ===
  # Truncates the previous report; log() appends from here on.
  printf 'Genesis Radio Healthcheck - %s\n' "$(date)" > "$STATUS_OUT"

  local all_ok=true
  local host output
  for host in "${HOSTS[@]}"; do
    printf 'Checking %s... ' "$host"

    if ! check_ssh "$host"; then
      printf 'unreachable\n'
      increment_error_state "$host-ssh"
      if is_persistent_failure "$host-ssh"; then
        log "❌ [$host] SSH unreachable after $MAX_RETRIES attempts."
        all_ok=false
      else
        log "⚠️ [$host] TRANSIENT: SSH unreachable (will retry next cycle)."
      fi
      continue
    fi
    printf 'OK\n'
    reset_error_state "$host-ssh"

    # Sample: check logs for real errors.
    # If the ssh call itself fails, the log check is skipped for this
    # cycle and the "<host>-logs" counter is left untouched.
    if output=$(check_logs_for_errors "$host" "/var/log/syslog"); then
      if [[ -n "$output" ]]; then
        log "⚠️ [$host] Errors found in syslog:"$'\n'"$output"
        increment_error_state "$host-logs"
        if is_persistent_failure "$host-logs"; then
          log "❌ [$host] Persistent syslog errors."
          all_ok=false
        fi
      else
        reset_error_state "$host-logs"
      fi
    fi
  done

  if [[ "$all_ok" == true ]]; then
    log "✅ All systems nominal."
  else
    log "⚠️ Some systems reporting persistent warnings or failures."
  fi

  # Optional: Send to Mastodon, Telegram, email, etc.
  # cat "$STATUS_OUT"
}

main "$@"