#!/bin/bash

# === CONFIG ===
# State directory holding the transient-error cache and the status report.
LOG_PATH="$HOME/.genesis_healthcheck"
# One "<key> <count>" line per check that is currently failing.
ERROR_CACHE="$LOG_PATH/transient_errors.log"
# Human-readable report, rewritten at the start of every run.
STATUS_OUT="$LOG_PATH/status_report.txt"
MAX_RETRIES=3     # SSH attempts per host within a single run
RETRY_INTERVAL=3  # Seconds to sleep between SSH attempts
TRANSIENT_TTL=3   # Number of cycles to tolerate transient failures

# Without the state directory every later redirect would fail, so stop early
# with a clear message instead of producing a stream of redirect errors.
if ! mkdir -p "$LOG_PATH"; then
  echo "healthcheck: cannot create state directory $LOG_PATH" >&2
  exit 1
fi
touch "$ERROR_CACHE"
touch "$STATUS_OUT"

# === FUNCTIONS ===

#######################################
# Append one timestamped line to the status report.
# Globals:   STATUS_OUT (read) - path of the report file
# Arguments: $1 - message text, written verbatim
# Outputs:   appends "[YYYY-MM-DD HH:MM:SS] <message>" to STATUS_OUT
#######################################
log() {
  # printf '%s' writes the message exactly as given, independent of how the
  # shell's echo is configured to treat backslashes.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$STATUS_OUT"
}

#######################################
# Probe a host over SSH, retrying on failure.
# Globals:   MAX_RETRIES (read)    - attempts to make (1 if unset)
#            RETRY_INTERVAL (read) - seconds between attempts (0 if unset)
# Arguments: $1 - host to connect to
# Outputs:   the remote "OK" on stdout when an attempt succeeds
# Returns:   0 as soon as one attempt succeeds, 1 if all attempts fail
#######################################
check_ssh() {
  local host=$1
  local max_attempts=${MAX_RETRIES:-1}
  local attempt

  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    # BatchMode makes ssh fail instead of prompting for a password.
    if ssh -o BatchMode=yes -o ConnectTimeout=3 -o UseDNS=no "$host" 'echo OK' 2>/dev/null; then
      return 0
    fi
    # Only wait when another attempt follows; sleeping after the last
    # failure would just delay the verdict.
    if (( attempt < max_attempts )); then
      sleep "${RETRY_INTERVAL:-0}"
    fi
  done
  return 1
}

#######################################
# Fetch the most recent "error" lines from a log file on a remote host.
# Arguments: $1 - host to connect to
#            $2 - path of the log file on that host
# Outputs:   up to 10 matching lines (case-insensitive) on stdout
# Returns:   exit status of ssh (non-zero when the connection fails)
#######################################
check_logs_for_errors() {
  local host=$1
  local file=$2
  local remote_cmd

  # The command string is re-parsed by the remote shell, so the path must be
  # shell-quoted here; otherwise spaces split it and metacharacters run as
  # commands. %q emits bash-style quoting.
  printf -v remote_cmd "grep -i 'error' -- %q | tail -n 10" "$file"
  ssh -o BatchMode=yes -o ConnectTimeout=3 -o UseDNS=no "$host" "$remote_cmd" 2>/dev/null
}

#######################################
# Bump the consecutive-failure counter for a check, creating it at 1.
# Globals:   ERROR_CACHE (read) - file of "<key> <count>" lines; rewritten
# Arguments: $1 - counter key, e.g. "<host>-ssh"
# Returns:   0 on success, 1 if the cache could not be rewritten
#######################################
increment_error_state() {
  local key=$1
  local tmp

  [[ -e "$ERROR_CACHE" ]] || : > "$ERROR_CACHE"
  tmp=$(mktemp "${ERROR_CACHE}.XXXXXX") || return 1

  # One awk pass replaces the grep/sed/echo chain:
  #  - the key is compared exactly against field 1, so it is never treated as
  #    a regex or matched as a substring of another key;
  #  - an existing entry is rewritten in place, duplicates are collapsed;
  #  - a missing entry is appended with count 1.
  if awk -v key="$key" '
        $1 == key {
          if (!seen) {
            print key, $2 + 1
            seen = 1
          }
          next
        }
        { print }
        END { if (!seen) print key, 1 }
      ' "$ERROR_CACHE" > "$tmp"; then
    mv -- "$tmp" "$ERROR_CACHE"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

#######################################
# Remove the failure counter for a check once it has recovered.
# Globals:   ERROR_CACHE (read) - file of "<key> <count>" lines; rewritten
# Arguments: $1 - counter key, e.g. "<host>-ssh"
# Returns:   0 on success (including "nothing to remove"), 1 on write failure
#######################################
reset_error_state() {
  local key=$1
  local tmp

  [[ -f "$ERROR_CACHE" ]] || return 0
  tmp=$(mktemp "${ERROR_CACHE}.XXXXXX") || return 1

  # awk exits 0 even when it prints nothing, so removing the last remaining
  # entry still replaces the cache (grep -v exits 1 in that case). Comparing
  # field 1 exactly keeps keys that merely share this key as a prefix.
  if awk -v key="$key" '$1 != key' "$ERROR_CACHE" > "$tmp"; then
    mv -- "$tmp" "$ERROR_CACHE"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

#######################################
# Decide whether a check has failed for enough cycles to count as persistent.
# Globals:   ERROR_CACHE (read)   - file of "<key> <count>" lines
#            TRANSIENT_TTL (read) - threshold in cycles (3 if unset)
# Arguments: $1 - counter key, e.g. "<host>-ssh"
# Returns:   0 if the recorded count is >= TRANSIENT_TTL, 1 otherwise
#            (including when the key or the cache file is missing)
#######################################
is_persistent_failure() {
  local key=$1
  local count

  [[ -r "$ERROR_CACHE" ]] || return 1

  # Exact match on field 1 and stop at the first hit, so the result is always
  # a single integer even if another key contains this one as a substring.
  count=$(awk -v key="$key" '
    $1 == key { printf "%d\n", $2; found = 1; exit }
    END { if (!found) print 0 }
  ' "$ERROR_CACHE")

  (( ${count:-0} >= ${TRANSIENT_TTL:-3} ))
}

# === BEGIN ===

# Start a fresh report for this run; log() appends to it from here on.
echo "Genesis Radio Healthcheck - $(date)" > "$STATUS_OUT"
ALL_OK=true
HOSTS=(shredder mastodon db1 db2)

for host in "${HOSTS[@]}"; do
  printf 'Checking %s... ' "$host"

  if ! check_ssh "$host"; then
    # On success the remote "OK" finishes the progress line; on failure
    # nothing has been printed yet, so terminate the line here.
    printf 'FAILED\n'
    increment_error_state "$host-ssh"
    if is_persistent_failure "$host-ssh"; then
      log "❌ [$host] SSH unreachable after $MAX_RETRIES attempts."
      ALL_OK=false
    else
      log "⚠️ [$host] TRANSIENT: SSH unreachable (will retry next cycle)."
    fi
    continue
  fi
  reset_error_state "$host-ssh"

  # Sample: check logs for real errors
  if output=$(check_logs_for_errors "$host" "/var/log/syslog"); then
    if [[ -n "$output" ]]; then
      # A real newline is needed: log() writes its argument verbatim, so a
      # literal "\n" would show up as two characters in the report.
      log "⚠️ [$host] Errors found in syslog:"$'\n'"$output"
      increment_error_state "$host-logs"
      if is_persistent_failure "$host-logs"; then
        log "❌ [$host] Persistent syslog errors."
        ALL_OK=false
      fi
    else
      reset_error_state "$host-logs"
    fi
  else
    # The host answered the probe but the log fetch failed; record it so the
    # host is not silently treated as clean. The counters are left untouched.
    log "⚠️ [$host] Could not read /var/log/syslog over SSH; log check skipped."
  fi
done

# Compare as a string instead of executing the variable's value as a command.
if [[ "$ALL_OK" == true ]]; then
  log "✅ All systems nominal."
else
  log "⚠️ Some systems reporting persistent warnings or failures."
fi

# Optional: Send to Mastodon, Telegram, email, etc.
# cat "$STATUS_OUT"