import subprocess
import os
import requests
import datetime
import paramiko
import time
import psycopg2

# ==== CONFIG ====
MASTODON_INSTANCE = "https://chatwithus.live"
MASTODON_TOKEN = "rimxBLi-eaJAcwagkmoj6UoW7Lc473tQY0cOM041Euw"
MASTODON_USER_ID = "114386383616633367"
HEALTHCHECK_HTML = "/var/www/html/healthcheck.html"
SUDO_PASSWORD = 'rusty2281'
DISK_WARN_THRESHOLD = 10
INODE_WARN_THRESHOLD = 10
LOG_FILES = ["/var/log/syslog", "/var/log/nginx/error.log"]
LOG_PATTERNS = ["ERROR", "FATAL", "disk full", "out of memory"]
SUPPRESSED_PATTERNS = ["SomeKnownHarmlessMastodonError"]

NODES = [
    {"name": "shredder", "host": "38.102.127.172", "ssh_user": "doc",
     "services": ["minio.service"], "disks": ["/", "/mnt/raid5"], "db": False, "raid": True},
    {"name": "mastodon", "host": "chatwithus.live", "ssh_user": "root",
     "services": ["nginx", "mastodon-web"], "disks": ["/"], "db": False, "raid": False},
    {"name": "db1", "host": "zcluster.technodrome1.sshjunkie.com", "ssh_user": "doc",
     "services": ["postgresql@16-main.service"], "disks": ["/", "/var/lib/postgresql"], "db": True, "raid": True},
    {"name": "db2", "host": "zcluster.technodrome1.sshjunkie.com", "ssh_user": "doc",
     "services": ["postgresql@16-main.service"], "disks": ["/", "/var/lib/postgresql"], "db": True, "raid": True}
]


def mastodon_dm(message, retries=3):
    # Send an alert as a direct-visibility status, retrying a few times on failure.
    url = f"{MASTODON_INSTANCE}/api/v1/statuses"
    headers = {"Authorization": f"Bearer {MASTODON_TOKEN}"}
    payload = {"status": message, "visibility": "direct", "in_reply_to_account_id": MASTODON_USER_ID}
    for attempt in range(retries):
        try:
            resp = requests.post(url, headers=headers, data=payload, timeout=10)
            if resp.status_code == 200:
                return
            print(f"Failed to send Mastodon DM (attempt {attempt+1}): {resp.text}")
        except Exception as e:
            print(f"Exception during Mastodon DM (attempt {attempt+1}): {e}")
        time.sleep(5)


def ssh_command(host, user, cmd):
    # Run a command over SSH; if it produces no output or hits a permission error,
    # retry it through sudo using the configured password.
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(hostname=host, username=user,
                key_filename=os.path.expanduser("~/.ssh/genesis_healthcheck"), timeout=10)
    stdin, stdout, stderr = ssh.exec_command(cmd)
    out = stdout.read().decode().strip()
    err = stderr.read().decode().strip()
    if not out or "permission denied" in err.lower() or "sudo:" in err.lower():
        sudo_cmd = f"echo '{SUDO_PASSWORD}' | sudo -S {cmd}"
        stdin, stdout, stderr = ssh.exec_command(sudo_cmd)
        out = stdout.read().decode().strip()
        err = stderr.read().decode().strip()
    ssh.close()
    return out if out else err


def choose_emoji(line):
    # Pick an emoji for the alert line based on which check produced it.
    if "RAID" in line:
        if "disk" in line.lower():
            return "šŸ“ˆ"
        if "rclone" in line.lower():
            return "🐢"
    if "Service" in line:
        return "🚩"
    if "Replication" in line:
        return "šŸ’„"
    return "āš ļø"


def check_remote_disk(host, user, path, node_name):
    try:
        cmd = f"df --output=pcent {path} | tail -1 | tr -dc '0-9'"
        out = ssh_command(host, user, cmd)
        if not out:
            return f"[{node_name}] ERROR: Disk {path} not found or could not check disk usage."
        percent = int(out)
        if percent > (100 - DISK_WARN_THRESHOLD):
            return f"[{node_name}] WARNING: Only {100 - percent}% disk free on {path}."
    except Exception as e:
        return f"[{node_name}] ERROR: Disk check failed: {e}"
    return None


def check_remote_service(host, user, service, node_name):
    try:
        cmd = f"systemctl is-active {service}"
        out = ssh_command(host, user, cmd)
        if out.strip() != "active":
            return f"[{node_name}] CRITICAL: Service {service} not running!"
    except Exception as e:
        return f"[{node_name}] ERROR: Service check failed: {e}"
    return None


def check_replication(host, node_name):
    try:
        conn = psycopg2.connect(host=host, dbname="postgres", user="postgres", connect_timeout=5)
        cur = conn.cursor()
        cur.execute("SELECT pg_is_in_recovery();")
        is_replica = cur.fetchone()[0]
        if is_replica:
            cur.execute("SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT;")
            lag = cur.fetchone()[0]
            if lag is None:
                return f"[{node_name}] CRITICAL: Standby not streaming! Replication down."
            elif lag >= 60:
                return f"[{node_name}] WARNING: Replication lag is {lag} seconds."
        cur.close()
        conn.close()
    except Exception as e:
        return f"[{node_name}] ERROR: Replication check failed: {e}"
    return None


def check_remote_raid_md0(host, user, node_name):
    alerts = []
    zfs_error = None  # remembered so the mdstat handler below can report both failures

    # ZFS pools: check health and the age of the newest snapshot.
    try:
        pools_output = ssh_command(host, user, "zpool list -H -o name")
        pool_names = pools_output.strip().splitlines()
        for pool in pool_names:
            health_cmd = f"zpool status -x {pool}"
            health_out = ssh_command(host, user, health_cmd)
            if f"pool '{pool}' is healthy" not in health_out.lower():
                alerts.append(f"[{node_name}] WARNING: ZFS pool '{pool}' is not healthy: {health_out.strip()}")
            snap_cmd = f"zfs list -t snapshot -o name,creation -s creation -H -r {pool}"
            snap_out = ssh_command(host, user, snap_cmd)
            if not snap_out.strip():
                alerts.append(f"[{node_name}] WARNING: No snapshots found in ZFS pool '{pool}'")
            else:
                last_snap = snap_out.strip().splitlines()[-1]
                snap_parts = last_snap.split("\t")
                if len(snap_parts) == 2:
                    snap_time_str = snap_parts[1].strip()
                    snap_time = datetime.datetime.strptime(snap_time_str, "%a %b %d %H:%M %Y")
                    delta = datetime.datetime.now() - snap_time
                    if delta.total_seconds() > 86400:
                        alerts.append(f"[{node_name}] WARNING: Last snapshot on pool '{pool}' is older than 24h: {snap_time_str}")
    except Exception as e:
        zfs_error = e
        alerts.append(f"[{node_name}] ERROR: ZFS RAID check failed: {e}")

    # mdadm: parse /proc/mdstat for the md0 status string, e.g. [UU_] means degraded.
    try:
        mdstat = ssh_command(host, user, "cat /proc/mdstat")
        lines = mdstat.splitlines()
        status = None
        inside_md0 = False
        for line in lines:
            if line.startswith("md0"):
                inside_md0 = True
            elif inside_md0:
                if "[" in line and "]" in line:
                    status = line[line.index("["):line.index("]") + 1]
                    break
                if line.strip() == "" or ":" in line:
                    break
        if status is None:
            alerts.append(f"[{node_name}] CRITICAL: /dev/md0 RAID status string not found!")
        elif "_" in status:
            alerts.append(f"[{node_name}] WARNING: /dev/md0 RAID degraded! Status: {status}")
Status: {status}") except Exception as fallback_e: alerts.append(f"[{node_name}] ERROR: RAID check failed (ZFS+mdstat): {e}; {fallback_e}") return "\n".join(alerts) if alerts else None def check_remote_logs(host, user, node_name): alerts = [] for log in LOG_FILES: cmd = f"tail -500 {log}" try: out = ssh_command(host, user, cmd) lines = out.split("\n") for pattern in LOG_PATTERNS: for line in lines: if pattern in line and not any(suppress in line for suppress in SUPPRESSED_PATTERNS): alerts.append(f"[{node_name}] WARNING: Pattern '{pattern}' in {log}") except Exception as e: alerts.append(f"[{node_name}] ERROR: Could not read log {log}: {e}") return alerts def check_postgres_snapshot(): try: result = subprocess.run( ["sudo", "-S", "bash", "-c", "/root/genesis-tools/miscellaneous/bash/dbv2.sh"], input=f"{SUDO_PASSWORD}\n", capture_output=True, text=True ) if result.returncode != 0: mastodon_dm(f"🚨 Snapshot verification failed:\nstdout:\n{result.stdout.strip()}\nstderr:\n{result.stderr.strip()}") except Exception as e: mastodon_dm(f"🚨 Snapshot verification failed to run: {e}") def main(): critical_problems = [] warning_problems = [] node_status = {} for node in NODES: status = "Healthy" raid_info = "All pools healthy" for disk in node["disks"]: disk_res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"]) if disk_res: if "CRITICAL" in disk_res: critical_problems.append(disk_res) status = "Critical" elif "WARNING" in disk_res and status != "Critical": warning_problems.append(disk_res) status = "Warning" for svc in node["services"]: svc_res = check_remote_service(node["host"], node["ssh_user"], svc, node["name"]) if svc_res: if "CRITICAL" in svc_res: critical_problems.append(svc_res) status = "Critical" elif "WARNING" in svc_res and status != "Critical": warning_problems.append(svc_res) status = "Warning" if node.get("db"): rep_res = check_replication(node["host"], node["name"]) if rep_res: if "CRITICAL" in rep_res: critical_problems.append(rep_res) status = "Critical" else: warning_problems.append(rep_res) if status != "Critical": status = "Warning" if node.get("raid", False): raid_res = check_remote_raid_md0(node["host"], node["ssh_user"], node["name"]) if raid_res: if "CRITICAL" in raid_res: critical_problems.append(raid_res) status = "Critical" else: warning_problems.append(raid_res) if status != "Critical": status = "Warning" raid_info = raid_res logs = check_remote_logs(node["host"], node["ssh_user"], node["name"]) for log_alert in logs: warning_problems.append(log_alert) if status != "Critical": status = "Warning" node_status[node["name"]] = (status, raid_info) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") if critical_problems: formatted = "\n".join(f"- {choose_emoji(p)} {p}" for p in critical_problems) msg = f"🚨 Genesis Radio Critical Healthcheck {now} 🚨\n⚔ {len(critical_problems)} critical issues found:\n{formatted}" print(msg) mastodon_dm(msg) if warning_problems: formatted = "\n".join(f"- {choose_emoji(p)} {p}" for p in warning_problems) msg = f"āš ļø Genesis Radio Warning Healthcheck {now} āš ļø\n⚔ {len(warning_problems)} warnings found:\n{formatted}" print(msg) mastodon_dm(msg) if not critical_problems and not warning_problems: msg = f"āœ… Genesis Radio Healthcheck {now}: All systems normal." print(msg) mastodon_dm(msg) with open(HEALTHCHECK_HTML, "w") as f: f.write("Genesis Radio Healthcheck") f.write(f"

Genesis Radio System Health

") f.write(f"

Last Checked: {now}

") f.write("") for node, (status, zfs_info) in node_status.items(): color = 'green' if 'Healthy' in status else ('orange' if 'Warning' in status else 'red') f.write(f"") f.write("
SystemStatusZFS Details
{node}{status}
{zfs_info}
") check_postgres_snapshot() if __name__ == "__main__": main()