# Genesis Radio multi-node healthcheck: SSH into each node, check disks,
# SMART, services, RAID, PostgreSQL replication, and logs, then DM any
# problems to a Mastodon account.

import datetime
import re

import paramiko
import requests

# ==== CONFIG ====
MASTODON_INSTANCE = "https://chatwithus.live"
MASTODON_TOKEN = ""
MASTODON_USER_ID = ""
DISK_WARN_THRESHOLD = 10   # warn when free disk space drops below this percent
INODE_WARN_THRESHOLD = 10  # warn when free inodes drop below this percent
LOG_FILES = ["/var/log/syslog", "/var/log/nginx/error.log"]
LOG_PATTERNS = ["ERROR", "FATAL", "disk full", "out of memory"]

NODES = [
    {
        "name": "shredder",
        "host": "38.102.127.171",
        "ssh_user": "doc",
        "services": ["minio.service"],
        "disks": ["/", "/mnt/raid5"],
        "type": "remote",
        "db": False,
        "raid": True,
    },
    {
        "name": "mastodon",
        "host": "chatwithus.live",  # Points directly at the Mastodon server
        "ssh_user": "root",
        "services": ["nginx", "mastodon-web"],
        "disks": ["/"],
        "type": "remote",
        "db": False,
        "raid": False,
    },
    {
        "name": "db1",
        "host": "cluster.db1.genesishostingtechnologies.com",
        "ssh_user": "doc",
        "services": ["postgresql@16-main.service"],
        "disks": ["/", "/var/lib/postgresql"],
        "type": "remote",
        "db": True,
        "raid": False,
    },
    {
        "name": "db2",
        "host": "cluster.db2.genesishostingtechnologies.com",
        "ssh_user": "doc",
        "services": ["postgresql@16-main.service"],
        "disks": ["/", "/var/lib/postgresql"],
        "type": "remote",
        "db": True,
        "raid": False,
    },
]


# ==== Mastodon DM function ====
def mastodon_dm(message):
    url = f"{MASTODON_INSTANCE}/api/v1/statuses"
    headers = {"Authorization": f"Bearer {MASTODON_TOKEN}"}
    payload = {
        "status": message,
        "visibility": "direct",
        # Note: Mastodon delivers "direct" statuses to the accounts
        # @-mentioned in the status text, so include the recipient's
        # @handle in `message`; in_reply_to_account_id alone does not
        # address a DM.
        "in_reply_to_account_id": MASTODON_USER_ID,
    }
    resp = requests.post(url, headers=headers, data=payload)
    if resp.status_code != 200:
        print(f"Failed to send Mastodon DM: {resp.text}")


# ==== SSH command runner ====
def ssh_command(host, user, cmd):
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(hostname=host, username=user, timeout=10)
    stdin, stdout, stderr = ssh.exec_command(cmd)
    out = stdout.read().decode().strip()
    ssh.close()
    return out


# ==== Robust remote disk check ====
def check_remote_disk(host, user, path, node_name):
    cmd = f"df --output=pcent {path} | tail -1 | tr -dc '0-9'"
    out = ssh_command(host, user, cmd)
    if not out:
        return f"[{node_name}] ERROR: Disk {path} not found or could not check disk usage."
    try:
        percent = int(out)
    except ValueError:
        return f"[{node_name}] ERROR: Could not parse disk usage for {path}. Output was: '{out}'"
    if percent > 100 - DISK_WARN_THRESHOLD:
        return f"[{node_name}] WARNING: Only {100 - percent}% disk free on {path}."
    # Inode check
    cmd_inode = f"df --output=ipcent {path} | tail -1 | tr -dc '0-9'"
    out_inode = ssh_command(host, user, cmd_inode)
    if not out_inode:
        return f"[{node_name}] ERROR: Disk {path} not found or could not check inode usage."
    try:
        percent_inode = int(out_inode)
    except ValueError:
        return f"[{node_name}] ERROR: Could not parse inode usage for {path}. Output was: '{out_inode}'"
    if percent_inode > 100 - INODE_WARN_THRESHOLD:
        return f"[{node_name}] WARNING: Only {100 - percent_inode}% inodes free on {path}."
    return None


# ==== SMART health check (for all disks) ====
def check_remote_smart(host, user, node_name):
    alerts = []
    # List whole-disk block devices (partitions are skipped)
    cmd_lsblk = "lsblk -ndo NAME,TYPE | awk '$2==\"disk\" {print $1}'"
    devs = ssh_command(host, user, cmd_lsblk)
    if not devs:
        alerts.append(f"[{node_name}] ERROR: Could not list block devices for SMART check.")
        return alerts
    for dev in devs.split():
        smart_cmd = f"sudo smartctl -H /dev/{dev}"
        out = ssh_command(host, user, smart_cmd)
        if "PASSED" in out:
            continue  # All good
        elif "FAILED" in out or "Pre-fail" in out or "SMART support is: Unavailable" in out:
            alerts.append(f"[{node_name}] CRITICAL: SMART health issue on /dev/{dev}!\n{out}")
        elif "Unknown" in out or not out:
            alerts.append(f"[{node_name}] ERROR: SMART status unknown on /dev/{dev}. Output: {out}")
        # Optionally scan for other SMART errors here
    return alerts


# ==== Remote service check ====
def check_remote_service(host, user, service, node_name):
    cmd = f"systemctl is-active {service}"
    out = ssh_command(host, user, cmd)
    if out.strip() != "active":
        return f"[{node_name}] CRITICAL: Service {service} not running!"
    return None


# ==== Remote RAID md0 check (robust for all mdstat layouts) ====
def check_remote_raid_md0(host, user, node_name):
    try:
        mdstat = ssh_command(host, user, "cat /proc/mdstat")
        # Find the md0 block and pull out its [UU...] status string
        status = None
        inside_md0 = False
        for line in mdstat.splitlines():
            if line.startswith("md0"):
                inside_md0 = True
            elif inside_md0:
                m = re.search(r"\[(U|_)+\]", line)
                if m:
                    status = m.group(0)
                    break
                # Stop searching if we hit a blank line or another array
                if line.strip() == "" or ":" in line:
                    break
        if status is None:
            return f"[{node_name}] CRITICAL: /dev/md0 RAID status string not found!"
        if "_" in status:
            return f"[{node_name}] WARNING: /dev/md0 RAID degraded! Status: {status}"
        # All U's means all member disks are up
        return None
    except Exception as e:
        return f"[{node_name}] ERROR: Could not check RAID health remotely: {e}"


# ==== Remote log scan ====
def check_remote_logs(host, user, node_name):
    alerts = []
    for log in LOG_FILES:
        cmd = f"tail -500 {log}"
        try:
            out = ssh_command(host, user, cmd)
            lines = out.split("\n")
            for pattern in LOG_PATTERNS:
                if any(pattern in line for line in lines):
                    alerts.append(f"[{node_name}] WARNING: Pattern '{pattern}' in {log}")
        except Exception as e:
            alerts.append(f"[{node_name}] ERROR: Could not read log {log}: {e}")
    return alerts


# ==== Remote PostgreSQL replication check ====
def check_replication(host, node_name):
    try:
        import psycopg2  # imported lazily so non-DB runs don't require it
        conn = psycopg2.connect(host=host, dbname="postgres", user="postgres", connect_timeout=5)
        try:
            cur = conn.cursor()
            cur.execute("SELECT pg_is_in_recovery();")
            is_replica = cur.fetchone()[0]
            if is_replica:
                cur.execute("SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT;")
                lag = cur.fetchone()[0]
                if lag is None:
                    return f"[{node_name}] CRITICAL: Standby not streaming! Replication down."
                elif lag > 10:
                    return f"[{node_name}] WARNING: Replication lag is {lag} seconds."
            cur.close()
        finally:
            conn.close()  # always close, even on the early WARNING/CRITICAL returns
    except Exception as e:
        return f"[{node_name}] ERROR: Could not check replication: {e}"
    return None


# ==== Main routine ====
def main():
    problems = []
    # Multi-node checks; everything runs over SSH
    for node in NODES:
        # Disk and inode usage
        for disk in node["disks"]:
            res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"])
            if res:
                problems.append(res)
        # SMART check for all disks on this node
        smart_alerts = check_remote_smart(node["host"], node["ssh_user"], node["name"])
        if smart_alerts:
            problems.extend(smart_alerts)
        # Service status
        for svc in node["services"]:
            res = check_remote_service(node["host"], node["ssh_user"], svc, node["name"])
            if res:
                problems.append(res)
        # Replication check, only for nodes with "db": True
        if node.get("db"):
            res = check_replication(node["host"], node["name"])
            if res:
                problems.append(res)
        # RAID check, only for nodes with "raid": True
        if node.get("raid", False):
            raid_health = check_remote_raid_md0(node["host"], node["ssh_user"], node["name"])
            if raid_health:
                problems.append(raid_health)
        # Log scan
        logs = check_remote_logs(node["host"], node["ssh_user"], node["name"])
        if logs:
            problems.extend(logs)

    # Send a DM if anything is wrong
    if problems:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        msg = f"🚨 Genesis Radio Multi-Node Healthcheck {now} 🚨\n" + "\n".join(problems)
        print(msg)
        mastodon_dm(msg)
    else:
        print("OK: All checks passed.")


if __name__ == "__main__":
    main()
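
# A minimal deployment sketch, not part of the original script: the paths,
# schedule, and package names below are assumptions. The script is designed
# to run unattended from cron on a host with key-based SSH access to every
# node, e.g.:
#
#   # hypothetical crontab entry: run every 5 minutes, append output to a log
#   */5 * * * * /usr/bin/python3 /opt/healthcheck/healthcheck.py >> /var/log/healthcheck.log 2>&1
#
# Third-party dependencies (psycopg2 is only needed for the "db" nodes):
#   pip install requests paramiko psycopg2-binary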