diff --git a/dbcheck/dbcheck1.py b/dbcheck/dbcheck1.py
deleted file mode 100644
index b76810e..0000000
--- a/dbcheck/dbcheck1.py
+++ /dev/null
@@ -1,251 +0,0 @@
-import os
-import requests
-import datetime
-import paramiko
-
-# ==== CONFIG ====
-MASTODON_INSTANCE = "https://chatwithus.live"
-MASTODON_TOKEN = ""
-MASTODON_USER_ID = ""
-
-DISK_WARN_THRESHOLD = 10 # percent free
-INODE_WARN_THRESHOLD = 10 # percent free
-LOG_FILES = ["/var/log/syslog", "/var/log/nginx/error.log"]
-LOG_PATTERNS = ["ERROR", "FATAL", "disk full", "out of memory"]
-
-NODES = [
-    {
-        "name": "shredder",
-        "host": "38.102.127.171",
-        "ssh_user": "doc",
-        "services": ["minio.service"],
-        "disks": ["/", "/mnt/raid5"],
-        "type": "remote",
-        "db": False,
-        "raid": True
-    },
-    {
-        "name": "mastodon",
-        "host": "chatwithus.live", # Now points directly to your Mastodon server
-        "ssh_user": "root",
-        "services": ["nginx", "mastodon-web"],
-        "disks": ["/"],
-        "type": "remote",
-        "db": False,
-        "raid": False
-    },
-    {
-        "name": "db1",
-        "host": "cluster.db1.genesishostingtechnologies.com",
-        "ssh_user": "doc",
-        "services": ["postgresql@16-main.service"],
-        "disks": ["/", "/var/lib/postgresql"],
-        "type": "remote",
-        "db": True,
-        "raid": False
-    },
-    {
-        "name": "db2",
-        "host": "cluster.db2.genesishostingtechnologies.com",
-        "ssh_user": "doc",
-        "services": ["postgresql@16-main.service"],
-        "disks": ["/", "/var/lib/postgresql"],
-        "type": "remote",
-        "db": True,
-        "raid": False
-    },
-]
-
-# ==== Mastodon DM function ====
-def mastodon_dm(message):
-    url = f"{MASTODON_INSTANCE}/api/v1/statuses"
-    headers = {"Authorization": f"Bearer {MASTODON_TOKEN}"}
-    payload = {
-        "status": message,
-        "visibility": "direct",
-        "in_reply_to_account_id": MASTODON_USER_ID
-    }
-    resp = requests.post(url, headers=headers, data=payload)
-    if resp.status_code != 200:
-        print(f"Failed to send Mastodon DM: {resp.text}")
-
-# ==== SSH command runner ====
-def ssh_command(host, user, cmd):
-    ssh = paramiko.SSHClient()
-    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-    ssh.connect(hostname=host, username=user, timeout=10)
-    stdin, stdout, stderr = ssh.exec_command(cmd)
-    out = stdout.read().decode().strip()
-    ssh.close()
-    return out
-
-# ==== Robust Remote disk check ====
-def check_remote_disk(host, user, path, node_name):
-    cmd = f"df --output=pcent {path} | tail -1 | tr -dc '0-9'"
-    out = ssh_command(host, user, cmd)
-    if not out:
-        return f"[{node_name}] ERROR: Disk {path} not found or could not check disk usage."
-    try:
-        percent = int(out)
-    except ValueError:
-        return f"[{node_name}] ERROR: Could not parse disk usage for {path}. Output was: '{out}'"
-    if percent > 90:
-        return f"[{node_name}] WARNING: Only {100-percent}% disk free on {path}."
-    # Inode check
-    cmd_inode = f"df --output=ipcent {path} | tail -1 | tr -dc '0-9'"
-    out_inode = ssh_command(host, user, cmd_inode)
-    if not out_inode:
-        return f"[{node_name}] ERROR: Disk {path} not found or could not check inode usage."
-    try:
-        percent_inode = int(out_inode)
-    except ValueError:
-        return f"[{node_name}] ERROR: Could not parse inode usage for {path}. Output was: '{out_inode}'"
-    if percent_inode > 90:
-        return f"[{node_name}] WARNING: Only {100-percent_inode}% inodes free on {path}."
-    return None
-
-# ==== SMART health check (for all disks) ====
-def check_remote_smart(host, user, node_name):
-    alerts = []
-    # List block devices
-    cmd_lsblk = "lsblk -ndo NAME,TYPE | awk '$2==\"disk\" {print $1}'"
-    devs = ssh_command(host, user, cmd_lsblk)
-    if not devs:
-        alerts.append(f"[{node_name}] ERROR: Could not list block devices for SMART check.")
-        return alerts
-    for dev in devs.split():
-        smart_cmd = f"sudo smartctl -H /dev/{dev}"
-        out = ssh_command(host, user, smart_cmd)
-        if "PASSED" in out:
-            continue # All good
-        elif "FAILED" in out or "Pre-fail" in out or "SMART support is: Unavailable" in out:
-            alerts.append(f"[{node_name}] CRITICAL: SMART health issue on /dev/{dev}!\n{out}")
-        elif "Unknown" in out or not out:
-            alerts.append(f"[{node_name}] ERROR: SMART status unknown on /dev/{dev}. Output: {out}")
-        # Optionally scan for other SMART errors
-    return alerts
-
-# ==== Remote service check ====
-def check_remote_service(host, user, service, node_name):
-    cmd = f"systemctl is-active {service}"
-    out = ssh_command(host, user, cmd)
-    if out.strip() != "active":
-        return f"[{node_name}] CRITICAL: Service {service} not running!"
-    return None
-
-# ==== Remote RAID md0 check (robust for all mdstat layouts) ====
-def check_remote_raid_md0(host, user, node_name):
-    try:
-        import re
-        ssh = paramiko.SSHClient()
-        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-        ssh.connect(hostname=host, username=user, timeout=10)
-        stdin, stdout, stderr = ssh.exec_command("cat /proc/mdstat")
-        mdstat = stdout.read().decode()
-
-        # Find the block for md0 and look for the [UU_] status
-        lines = mdstat.splitlines()
-        status = None
-        inside_md0 = False
-        for line in lines:
-            if line.startswith("md0"):
-                inside_md0 = True
-            elif inside_md0:
-                m = re.search(r"\[(U|_)+\]", line)
-                if m:
-                    status = m.group(0)
-                    break
-                # Stop searching if we hit a blank line or another array
-                if line.strip() == "" or ":" in line:
-                    break
-
-        ssh.close()
-
-        if status is None:
-            return f"[{node_name}] CRITICAL: /dev/md0 RAID status string not found!"
-        if "_" in status:
-            return f"[{node_name}] WARNING: /dev/md0 RAID degraded! Status: {status}"
-        # All U's means all disks up
-        return None
-
-    except Exception as e:
-        return f"[{node_name}] ERROR: Could not check RAID health remotely: {e}"
-
-# ==== Remote log scan ====
-def check_remote_logs(host, user, node_name):
-    alerts = []
-    for log in LOG_FILES:
-        cmd = f"tail -500 {log}"
-        try:
-            out = ssh_command(host, user, cmd)
-            lines = out.split("\n")
-            for pattern in LOG_PATTERNS:
-                if any(pattern in line for line in lines):
-                    alerts.append(f"[{node_name}] WARNING: Pattern '{pattern}' in {log}")
-        except Exception as e:
-            alerts.append(f"[{node_name}] ERROR: Could not read log {log}: {e}")
-    return alerts
-
-# ==== Remote PostgreSQL replication check ====
-def check_replication(host, node_name):
-    try:
-        import psycopg2
-        conn = psycopg2.connect(host=host, dbname="postgres", user="postgres", connect_timeout=5)
-        cur = conn.cursor()
-        cur.execute("SELECT pg_is_in_recovery();")
-        is_replica = cur.fetchone()[0]
-        if is_replica:
-            cur.execute("SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT;")
-            lag = cur.fetchone()[0]
-            if lag is None:
-                return f"[{node_name}] CRITICAL: Standby not streaming! Replication down."
-            elif lag > 10:
-                return f"[{node_name}] WARNING: Replication lag is {lag} seconds."
-        cur.close()
-        conn.close()
-    except Exception as e:
-        return f"[{node_name}] ERROR: Could not check replication: {e}"
-    return None
-
-# ==== Main routine ====
-def main():
-    problems = []
-
-    # Multi-node checks
-    for node in NODES:
-        # All checks via SSH
-        for disk in node["disks"]:
-            res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"])
-            if res: problems.append(res)
-        # SMART check for all disks on this node
-        smart_alerts = check_remote_smart(node["host"], node["ssh_user"], node["name"])
-        if smart_alerts:
-            problems.extend(smart_alerts)
-        for svc in node["services"]:
-            res = check_remote_service(node["host"], node["ssh_user"], svc, node["name"])
-            if res: problems.append(res)
-        # Replication check
-        if node.get("db"):
-            res = check_replication(node["host"], node["name"])
-            if res: problems.append(res)
-        # RAID check, only for nodes with "raid": True
-        if node.get("raid", False):
-            raid_health = check_remote_raid_md0(node["host"], node["ssh_user"], node["name"])
-            if raid_health:
-                problems.append(raid_health)
-        # Log scan
-        logs = check_remote_logs(node["host"], node["ssh_user"], node["name"])
-        if logs:
-            problems.extend(logs)
-
-    # Send DM if anything wrong
-    if problems:
-        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        msg = f"🚨 Genesis Radio Multi-Node Healthcheck {now} 🚨\n" + "\n".join(problems)
-        print(msg)
-        mastodon_dm(msg)
-    else:
-        print("OK: All checks passed.")
-
-if __name__ == "__main__":
-    main()