
import os
import requests
import datetime
import paramiko
# ==== CONFIG ====
MASTODON_INSTANCE = "https://chatwithus.live"
MASTODON_TOKEN = ""
MASTODON_USER_ID = ""
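# NOTE (assumption, not part of the original script): MASTODON_TOKEN and
# MASTODON_USER_ID must be filled in before the DM alerting works. One option
# is to pull them from the environment instead of hardcoding them, e.g.
# (hypothetical variable names):
#   MASTODON_TOKEN = os.environ.get("HEALTHCHECK_MASTODON_TOKEN", "")
#   MASTODON_USER_ID = os.environ.get("HEALTHCHECK_MASTODON_USER_ID", "")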
DISK_WARN_THRESHOLD = 10 # percent free
INODE_WARN_THRESHOLD = 10 # percent free
LOG_FILES = ["/var/log/syslog", "/var/log/nginx/error.log"]
LOG_PATTERNS = ["ERROR", "FATAL", "disk full", "out of memory"]
NODES = [
    {
        "name": "shredder",
        "host": "38.102.127.171",
        "ssh_user": "doc",
        "services": ["minio.service"],
        "disks": ["/", "/mnt/raid5"],
        "type": "remote",
        "db": False,
        "raid": True
    },
    {
        "name": "mastodon",
        "host": "chatwithus.live",  # Now points directly to your Mastodon server
        "ssh_user": "root",
        "services": ["nginx", "mastodon-web"],
        "disks": ["/"],
        "type": "remote",
        "db": False,
        "raid": False
    },
    {
        "name": "db1",
        "host": "cluster.db1.genesishostingtechnologies.com",
        "ssh_user": "doc",
        "services": ["postgresql@16-main.service"],
        "disks": ["/", "/var/lib/postgresql"],
        "type": "remote",
        "db": True,
        "raid": False
    },
    {
        "name": "db2",
        "host": "cluster.db2.genesishostingtechnologies.com",
        "ssh_user": "doc",
        "services": ["postgresql@16-main.service"],
        "disks": ["/", "/var/lib/postgresql"],
        "type": "remote",
        "db": True,
        "raid": False
    },
]
# ==== Mastodon DM function ====
def mastodon_dm(message):
    url = f"{MASTODON_INSTANCE}/api/v1/statuses"
    headers = {"Authorization": f"Bearer {MASTODON_TOKEN}"}
    payload = {
        "status": message,
        "visibility": "direct",
        "in_reply_to_account_id": MASTODON_USER_ID
    }
    resp = requests.post(url, headers=headers, data=payload)
    if resp.status_code != 200:
        print(f"Failed to send Mastodon DM: {resp.text}")
# ==== SSH command runner ====
def ssh_command(host, user, cmd):
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(hostname=host, username=user, timeout=10)
    stdin, stdout, stderr = ssh.exec_command(cmd)
    out = stdout.read().decode().strip()
    ssh.close()
    return out
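# NOTE (assumption): the SSH helper above relies on non-interactive key-based
# auth (keys available to the user running the script, e.g. via ~/.ssh or an
# ssh-agent). A specific key could be passed explicitly through paramiko's
# key_filename argument, for example (hypothetical path):
#   ssh.connect(hostname=host, username=user,
#               key_filename="/home/doc/.ssh/id_ed25519", timeout=10)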
# ==== Robust remote disk check ====
def check_remote_disk(host, user, path, node_name):
    # df --output=pcent reports percent *used*, so low free space shows up as a high number
    cmd = f"df --output=pcent {path} | tail -1 | tr -dc '0-9'"
    out = ssh_command(host, user, cmd)
    if not out:
        return f"[{node_name}] ERROR: Disk {path} not found or could not check disk usage."
    try:
        percent = int(out)
    except ValueError:
        return f"[{node_name}] ERROR: Could not parse disk usage for {path}. Output was: '{out}'"
    if percent > 100 - DISK_WARN_THRESHOLD:
        return f"[{node_name}] WARNING: Only {100 - percent}% disk free on {path}."
    # Inode check
    cmd_inode = f"df --output=ipcent {path} | tail -1 | tr -dc '0-9'"
    out_inode = ssh_command(host, user, cmd_inode)
    if not out_inode:
        return f"[{node_name}] ERROR: Disk {path} not found or could not check inode usage."
    try:
        percent_inode = int(out_inode)
    except ValueError:
        return f"[{node_name}] ERROR: Could not parse inode usage for {path}. Output was: '{out_inode}'"
    if percent_inode > 100 - INODE_WARN_THRESHOLD:
        return f"[{node_name}] WARNING: Only {100 - percent_inode}% inodes free on {path}."
    return None
# ==== SMART health check (for all disks) ====
def check_remote_smart(host, user, node_name):
    alerts = []
    # List block devices
    cmd_lsblk = "lsblk -ndo NAME,TYPE | awk '$2==\"disk\" {print $1}'"
    devs = ssh_command(host, user, cmd_lsblk)
    if not devs:
        alerts.append(f"[{node_name}] ERROR: Could not list block devices for SMART check.")
        return alerts
    for dev in devs.split():
        smart_cmd = f"sudo smartctl -H /dev/{dev}"
        out = ssh_command(host, user, smart_cmd)
        if "PASSED" in out:
            continue  # All good
        elif "FAILED" in out or "Pre-fail" in out or "SMART support is: Unavailable" in out:
            alerts.append(f"[{node_name}] CRITICAL: SMART health issue on /dev/{dev}!\n{out}")
        elif "Unknown" in out or not out:
            alerts.append(f"[{node_name}] ERROR: SMART status unknown on /dev/{dev}. Output: {out}")
        # Optionally scan for other SMART errors
    return alerts
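# NOTE (assumption): "sudo smartctl" over a non-interactive SSH session only
# succeeds if the remote user can run smartctl without a password prompt,
# e.g. via a sudoers entry along these lines (path may vary by distro):
#   doc ALL=(root) NOPASSWD: /usr/sbin/smartctl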
# ==== Remote service check ====
def check_remote_service(host, user, service, node_name):
    cmd = f"systemctl is-active {service}"
    out = ssh_command(host, user, cmd)
    if out.strip() != "active":
        return f"[{node_name}] CRITICAL: Service {service} not running!"
    return None
# ==== Remote RAID md0 check (robust for all mdstat layouts) ====
def check_remote_raid_md0(host, user, node_name):
    try:
        import re
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(hostname=host, username=user, timeout=10)
        stdin, stdout, stderr = ssh.exec_command("cat /proc/mdstat")
        mdstat = stdout.read().decode()
        # Find the block for md0 and look for the [UU_] status
        lines = mdstat.splitlines()
        status = None
        inside_md0 = False
        for line in lines:
            if line.startswith("md0"):
                inside_md0 = True
            elif inside_md0:
                m = re.search(r"\[(U|_)+\]", line)
                if m:
                    status = m.group(0)
                    break
                # Stop searching if we hit a blank line or another array
                if line.strip() == "" or ":" in line:
                    break
        ssh.close()
        if status is None:
            return f"[{node_name}] CRITICAL: /dev/md0 RAID status string not found!"
        if "_" in status:
            return f"[{node_name}] WARNING: /dev/md0 RAID degraded! Status: {status}"
        # All U's means all disks up
        return None
    except Exception as e:
        return f"[{node_name}] ERROR: Could not check RAID health remotely: {e}"
# ==== Remote log scan ====
def check_remote_logs(host, user, node_name):
    alerts = []
    for log in LOG_FILES:
        cmd = f"tail -500 {log}"
        try:
            out = ssh_command(host, user, cmd)
            lines = out.split("\n")
            for pattern in LOG_PATTERNS:
                if any(pattern in line for line in lines):
                    alerts.append(f"[{node_name}] WARNING: Pattern '{pattern}' in {log}")
        except Exception as e:
            alerts.append(f"[{node_name}] ERROR: Could not read log {log}: {e}")
    return alerts
# ==== Remote PostgreSQL replication check ====
def check_replication(host, node_name):
    try:
        import psycopg2
        conn = psycopg2.connect(host=host, dbname="postgres", user="postgres", connect_timeout=5)
        cur = conn.cursor()
        cur.execute("SELECT pg_is_in_recovery();")
        is_replica = cur.fetchone()[0]
        if is_replica:
            cur.execute("SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT;")
            lag = cur.fetchone()[0]
            if lag is None:
                return f"[{node_name}] CRITICAL: Standby not streaming! Replication down."
            elif lag > 10:
                return f"[{node_name}] WARNING: Replication lag is {lag} seconds."
        cur.close()
        conn.close()
    except Exception as e:
        return f"[{node_name}] ERROR: Could not check replication: {e}"
    return None
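# NOTE (assumption): the replication check needs psycopg2 installed on the
# monitoring host and assumes the "postgres" role can connect from there
# without an interactive password (e.g. a pg_hba.conf trust/host rule or a
# ~/.pgpass entry for these cluster hostnames).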
# ==== Main routine ====
def main():
    problems = []
    # Multi-node checks
    for node in NODES:
        # All checks via SSH
        for disk in node["disks"]:
            res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"])
            if res: problems.append(res)
        # SMART check for all disks on this node
        smart_alerts = check_remote_smart(node["host"], node["ssh_user"], node["name"])
        if smart_alerts:
            problems.extend(smart_alerts)
        for svc in node["services"]:
            res = check_remote_service(node["host"], node["ssh_user"], svc, node["name"])
            if res: problems.append(res)
        # Replication check
        if node.get("db"):
            res = check_replication(node["host"], node["name"])
            if res: problems.append(res)
        # RAID check, only for nodes with "raid": True
        if node.get("raid", False):
            raid_health = check_remote_raid_md0(node["host"], node["ssh_user"], node["name"])
            if raid_health:
                problems.append(raid_health)
        # Log scan
        logs = check_remote_logs(node["host"], node["ssh_user"], node["name"])
        if logs:
            problems.extend(logs)
    # Send DM if anything wrong
    if problems:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        msg = f"🚨 Genesis Radio Multi-Node Healthcheck {now} 🚨\n" + "\n".join(problems)
        print(msg)
        mastodon_dm(msg)
    else:
        print("OK: All checks passed.")

if __name__ == "__main__":
    main()
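# Example usage (assumption, not part of the original script): run the check
# periodically from cron on a monitoring host that has SSH key access to all
# nodes, e.g. every 5 minutes (hypothetical paths):
#   */5 * * * * /usr/bin/python3 /opt/healthcheck/healthcheck.py >> /var/log/healthcheck.log 2>&1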