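# Multi-node health check.
#
# For every entry in NODES this script runs disk/inode, SMART, service,
# RAID and log checks over SSH (plus a PostgreSQL replication check for
# database nodes) and posts the collected problems as a Mastodon status
# with "direct" visibility.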
import datetime
import os
import re
import shlex

import paramiko
import requests
|
|
|
|
# ==== CONFIG ====
|
|
# Base URL of the Mastodon instance that receives the alert status.
MASTODON_INSTANCE = "https://chatwithus.live"

# SECURITY: this access token is committed in source. Supply it through the
# MASTODON_TOKEN environment variable instead; the literal below is kept only
# as a backward-compatible fallback and should be rotated and then removed.
MASTODON_TOKEN = os.environ.get(
    "MASTODON_TOKEN",
    "rimxBLi-eaJAcwagkmoj6UoW7Lc473tQY0cOM041Euw",
)

# Account id sent along with the alert status.
MASTODON_USER_ID = "114386383616633367"

DISK_WARN_THRESHOLD = 10  # warning threshold, in percent free
INODE_WARN_THRESHOLD = 10  # warning threshold, in percent free

# Log files that are tailed on every node, and the substrings searched for.
LOG_FILES = ["/var/log/syslog", "/var/log/nginx/error.log"]
LOG_PATTERNS = ["ERROR", "FATAL", "disk full", "out of memory"]
|
|
|
|
# Hosts to check. Keys of each entry:
#   name      label used as the "[name]" prefix of every alert
#   host      address used for SSH (and for the PostgreSQL connection)
#   ssh_user  account used for SSH
#   services  systemd units that must report "active"
#   disks     mount points whose space and inode usage are checked
#   type      "remote" for every node listed here
#   db        True to also run the replication check
#   raid      True to also run the /dev/md0 check
NODES = [
    {
        "name": "shredder",
        "host": "38.102.127.171",
        "ssh_user": "doc",
        "services": [],
        "disks": ["/", "/mnt/raid5"],
        "type": "remote",
        "db": False,
        "raid": True,
    },
    {
        "name": "mastodon",
        "host": "chatwithus.live",  # points directly at the Mastodon server
        "ssh_user": "root",
        "services": ["nginx", "mastodon-web"],
        "disks": ["/"],
        "type": "remote",
        "db": False,
        "raid": False,
    },
    {
        "name": "db1",
        "host": "cluster.db1.genesishostingtechnologies.com",
        "ssh_user": "doc",
        "services": ["postgresql@16-main.service"],
        "disks": ["/", "/var/lib/postgresql"],
        "type": "remote",
        "db": True,
        "raid": False,
    },
    {
        "name": "db2",
        "host": "cluster.db2.genesishostingtechnologies.com",
        "ssh_user": "doc",
        "services": ["postgresql@16-postgresqlreplica.service"],
        "disks": ["/", "/var/lib/postgresql"],
        "type": "remote",
        "db": True,
        "raid": False,
    },
]
|
|
|
|
# ==== Mastodon DM function ====
|
|
def mastodon_dm(message):
    """Post ``message`` as a direct-visibility status on the Mastodon instance.

    Delivery problems are printed instead of raised, so a notification
    failure cannot crash the health-check run that produced the message.

    Args:
        message: Text of the status to post.

    Returns:
        None.
    """
    url = f"{MASTODON_INSTANCE}/api/v1/statuses"
    headers = {"Authorization": f"Bearer {MASTODON_TOKEN}"}
    payload = {
        "status": message,
        "visibility": "direct",
        # NOTE(review): `in_reply_to_account_id` does not appear among the
        # documented form parameters of POST /api/v1/statuses; direct statuses
        # are normally addressed by @-mentioning the recipient in the text.
        # Confirm this actually reaches the intended account.
        "in_reply_to_account_id": MASTODON_USER_ID,
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        resp = requests.post(url, headers=headers, data=payload, timeout=15)
    except requests.RequestException as exc:
        print(f"Failed to send Mastodon DM: {exc}")
        return
    if resp.status_code != 200:
        print(f"Failed to send Mastodon DM: {resp.text}")
|
|
|
|
# ==== SSH command runner ====
|
|
def ssh_command(host, user, cmd):
    """Run ``cmd`` on ``host`` over SSH as ``user`` and return its stdout.

    Only stdout is returned; stderr and the exit status are ignored.

    Args:
        host: Hostname or address to connect to.
        user: SSH login name.
        cmd: Shell command line to execute remotely.

    Returns:
        The command's standard output, decoded and stripped of surrounding
        whitespace.

    Raises:
        Exception: Whatever paramiko raises when connecting or executing
            fails; the error is propagated to the caller.
    """
    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; consider loading known hosts and rejecting unknown keys.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(hostname=host, username=user, timeout=10)
        _stdin, stdout, _stderr = ssh.exec_command(cmd)
        # Log tails may contain bytes that are not valid UTF-8; replace them
        # rather than failing the whole command.
        return stdout.read().decode(errors="replace").strip()
    finally:
        # Always release the connection, including when connect/exec raised.
        ssh.close()
|
|
|
|
# ==== Robust Remote disk check ====
|
|
def check_remote_disk(host, user, path, node_name):
    """Check free space and free inodes of ``path`` on a remote node.

    Space is checked first; the first problem found is returned, so an inode
    problem is only reported when the space check passed.

    Args:
        host: Hostname or address of the node.
        user: SSH login name.
        path: Mount point (or any path on the filesystem) to inspect.
        node_name: Label used as the ``[name]`` prefix of the alert.

    Returns:
        An alert string when usage could not be determined or when the free
        percentage is below ``DISK_WARN_THRESHOLD`` (space) or
        ``INODE_WARN_THRESHOLD`` (inodes); otherwise ``None``.
    """
    quoted_path = shlex.quote(path)
    # (df output column, noun used in error text, noun used in warning text,
    #  minimum percent free)
    checks = (
        ("pcent", "disk", "disk", DISK_WARN_THRESHOLD),
        ("ipcent", "inode", "inodes", INODE_WARN_THRESHOLD),
    )
    for column, usage_noun, free_noun, min_free in checks:
        # `tr -dc '0-9'` strips everything but the digits of the percentage.
        cmd = f"df --output={column} {quoted_path} | tail -1 | tr -dc '0-9'"
        try:
            out = ssh_command(host, user, cmd)
        except Exception as exc:
            # An unreachable node must become an alert, not abort the run.
            return (
                f"[{node_name}] ERROR: Could not check {usage_noun} usage "
                f"for {path}: {exc}"
            )
        if not out:
            return (
                f"[{node_name}] ERROR: Disk {path} not found or could not "
                f"check {usage_noun} usage."
            )
        try:
            used = int(out)
        except ValueError:
            return (
                f"[{node_name}] ERROR: Could not parse {usage_noun} usage "
                f"for {path}. Output was: '{out}'"
            )
        free = 100 - used
        if free < min_free:
            return f"[{node_name}] WARNING: Only {free}% {free_noun} free on {path}."
    return None
|
|
|
|
# ==== SMART health check (for all disks) ====
|
|
def check_remote_smart(host, user, node_name):
    """Run ``smartctl -H`` on every block device of type ``disk`` on a node.

    Args:
        host: Hostname or address of the node.
        user: SSH login name; ``smartctl`` is invoked through ``sudo``.
        node_name: Label used as the ``[name]`` prefix of each alert.

    Returns:
        A list of alert strings, empty when nothing was flagged.
    """
    cmd_lsblk = "lsblk -ndo NAME,TYPE | awk '$2==\"disk\" {print $1}'"
    try:
        devs = ssh_command(host, user, cmd_lsblk)
    except Exception as exc:
        # An unreachable node must become an alert, not abort the run.
        return [
            f"[{node_name}] ERROR: Could not list block devices for SMART check: {exc}"
        ]
    if not devs:
        return [f"[{node_name}] ERROR: Could not list block devices for SMART check."]

    alerts = []
    for dev in devs.split():
        try:
            out = ssh_command(host, user, f"sudo smartctl -H /dev/{dev}")
        except Exception as exc:
            alerts.append(
                f"[{node_name}] ERROR: Could not run SMART check on /dev/{dev}: {exc}"
            )
            continue
        if "PASSED" in out:
            continue  # healthy
        if "FAILED" in out or "Pre-fail" in out or "SMART support is: Unavailable" in out:
            alerts.append(
                f"[{node_name}] CRITICAL: SMART health issue on /dev/{dev}!\n{out}"
            )
        elif "Unknown" in out or not out:
            alerts.append(
                f"[{node_name}] ERROR: SMART status unknown on /dev/{dev}. Output: {out}"
            )
        # Any other output is deliberately not reported.
    return alerts
|
|
|
|
# ==== Remote service check ====
|
|
def check_remote_service(host, user, service, node_name):
    """Verify that a systemd unit reports ``active`` on a remote node.

    Args:
        host: Hostname or address of the node.
        user: SSH login name.
        service: Name of the systemd unit to query.
        node_name: Label used as the ``[name]`` prefix of the alert.

    Returns:
        An alert string when the unit is not active or could not be queried;
        otherwise ``None``.
    """
    cmd = f"systemctl is-active {shlex.quote(service)}"
    try:
        state = ssh_command(host, user, cmd)
    except Exception as exc:
        # An unreachable node must become an alert, not abort the run.
        return f"[{node_name}] ERROR: Could not check service {service}: {exc}"
    if state.strip() != "active":
        return f"[{node_name}] CRITICAL: Service {service} not running!"
    return None
|
|
|
|
# ==== Remote RAID md0 check (robust for all mdstat layouts) ====
|
|
def _md0_member_status(mdstat):
    """Return the ``[UU_]``-style member map of md0 from mdstat text, or None."""
    inside_md0 = False
    for line in mdstat.splitlines():
        if line.startswith("md0"):
            inside_md0 = True
            continue
        if not inside_md0:
            continue
        match = re.search(r"\[[U_]+\]", line)
        if match:
            return match.group(0)
        # A blank line or a line containing ":" ends the md0 section.
        if not line.strip() or ":" in line:
            return None
    return None


def check_remote_raid_md0(host, user, node_name):
    """Check the health of ``/dev/md0`` on a remote node via ``/proc/mdstat``.

    Args:
        host: Hostname or address of the node.
        user: SSH login name.
        node_name: Label used as the ``[name]`` prefix of the alert.

    Returns:
        An alert string when mdstat could not be read, when no member map was
        found for md0, or when the map contains ``_`` (a missing member);
        otherwise ``None``.
    """
    try:
        # Going through ssh_command keeps connection handling in one place.
        mdstat = ssh_command(host, user, "cat /proc/mdstat")
    except Exception as e:
        return f"[{node_name}] ERROR: Could not check RAID health remotely: {e}"

    status = _md0_member_status(mdstat)
    if status is None:
        return f"[{node_name}] CRITICAL: /dev/md0 RAID status string not found!"
    if "_" in status:
        return f"[{node_name}] WARNING: /dev/md0 RAID degraded! Status: {status}"
    # Only U's: every member is up.
    return None
|
|
|
|
# ==== Remote log scan ====
|
|
def check_remote_logs(host, user, node_name):
    """Scan the tail of each file in ``LOG_FILES`` for ``LOG_PATTERNS``.

    The last 500 lines of every log are fetched over SSH. Each pattern that
    occurs in at least one of those lines yields one warning per log file.

    Args:
        host: Hostname or address of the node.
        user: SSH login name.
        node_name: Label used as the ``[name]`` prefix of each alert.

    Returns:
        A list of alert strings, empty when nothing matched.
    """
    findings = []
    for log_path in LOG_FILES:
        try:
            tail_lines = ssh_command(host, user, f"tail -500 {log_path}").split("\n")
            hits = [
                needle
                for needle in LOG_PATTERNS
                if any(needle in text for text in tail_lines)
            ]
        except Exception as err:
            findings.append(f"[{node_name}] ERROR: Could not read log {log_path}: {err}")
        else:
            findings.extend(
                f"[{node_name}] WARNING: Pattern '{needle}' in {log_path}"
                for needle in hits
            )
    return findings
|
|
|
|
# ==== Remote PostgreSQL replication check ====
|
|
def check_replication(host, node_name):
    """Check PostgreSQL streaming-replication lag on a database node.

    Connects as ``postgres`` to the ``postgres`` database on ``host``. When
    the server is in recovery, the time since the last replayed transaction
    is used as the lag.

    Args:
        host: Hostname or address of the PostgreSQL server.
        node_name: Label used as the ``[name]`` prefix of the alert.

    Returns:
        An alert string when the check failed, when no replay timestamp is
        available, or when the lag exceeds 10 seconds; otherwise ``None``
        (including when the server is not in recovery).
    """
    try:
        # Imported inside the try so a missing driver is reported as a failed
        # check by the except clause below.
        import psycopg2

        conn = psycopg2.connect(
            host=host, dbname="postgres", user="postgres", connect_timeout=5
        )
        try:
            cur = conn.cursor()
            try:
                cur.execute("SELECT pg_is_in_recovery();")
                is_replica = cur.fetchone()[0]
                if not is_replica:
                    return None
                cur.execute(
                    "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT;"
                )
                lag = cur.fetchone()[0]
            finally:
                cur.close()
        finally:
            # Close on every path, including the early return and errors.
            conn.close()
    except Exception as e:
        return f"[{node_name}] ERROR: Could not check replication: {e}"

    if lag is None:
        return f"[{node_name}] CRITICAL: Standby not streaming! Replication down."
    if lag > 10:
        return f"[{node_name}] WARNING: Replication lag is {lag} seconds."
    return None
|
|
|
|
# ==== Main routine ====
|
|
def _record(problems, node_name, label, check, *args):
    """Run one check and add its findings to ``problems``; a crash becomes an alert."""
    try:
        result = check(*args)
    except Exception as exc:
        problems.append(f"[{node_name}] ERROR: {label} check crashed: {exc}")
        return
    if not result:
        return
    # Checks return either a single alert string or a list of them.
    if isinstance(result, str):
        problems.append(result)
    else:
        problems.extend(result)


def main():
    """Run every configured check on every node and report the problems.

    Each check is isolated, so one unreachable node or failing check cannot
    prevent the remaining checks from running. When at least one problem was
    found, the report is printed and sent with ``mastodon_dm``; otherwise a
    single OK line is printed.
    """
    problems = []

    for node in NODES:
        name = node["name"]
        host = node["host"]
        user = node["ssh_user"]

        for disk in node["disks"]:
            _record(problems, name, f"disk {disk}", check_remote_disk, host, user, disk, name)

        _record(problems, name, "SMART", check_remote_smart, host, user, name)

        for svc in node["services"]:
            _record(problems, name, f"service {svc}", check_remote_service, host, user, svc, name)

        if node.get("db"):
            _record(problems, name, "replication", check_replication, host, name)

        # RAID check, only for nodes with "raid": True
        if node.get("raid", False):
            _record(problems, name, "RAID", check_remote_raid_md0, host, user, name)

        _record(problems, name, "log", check_remote_logs, host, user, name)

    if problems:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        msg = f"🚨 Genesis Radio Multi-Node Healthcheck {now} 🚨\n" + "\n".join(problems)
        print(msg)
        mastodon_dm(msg)
    else:
        print("OK: All checks passed.")


if __name__ == "__main__":
    main()
|