190 lines
7.7 KiB
Python
190 lines
7.7 KiB
Python
import datetime
import json
import os
import time

import paramiko
import requests
|
|
|
|
# ==== CONFIG ====
|
|
# --- Mastodon account used to deliver alert DMs ---
MASTODON_INSTANCE = "https://chatwithus.live"
# NOTE(review): this is an API credential committed in source. Consider
# loading it from the environment or a secrets file and rotating the value.
MASTODON_TOKEN = "rimxBLi-eaJAcwagkmoj6UoW7Lc473tQY0cOM041Euw"
MASTODON_USER_ID = "114386383616633367"

# Path of the HTML status page that main() rewrites on every run.
HEALTHCHECK_HTML = "/var/www/html/healthcheck.html"

# Alert thresholds. Presumably percentages of free space / free inodes —
# TODO confirm against the disk check that consumes them.
DISK_WARN_THRESHOLD = 10
INODE_WARN_THRESHOLD = 10

# Remote log files that are tailed, and the substrings searched for in them.
LOG_FILES = [
    "/var/log/syslog",
    "/var/log/nginx/error.log",
]
LOG_PATTERNS = [
    "ERROR",
    "FATAL",
    "disk full",
    "out of memory",
]
# Log lines containing any of these substrings never raise an alert.
SUPPRESSED_PATTERNS = [
    "SomeKnownHarmlessMastodonError",
]

# One entry per monitored machine. "db" enables the replication check and
# "raid" enables the RAID check in main().
NODES = [
    {
        "name": "shredder",
        "host": "38.102.127.171",
        "ssh_user": "doc",
        "services": ["minio.service"],
        "disks": ["/", "/mnt/raid5"],
        "type": "remote",
        "db": False,
        "raid": True,
    },
    {
        "name": "mastodon",
        "host": "chatwithus.live",
        "ssh_user": "root",
        "services": ["nginx", "mastodon-web"],
        "disks": ["/"],
        "type": "remote",
        "db": False,
        "raid": False,
    },
    {
        "name": "db1",
        "host": "cluster.db1.genesishostingtechnologies.com",
        "ssh_user": "doc",
        "services": ["postgresql@16-main.service"],
        "disks": ["/", "/var/lib/postgresql"],
        "type": "remote",
        "db": True,
        "raid": False,
    },
    {
        "name": "db2",
        "host": "cluster.db2.genesishostingtechnologies.com",
        "ssh_user": "doc",
        "services": ["postgresql@16-main.service"],
        "disks": ["/", "/var/lib/postgresql"],
        "type": "remote",
        "db": True,
        "raid": False,
    },
]
|
|
|
|
# ==== Mastodon DM function with retry ====
|
|
def mastodon_dm(message, retries=3):
    """Post *message* as a direct-visibility status, retrying on failure.

    Args:
        message: Text of the status to post.
        retries: Maximum number of attempts before giving up.

    Returns:
        None. The function returns as soon as one attempt gets HTTP 200;
        if every attempt fails, each failure is printed and the function
        returns without raising.
    """
    url = f"{MASTODON_INSTANCE}/api/v1/statuses"
    headers = {"Authorization": f"Bearer {MASTODON_TOKEN}"}
    # NOTE(review): confirm the API honours "in_reply_to_account_id" on
    # status creation; direct statuses are normally addressed by an
    # @mention inside the status text.
    payload = {
        "status": message,
        "visibility": "direct",
        "in_reply_to_account_id": MASTODON_USER_ID,
    }

    for attempt in range(retries):
        try:
            # An explicit timeout keeps a stalled connection from hanging
            # the whole health check.
            resp = requests.post(url, headers=headers, data=payload, timeout=15)
        except requests.RequestException as exc:
            # Network-level failures are retried like HTTP errors instead
            # of aborting the caller on the first attempt.
            failure = str(exc)
        else:
            if resp.status_code == 200:
                return
            failure = resp.text

        print(f"Failed to send Mastodon DM (attempt {attempt+1}): {failure}")
        # Back off only when another attempt will follow.
        if attempt + 1 < retries:
            time.sleep(5)
|
|
|
|
# ==== SSH command runner ====
|
|
def ssh_command(host, user, cmd):
    """Run *cmd* on *host* over SSH and return its stripped stdout.

    Args:
        host: Hostname or IP address to connect to.
        user: SSH username.
        cmd: Command line to execute remotely.

    Returns:
        The command's standard output decoded as UTF-8 with surrounding
        whitespace removed. Standard error is not captured.

    Raises:
        Whatever paramiko raises for connection or execution failures;
        the client is closed before the exception propagates.
    """
    ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification. Consider loading known_hosts and rejecting unknown keys.
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        ssh.connect(hostname=host, username=user, timeout=10)
        _stdin, stdout, _stderr = ssh.exec_command(cmd)
        # errors="replace" keeps one stray non-UTF-8 byte (common in log
        # files) from turning the whole read into a failure.
        return stdout.read().decode(errors="replace").strip()
    finally:
        # Always release the connection, including when connect() or
        # exec_command() raises.
        ssh.close()
|
|
|
|
# ==== Emoji chooser ====
|
|
def choose_emoji(line):
    """Return the emoji that prefixes an alert *line* in outgoing reports.

    Rules are tried in order and the first match wins. "RAID", "Service"
    and "Replication" are matched case-sensitively, "disk" and "rclone"
    case-insensitively. A line matching no rule gets the generic warning
    sign.
    """
    rules = (
        (lambda text: "RAID" in text, "🧨"),
        (lambda text: "disk" in text.lower(), "📈"),
        (lambda text: "rclone" in text.lower(), "🐢"),
        (lambda text: "Service" in text, "🛑"),
        (lambda text: "Replication" in text, "💥"),
    )
    # The generator is lazy, so later rules are not evaluated once an
    # earlier one has matched.
    return next((emoji for matches, emoji in rules if matches(line)), "⚠️")
|
|
|
|
# ==== Check rclone health ====
|
|
# Cache size, in bytes, above which a warning is raised.
_RCLONE_CACHE_WARN_BYTES = 100_000_000_000


def check_rclone_health(node):
    """Query ``rclone rc vfs/stats`` on *node* and classify the result.

    Args:
        node: Node dict providing "host", "ssh_user" and "name".

    Returns:
        ``("critical", message)`` when the command output reports an error
        or the check itself raises, ``("warning", message)`` when the
        reported cache usage exceeds the threshold, otherwise ``None``.
    """
    try:
        result = ssh_command(node["host"], node["ssh_user"], "rclone rc vfs/stats")

        try:
            stats = json.loads(result)
        except ValueError:
            stats = None

        if isinstance(stats, dict):
            # Inspect real keys instead of substrings: a healthy stats
            # document contains field names such as "erroredFiles", which
            # a plain "error" substring test would flag as a failure.
            # NOTE(review): assumes rc failures arrive as a JSON object with
            # a top-level "error" key — confirm against the deployed rclone.
            failed = "error" in stats
            disk_cache = stats.get("diskCache")
            bytes_used = (
                disk_cache.get("bytesUsed") if isinstance(disk_cache, dict) else None
            )
        else:
            # Output is not a JSON object (for example a plain-text error),
            # so fall back to the textual heuristic.
            lowered = result.lower()
            failed = "error" in lowered or "failed" in lowered
            bytes_used = None

        if failed:
            return ("critical", f"[{node['name']}] ERROR: rclone health check failed. Output: {result}")

        if bytes_used is not None:
            bytes_used = int(bytes_used)
            if bytes_used > _RCLONE_CACHE_WARN_BYTES:
                return ("warning", f"[{node['name']}] WARNING: rclone cache usage high: {bytes_used} bytes used.")
    except Exception as e:
        # Any failure to run or interpret the check is reported as critical
        # rather than aborting the whole health run.
        return ("critical", f"[{node['name']}] ERROR: Could not check rclone health: {str(e)}")
    return None
|
|
|
|
# ==== Remote log scan ====
|
|
def check_remote_logs(host, user, node_name):
    """Scan the tail of each configured log on a remote host for patterns.

    For every file in LOG_FILES the last 500 lines are fetched over SSH.
    Lines containing any SUPPRESSED_PATTERNS entry are ignored; each
    LOG_PATTERNS entry found in the remaining lines yields one alert.

    Args:
        host: Hostname or IP address of the node.
        user: SSH username.
        node_name: Label used as the "[name]" prefix of each alert.

    Returns:
        A list of alert strings, empty when nothing matched. A log that
        cannot be read contributes an "ERROR: Could not read log" entry.
    """
    alerts = []
    for log in LOG_FILES:
        cmd = f"tail -500 {log}"
        try:
            out = ssh_command(host, user, cmd)
        except Exception as e:
            alerts.append(f"[{node_name}] ERROR: Could not read log {log}: {e}")
            continue

        # Drop suppressed lines once instead of re-testing them per pattern.
        relevant = [
            line
            for line in out.split("\n")
            if not any(suppress in line for suppress in SUPPRESSED_PATTERNS)
        ]
        for pattern in LOG_PATTERNS:
            # The alert text does not include the matching line, so one
            # alert per (pattern, log) carries the same information as one
            # per line and avoids flooding the report with duplicates.
            if any(pattern in line for line in relevant):
                alerts.append(f"[{node_name}] WARNING: Pattern '{pattern}' in {log}")
    return alerts
|
|
|
|
# ==== Main Routine ====
|
|
_STATUS_HEALTHY = "✅ Healthy"
_STATUS_WARNING = "⚠️ Warning"
_STATUS_CRITICAL = "🚨 Critical"


def _severity(res, default=None):
    """Classify a check message by its CRITICAL/WARNING keyword."""
    if "CRITICAL" in res:
        return "critical"
    if "WARNING" in res:
        return "warning"
    return default


def _collect_findings(node):
    """Run every check that applies to *node*; return (level, message) pairs."""
    findings = []

    if "rclone" in node.get("services", []):
        res = check_rclone_health(node)
        if res:
            level, msg = res
            findings.append(("critical" if level == "critical" else "warning", msg))

    for disk in node["disks"]:
        res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"])
        if res:
            level = _severity(res)
            # Messages carrying neither keyword are ignored.
            if level:
                findings.append((level, res))

    for svc in node["services"]:
        res = check_remote_service(node["host"], node["ssh_user"], svc, node["name"])
        if res:
            level = _severity(res)
            if level:
                findings.append((level, res))

    if node.get("db"):
        res = check_replication(node["host"], node["name"])
        if res:
            # Any replication finding is treated as critical.
            findings.append(("critical", res))

    if node.get("raid", False):
        res = check_remote_raid_md0(node["host"], node["ssh_user"], node["name"])
        if res:
            # Any RAID finding that is not CRITICAL counts as a warning.
            findings.append((_severity(res, default="warning"), res))

    for alert in check_remote_logs(node["host"], node["ssh_user"], node["name"]):
        findings.append(("warning", alert))

    return findings


def _write_dashboard(node_status, now):
    """Write the per-node status table to HEALTHCHECK_HTML."""
    parts = [
        "<html><head><title>Genesis Radio Healthcheck</title><meta http-equiv='refresh' content='60'></head><body>",
        "<h1>Genesis Radio System Health</h1>",
        f"<p>Last Checked: {now}</p>",
        # The newline is written as an escape: a raw line break inside a
        # single-quoted string literal is a syntax error.
        "<table border='1' cellpadding='5' style='border-collapse: collapse;'>\n<tr><th>System</th><th>Status</th></tr>",
    ]
    for node, status in node_status.items():
        color = 'green' if 'Healthy' in status else ('orange' if 'Warning' in status else 'red')
        parts.append(f"<tr><td>{node}</td><td style='color:{color};'>{status}</td></tr>")
    parts.append("</table></body></html>")

    # The status labels contain emoji, so the encoding is fixed explicitly
    # rather than left to the process locale.
    with open(HEALTHCHECK_HTML, "w", encoding="utf-8") as f:
        f.write("".join(parts))


def main():
    """Check every node in NODES, send Mastodon reports, write the dashboard.

    Each node starts as healthy and is escalated to warning or critical by
    its findings. Critical and warning findings are printed and sent as
    separate direct messages, and an all-clear message is sent when there
    are none. The HTML dashboard is rewritten last.

    The disk, service, replication and RAID checks called here
    (check_remote_disk, check_remote_service, check_replication,
    check_remote_raid_md0) are not defined in this section of the file.
    They are assumed to be defined elsewhere and to return a falsy value
    or a message string — TODO confirm.
    """
    critical_problems = []
    warning_problems = []
    node_status = {}

    for node in NODES:
        status = _STATUS_HEALTHY
        for level, msg in _collect_findings(node):
            if level == "critical":
                critical_problems.append(msg)
                status = _STATUS_CRITICAL
            else:
                # A warning is always recorded, even when the node is
                # already critical; only the status label is not downgraded.
                warning_problems.append(msg)
                if status != _STATUS_CRITICAL:
                    status = _STATUS_WARNING
        node_status[node["name"]] = status

    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    if critical_problems:
        formatted = "\n".join(f"- {choose_emoji(p)} {p}" for p in critical_problems)
        msg = f"🚨 Genesis Radio Critical Healthcheck {now} 🚨\n⚡ {len(critical_problems)} critical issues found:\n{formatted}"
        print(msg)
        mastodon_dm(msg)

    if warning_problems:
        formatted = "\n".join(f"- {choose_emoji(p)} {p}" for p in warning_problems)
        msg = f"⚠️ Genesis Radio Warning Healthcheck {now} ⚠️\n⚡ {len(warning_problems)} warnings found:\n{formatted}"
        print(msg)
        mastodon_dm(msg)

    if not critical_problems and not warning_problems:
        msg = f"✅ Genesis Radio Healthcheck {now}: All systems normal."
        print(msg)
        mastodon_dm(msg)

    # Write healthcheck HTML dashboard
    _write_dashboard(node_status, now)
|
|
|
|
# Run the health check only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|