Auto commit from /home/doc/genesis-tools
This commit is contained in:
parent
4e4f7d797a
commit
a7fee7cfee
File diff suppressed because it is too large
Load Diff
@ -392,3 +392,10 @@ OK: All checks passed.
|
||||
f.write("<table border='1' cellpadding='5' style='border-collapse: collapse;'>
|
||||
^
|
||||
SyntaxError: unterminated string literal (detected at line 181)
|
||||
Traceback (most recent call last):
|
||||
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 187, in <module>
|
||||
main()
|
||||
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 113, in main
|
||||
res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"])
|
||||
^^^^^^^^^^^^^^^^^
|
||||
NameError: name 'check_remote_disk' is not defined. Did you mean: 'check_remote_logs'?
|
||||
|
@ -3,6 +3,7 @@ import requests
|
||||
import datetime
|
||||
import paramiko
|
||||
import time
|
||||
import psycopg2
|
||||
|
||||
# ==== CONFIG ====
|
||||
MASTODON_INSTANCE = "https://chatwithus.live"
|
||||
@ -17,13 +18,13 @@ LOG_PATTERNS = ["ERROR", "FATAL", "disk full", "out of memory"]
|
||||
SUPPRESSED_PATTERNS = ["SomeKnownHarmlessMastodonError"]
|
||||
|
||||
NODES = [
|
||||
{"name": "shredder", "host": "38.102.127.171", "ssh_user": "doc", "services": ["minio.service"], "disks": ["/", "/mnt/raid5"], "type": "remote", "db": False, "raid": True},
|
||||
{"name": "mastodon", "host": "chatwithus.live", "ssh_user": "root", "services": ["nginx", "mastodon-web"], "disks": ["/"], "type": "remote", "db": False, "raid": False},
|
||||
{"name": "db1", "host": "cluster.db1.genesishostingtechnologies.com", "ssh_user": "doc", "services": ["postgresql@16-main.service"], "disks": ["/", "/var/lib/postgresql"], "type": "remote", "db": True, "raid": False},
|
||||
{"name": "db2", "host": "cluster.db2.genesishostingtechnologies.com", "ssh_user": "doc", "services": ["postgresql@16-main.service"], "disks": ["/", "/var/lib/postgresql"], "type": "remote", "db": True, "raid": False}
|
||||
{"name": "shredder", "host": "38.102.127.171", "ssh_user": "doc", "services": ["minio.service"], "disks": ["/", "/mnt/raid5"], "db": False, "raid": True},
|
||||
{"name": "mastodon", "host": "chatwithus.live", "ssh_user": "root", "services": ["nginx", "mastodon-web"], "disks": ["/"], "db": False, "raid": False},
|
||||
{"name": "db1", "host": "cluster.db1.genesishostingtechnologies.com", "ssh_user": "doc", "services": ["postgresql@16-main.service"], "disks": ["/", "/var/lib/postgresql"], "db": True, "raid": False},
|
||||
{"name": "db2", "host": "cluster.db2.genesishostingtechnologies.com", "ssh_user": "doc", "services": ["postgresql@16-main.service"], "disks": ["/", "/var/lib/postgresql"], "db": True, "raid": False}
|
||||
]
|
||||
|
||||
# ==== Mastodon DM with retry ====
|
||||
# ==== Mastodon DM ====
|
||||
def mastodon_dm(message, retries=3):
|
||||
url = f"{MASTODON_INSTANCE}/api/v1/statuses"
|
||||
headers = {"Authorization": f"Bearer {MASTODON_TOKEN}"}
|
||||
@ -35,7 +36,7 @@ def mastodon_dm(message, retries=3):
|
||||
print(f"Failed to send Mastodon DM (attempt {attempt+1}): {resp.text}")
|
||||
time.sleep(5)
|
||||
|
||||
# ==== SSH command runner ====
|
||||
# ==== SSH Helper ====
|
||||
def ssh_command(host, user, cmd):
|
||||
ssh = paramiko.SSHClient()
|
||||
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
@ -45,7 +46,7 @@ def ssh_command(host, user, cmd):
|
||||
ssh.close()
|
||||
return out
|
||||
|
||||
# ==== Emoji chooser ====
|
||||
# ==== Health Check Helpers ====
|
||||
def choose_emoji(line):
|
||||
if "RAID" in line:
|
||||
return "🧨"
|
||||
@ -59,18 +60,69 @@ def choose_emoji(line):
|
||||
return "💥"
|
||||
return "⚠️"
|
||||
|
||||
# ==== Health checks ====
|
||||
def check_rclone_health(node):
|
||||
def check_remote_disk(host, user, path, node_name):
|
||||
try:
|
||||
result = ssh_command(node["host"], node["ssh_user"], "rclone rc vfs/stats")
|
||||
if "error" in result.lower() or "failed" in result.lower():
|
||||
return ("critical", f"[{node['name']}] ERROR: rclone health check failed. Output: {result}")
|
||||
if "bytesUsed" in result:
|
||||
bytes_used = int(result.split('"bytesUsed":')[1].split(',')[0].strip())
|
||||
if bytes_used > 100000000000:
|
||||
return ("warning", f"[{node['name']}] WARNING: rclone cache usage high: {bytes_used} bytes used.")
|
||||
cmd = f"df --output=pcent {path} | tail -1 | tr -dc '0-9'"
|
||||
out = ssh_command(host, user, cmd)
|
||||
if not out:
|
||||
return f"[{node_name}] ERROR: Disk {path} not found or could not check disk usage."
|
||||
percent = int(out)
|
||||
if percent > (100 - DISK_WARN_THRESHOLD):
|
||||
return f"[{node_name}] WARNING: Only {100 - percent}% disk free on {path}."
|
||||
except Exception as e:
|
||||
return ("critical", f"[{node['name']}] ERROR: Could not check rclone health: {str(e)}")
|
||||
return f"[{node_name}] ERROR: Disk check failed: {e}"
|
||||
return None
|
||||
|
||||
def check_remote_service(host, user, service, node_name):
    """Report whether a systemd unit is active on a remote node.

    Runs ``systemctl is-active <service>`` on ``host`` through ``ssh_command``
    and compares the stripped output with the literal string ``active``.

    Args:
        host: Address passed to ``ssh_command``.
        user: SSH user passed to ``ssh_command``.
        service: Unit name interpolated into the ``systemctl`` command line.
        node_name: Label used as the ``[name]`` prefix of any alert text.

    Returns:
        ``None`` when the unit reports ``active``; otherwise an alert string
        tagged ``CRITICAL`` (unit not active) or ``ERROR`` (the check itself
        raised).
    """
    try:
        state = ssh_command(host, user, f"systemctl is-active {service}").strip()
    except Exception as exc:
        # A failure to run the check is reported as an alert, never raised.
        return f"[{node_name}] ERROR: Service check failed: {exc}"

    if state == "active":
        return None
    return f"[{node_name}] CRITICAL: Service {service} not running!"
|
||||
|
||||
def check_replication(host, node_name):
    """Check PostgreSQL replication state and replay lag on ``host``.

    Connects to the ``postgres`` database as user ``postgres`` with a 5 second
    connect timeout and asks ``pg_is_in_recovery()``. When that is true, the
    age in seconds of ``pg_last_xact_replay_timestamp()`` is used as the lag.

    Args:
        host: Database host passed to ``psycopg2.connect``.
        node_name: Label used as the ``[name]`` prefix of any alert text.

    Returns:
        ``None`` when the server is not in recovery or the lag is below
        60 seconds; otherwise an alert string tagged ``CRITICAL`` (lag is
        NULL), ``WARNING`` (lag of 60 seconds or more) or ``ERROR`` (the check
        itself raised).
    """
    # Local import keeps this block self-contained; the module is stdlib.
    from contextlib import closing

    try:
        # closing() guarantees cursor and connection are released on every
        # path, including the early returns below. A psycopg2 connection used
        # directly as a context manager only ends the transaction, it does not
        # close the connection, hence the explicit closing() wrappers.
        with closing(
            psycopg2.connect(host=host, dbname="postgres", user="postgres", connect_timeout=5)
        ) as conn, closing(conn.cursor()) as cur:
            cur.execute("SELECT pg_is_in_recovery();")
            is_replica = cur.fetchone()[0]
            if is_replica:
                cur.execute("SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT;")
                lag = cur.fetchone()[0]
                if lag is None:
                    return f"[{node_name}] CRITICAL: Standby not streaming! Replication down."
                if lag >= 60:
                    return f"[{node_name}] WARNING: Replication lag is {lag} seconds."
    except Exception as e:
        # Connection, query and close failures are all reported as an alert.
        return f"[{node_name}] ERROR: Replication check failed: {e}"
    return None
|
||||
|
||||
def check_remote_raid_md0(host, user, node_name):
    """Check the member status of the ``md0`` array on a remote node.

    Reads ``/proc/mdstat`` through ``ssh_command``, finds the line starting
    with ``md0`` and scans the lines that follow for the member map, a
    bracketed run of ``U`` and ``_`` characters such as ``[UU_]``.

    The map is matched explicitly rather than taking the first bracketed
    token: the same line normally carries a device counter such as ``[3/2]``
    before it, and that counter never contains ``_``, so slicing the first
    ``[...]`` would hide a degraded array.

    Args:
        host: Address passed to ``ssh_command``.
        user: SSH user passed to ``ssh_command``.
        node_name: Label used as the ``[name]`` prefix of any alert text.

    Returns:
        ``None`` when a member map is found and contains no ``_``; otherwise
        an alert string tagged ``CRITICAL`` (no member map found for md0),
        ``WARNING`` (map contains ``_``) or ``ERROR`` (the check itself
        raised).
    """
    # Local import keeps this block self-contained; the module is stdlib.
    import re

    try:
        mdstat = ssh_command(host, user, "cat /proc/mdstat")
        status = None
        inside_md0 = False
        for line in mdstat.splitlines():
            if line.startswith("md0"):
                inside_md0 = True
            elif inside_md0:
                member_map = re.search(r"\[[U_]+\]", line)
                if member_map:
                    status = member_map.group(0)
                    break
                # A blank line or another "name : ..." line ends the md0 stanza.
                if line.strip() == "" or ":" in line:
                    break
        if status is None:
            return f"[{node_name}] CRITICAL: /dev/md0 RAID status string not found!"
        if "_" in status:
            return f"[{node_name}] WARNING: /dev/md0 RAID degraded! Status: {status}"
    except Exception as e:
        # SSH or parsing failures are reported as an alert, never raised.
        return f"[{node_name}] ERROR: RAID check failed: {e}"
    return None
|
||||
|
||||
def check_remote_logs(host, user, node_name):
|
||||
@ -88,7 +140,7 @@ def check_remote_logs(host, user, node_name):
|
||||
alerts.append(f"[{node_name}] ERROR: Could not read log {log}: {e}")
|
||||
return alerts
|
||||
|
||||
# ==== Main routine ====
|
||||
# ==== Main Routine ====
|
||||
def main():
|
||||
critical_problems = []
|
||||
warning_problems = []
|
||||
@ -97,18 +149,6 @@ def main():
|
||||
for node in NODES:
|
||||
status = "✅ Healthy"
|
||||
|
||||
if "rclone" in node.get("services", []):
|
||||
res = check_rclone_health(node)
|
||||
if res:
|
||||
level, msg = res
|
||||
if level == "critical":
|
||||
critical_problems.append(msg)
|
||||
status = "🚨 Critical"
|
||||
else:
|
||||
warning_problems.append(msg)
|
||||
if status != "🚨 Critical":
|
||||
status = "⚠️ Warning"
|
||||
|
||||
for disk in node["disks"]:
|
||||
res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"])
|
||||
if res:
|
||||
@ -132,8 +172,13 @@ def main():
|
||||
if node.get("db"):
|
||||
res = check_replication(node["host"], node["name"])
|
||||
if res:
|
||||
if "CRITICAL" in res:
|
||||
critical_problems.append(res)
|
||||
status = "🚨 Critical"
|
||||
else:
|
||||
warning_problems.append(res)
|
||||
if status != "🚨 Critical":
|
||||
status = "⚠️ Warning"
|
||||
|
||||
if node.get("raid", False):
|
||||
res = check_remote_raid_md0(node["host"], node["ssh_user"], node["name"])
|
||||
@ -141,13 +186,14 @@ def main():
|
||||
if "CRITICAL" in res:
|
||||
critical_problems.append(res)
|
||||
status = "🚨 Critical"
|
||||
elif status != "🚨 Critical":
|
||||
else:
|
||||
warning_problems.append(res)
|
||||
if status != "🚨 Critical":
|
||||
status = "⚠️ Warning"
|
||||
|
||||
logs = check_remote_logs(node["host"], node["ssh_user"], node["name"])
|
||||
if logs:
|
||||
warning_problems.extend(logs)
|
||||
for log_alert in logs:
|
||||
warning_problems.append(log_alert)
|
||||
if status != "🚨 Critical":
|
||||
status = "⚠️ Warning"
|
||||
|
||||
@ -172,7 +218,7 @@ def main():
|
||||
print(msg)
|
||||
mastodon_dm(msg)
|
||||
|
||||
# Write healthcheck HTML dashboard
|
||||
# Write dashboard
|
||||
with open(HEALTHCHECK_HTML, "w") as f:
|
||||
f.write("<html><head><title>Genesis Radio Healthcheck</title><meta http-equiv='refresh' content='60'></head><body>")
|
||||
f.write(f"<h1>Genesis Radio System Health</h1>")
|
||||
|
Loading…
x
Reference in New Issue
Block a user