Auto commit from /home/doc/genesis-tools

This commit is contained in:
DocTator 2025-04-26 09:16:46 -04:00
parent 4924d79c07
commit 16fd65acc4
4 changed files with 3874 additions and 2 deletions

File diff suppressed because it is too large Load Diff

58
miscellaneous/dbcheck.log Normal file
View File

@ -0,0 +1,58 @@
WARNING: password file "/home/doc/.pgpass" has group or world access; permissions should be u=rw (0600) or less
WARNING: password file "/home/doc/.pgpass" has group or world access; permissions should be u=rw (0600) or less
🚨 Genesis Radio Multi-Node Healthcheck 2025-04-26 09:00:16 🚨
[db1] ERROR: SMART status unknown on /dev/sda. Output:
Exception (client): Error reading SSH protocol banner
Traceback (most recent call last):
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2369, in _check_banner
buf = self.packetizer.readline(timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 395, in readline
buf += self._read_timeout(timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 665, in _read_timeout
raise EOFError()
EOFError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2185, in run
self._check_banner()
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2373, in _check_banner
raise SSHException(
paramiko.ssh_exception.SSHException: Error reading SSH protocol banner
Traceback (most recent call last):
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2369, in _check_banner
buf = self.packetizer.readline(timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 395, in readline
buf += self._read_timeout(timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 665, in _read_timeout
raise EOFError()
EOFError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 251, in <module>
main()
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 218, in main
res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 85, in check_remote_disk
out = ssh_command(host, user, cmd)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 76, in ssh_command
ssh.connect(hostname=host, username=user, timeout=10)
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/client.py", line 451, in connect
t.start_client(timeout=timeout)
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 773, in start_client
raise e
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2185, in run
self._check_banner()
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2373, in _check_banner
raise SSHException(
paramiko.ssh_exception.SSHException: Error reading SSH protocol banner

View File

@@ -18,7 +18,7 @@ NODES = [
 "name": "shredder",
 "host": "38.102.127.171",
 "ssh_user": "doc",
-"services": [],
+"services": ["minio.service"],
 "disks": ["/", "/mnt/raid5"],
 "type": "remote",
 "db": False,
@@ -48,7 +48,7 @@ NODES = [
 "name": "db2",
 "host": "cluster.db2.genesishostingtechnologies.com",
 "ssh_user": "doc",
-"services": ["postgresql@16-postgresqlreplica.service"],
+"services": ["postgresql@16-main.service"],
 "disks": ["/", "/var/lib/postgresql"],
 "type": "remote",
 "db": True,

131
miscellaneous/dbv1.sh Executable file
View File

@ -0,0 +1,131 @@
#!/bin/bash
# Configuration for the PostgreSQL dump / restore-verify / upload run.
# Everything below is a plain global read by the rest of the script.

# ---- CONFIG ----
# Source server the dumps are taken from.
PG_REMOTE_USER="postgres"
PG_REMOTE_HOST="cluster.db2.genesishostingtechnologies.com"
PG_REMOTE_PORT="5432"
# Port of the local PostgreSQL instance used for the scratch restores.
PG_LOCAL_PORT="5432"
# Working directory for dump files, checksum files and the run log.
DUMP_DIR="/tmp/pgbackup_verify"
# scp destinations for dumps that passed verification.
BACKUP_TARGET="root@backup.sshjunkie.com:/mnt/backup/pgdumps"
CC_TARGET="doc@clustercontrol.sshjunkie.com:/home/doc/backups"
# Databases to dump and verify, in order.
DBS=("mastodon_production" "giteaprod" "hostingtootdb" "radiotootdb")
# One log file per run, named after the start time.
LOGFILE="$DUMP_DIR/verify_log_$(date +%Y%m%d_%H%M%S).txt"

# The dumps contain production data and DUMP_DIR lives under /tmp, so make
# every file this script creates readable by the invoking user only.
umask 077

# Neither the log nor any dump can be written without the working directory;
# stop here rather than failing once per database later on.
if ! mkdir -p "$DUMP_DIR"; then
  echo "[FAIL] Cannot create dump directory $DUMP_DIR" >&2
  exit 1
fi
# ==== Mastodon DM function ====
#######################################
# Post a direct-visibility status to the Mastodon API as an alert.
# Globals:   MASTODON_TOKEN (read, optional) - API bearer token; when it is
#            unset or empty the built-in token below is used.
# Arguments: $1 - message text (curl URL-encodes it)
# Outputs:   nothing on stdout; curl reports failures on stderr
# Returns:   curl's exit status (non-zero on transport or HTTP errors)
#######################################
mastodon_alert() {
  local msg="$1"
  # SECURITY: this fallback is a credential stored in the script itself.
  # Prefer supplying MASTODON_TOKEN from the environment; the literal is kept
  # only so existing invocations keep working. TODO: rotate and remove it.
  local token="${MASTODON_TOKEN:-rimxBLi-eaJAcwagkmoj6UoW7Lc473tQY0cOM041Euw}"
  # --fail turns HTTP 4xx/5xx into a non-zero exit instead of a silent
  # "success"; --max-time keeps an unresponsive endpoint from stalling the
  # backup run that is trying to raise the alert.
  curl -sS --fail --max-time 30 -X POST "https://chatwithus.live/api/v1/statuses" \
    -H "Authorization: Bearer $token" \
    --data-urlencode "status=$msg" \
    --data "visibility=direct" \
    --data "in_reply_to_account_id=114386383616633367" >/dev/null
}
# Overall verdict for the run, and the dumps that passed every check.
ALL_OK=true
UPLOAD_LIST=()

# For each database: dump it from the remote host, restore the dump into a
# throw-away local database, run sanity checks against the restored copy,
# then drop the throw-away database. Any failure clears ALL_OK, sends an
# alert and moves on to the next database.
for DB in "${DBS[@]}"; do
  echo "=== [$(date)] Dumping $DB from $PG_REMOTE_HOST ===" | tee -a "$LOGFILE"
  DUMPFILE="$DUMP_DIR/${DB}_$(date +%Y%m%d_%H%M%S).sql"

  # Dump from remote
  if ! pg_dump -h "$PG_REMOTE_HOST" -p "$PG_REMOTE_PORT" -U "$PG_REMOTE_USER" -d "$DB" > "$DUMPFILE"; then
    echo "[FAIL] Failed to dump $DB! Skipping upload." | tee -a "$LOGFILE"
    ALL_OK=false
    mastodon_alert "🚨 Database backup/verify FAILED: Could not dump $DB from $PG_REMOTE_HOST on $(hostname) at $(date). See log: $LOGFILE"
    continue
  fi

  # Generate checksum for the dump file
  CHECKSUM_FILE="$DUMPFILE.sha256"
  sha256sum "$DUMPFILE" > "$CHECKSUM_FILE"

  # Restore/verify on Krang
  TESTDB="verify_${DB}_$RANDOM"
  echo "Creating test database $TESTDB" | tee -a "$LOGFILE"
  if ! sudo -u postgres createdb -p "$PG_LOCAL_PORT" "$TESTDB"; then
    echo "[FAIL] Failed to create $TESTDB!" | tee -a "$LOGFILE"
    ALL_OK=false
    mastodon_alert "🚨 Database backup/verify FAILED: Could not create test DB $TESTDB on $(hostname) at $(date). See log: $LOGFILE"
    continue
  fi

  # Restore the dump
  # NOTE(review): psql runs without -v ON_ERROR_STOP=1, so individual SQL
  # errors during the restore do not make this step fail. Left as is because
  # enabling it may reject dumps that currently restore with tolerated
  # errors - confirm before tightening.
  echo "Restoring to $TESTDB" | tee -a "$LOGFILE"
  if ! sudo -u postgres psql -p "$PG_LOCAL_PORT" -d "$TESTDB" < "$DUMPFILE"; then
    echo "[FAIL] Restore failed for $DB!" | tee -a "$LOGFILE"
    sudo -u postgres dropdb -p "$PG_LOCAL_PORT" "$TESTDB"
    ALL_OK=false
    mastodon_alert "🚨 Database backup/verify FAILED: Restore failed for $DB on $(hostname) at $(date). See log: $LOGFILE"
    continue
  fi

  # Sanity check: check row count of a critical table (e.g., 'users' table)
  ROW_COUNT=$(sudo -u postgres psql -p "$PG_LOCAL_PORT" -d "$TESTDB" -t -c "SELECT count(*) FROM users;")
  # psql -t pads the value with whitespace; strip it before testing.
  ROW_COUNT="${ROW_COUNT//[[:space:]]/}"
  echo "Row count for 'users' table in $TESTDB: $ROW_COUNT" | tee -a "$LOGFILE"
  if ! [[ "$ROW_COUNT" =~ ^[0-9]+$ ]]; then
    # The query produced no number (for instance the database has no 'users'
    # table). Carry on with the remaining checks, but say so in the log
    # instead of reporting a pass for a count that was never obtained.
    echo "[WARN] Could not read a row count from 'users' in $DB; skipping row-count check." | tee -a "$LOGFILE"
  elif [ "$ROW_COUNT" -le 0 ]; then
    echo "[FAIL] No rows in 'users' table after restore!" | tee -a "$LOGFILE"
    # Do not leave the scratch database behind on this failure path.
    sudo -u postgres dropdb -p "$PG_LOCAL_PORT" "$TESTDB"
    ALL_OK=false
    mastodon_alert "🚨 Database backup/verify FAILED: No rows found in 'users' table after restore for $DB on $(hostname) at $(date). See log: $LOGFILE"
    continue
  else
    echo "[PASS] Row count OK for 'users' table in $DB." | tee -a "$LOGFILE"
  fi

  # Verify checksum: re-hash the dump and compare it with the checksum
  # recorded right after the dump was written, so a file that changed or was
  # truncated in the meantime is caught.
  if ! sha256sum --check --status "$CHECKSUM_FILE"; then
    echo "[FAIL] Checksum mismatch for $DB!" | tee -a "$LOGFILE"
    # Do not leave the scratch database behind on this failure path.
    sudo -u postgres dropdb -p "$PG_LOCAL_PORT" "$TESTDB"
    ALL_OK=false
    mastodon_alert "🚨 Database backup/verify FAILED: Checksum mismatch for $DB on $(hostname) at $(date). See log: $LOGFILE"
    continue
  else
    echo "[PASS] Checksum verified for $DB." | tee -a "$LOGFILE"
  fi

  # Quick table listing for sanity
  sudo -u postgres psql -p "$PG_LOCAL_PORT" -d "$TESTDB" -c "\dt" | tee -a "$LOGFILE"
  # $? would report tee's status here; PIPESTATUS[0] is psql's.
  if [ "${PIPESTATUS[0]}" -eq 0 ]; then
    echo "[PASS] $DB: Dump and restore OK." | tee -a "$LOGFILE"
    UPLOAD_LIST+=("$DUMPFILE")
  else
    echo "[FAIL] $DB: Test query failed!" | tee -a "$LOGFILE"
    ALL_OK=false
    mastodon_alert "🚨 Database backup/verify FAILED: Test query failed for $DB on $(hostname) at $(date). See log: $LOGFILE"
  fi

  sudo -u postgres dropdb -p "$PG_LOCAL_PORT" "$TESTDB"
  echo "Cleaned up $TESTDB" | tee -a "$LOGFILE"
  echo "" | tee -a "$LOGFILE"
done
# Ship the dumps only when every configured database passed verification.
# The primary backup target is tried first; the ClusterControl copy is only
# attempted after that succeeds, and the local dump files are removed only
# once both copies have gone through.
if ! $ALL_OK || [ "${#UPLOAD_LIST[@]}" -ne "${#DBS[@]}" ]; then
  echo "Not all backups verified! Nothing uploaded." | tee -a "$LOGFILE"
  mastodon_alert "🚨 Database backup/verify FAILED: One or more DBs failed verification on $(hostname) at $(date). See log: $LOGFILE"
else
  echo "All dumps verified, sending to $BACKUP_TARGET" | tee -a "$LOGFILE"
  if ! scp "${UPLOAD_LIST[@]}" "$BACKUP_TARGET"; then
    echo "[FAIL] Upload to thevault failed!" | tee -a "$LOGFILE"
    mastodon_alert "🚨 Database backup/verify FAILED: Upload to $BACKUP_TARGET failed on $(hostname) at $(date). See log: $LOGFILE"
  else
    echo "Uploads to thevault successful." | tee -a "$LOGFILE"
    # Second copy goes to the ClusterControl controller.
    echo "Uploading to ClusterControl controller at $CC_TARGET" | tee -a "$LOGFILE"
    if scp "${UPLOAD_LIST[@]}" "$CC_TARGET"; then
      echo "Uploads to ClusterControl successful." | tee -a "$LOGFILE"
      rm -f "${UPLOAD_LIST[@]}"
    else
      echo "[WARN] Upload to ClusterControl controller failed!" | tee -a "$LOGFILE"
      mastodon_alert "⚠️ Database backup verified, but upload to ClusterControl at $CC_TARGET failed on $(hostname) at $(date). See log: $LOGFILE"
    fi
  fi
fi
echo "DONE. Log: $LOGFILE"