Auto commit from /home/doc/genesis-tools
This commit is contained in:
parent
4924d79c07
commit
16fd65acc4
File diff suppressed because it is too large
Load Diff
58
miscellaneous/dbcheck.log
Normal file
58
miscellaneous/dbcheck.log
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
WARNING: password file "/home/doc/.pgpass" has group or world access; permissions should be u=rw (0600) or less
|
||||||
|
WARNING: password file "/home/doc/.pgpass" has group or world access; permissions should be u=rw (0600) or less
|
||||||
|
🚨 Genesis Radio Multi-Node Healthcheck 2025-04-26 09:00:16 🚨
|
||||||
|
[db1] ERROR: SMART status unknown on /dev/sda. Output:
|
||||||
|
Exception (client): Error reading SSH protocol banner
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2369, in _check_banner
|
||||||
|
buf = self.packetizer.readline(timeout)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 395, in readline
|
||||||
|
buf += self._read_timeout(timeout)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 665, in _read_timeout
|
||||||
|
raise EOFError()
|
||||||
|
EOFError
|
||||||
|
|
||||||
|
During handling of the above exception, another exception occurred:
|
||||||
|
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2185, in run
|
||||||
|
self._check_banner()
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2373, in _check_banner
|
||||||
|
raise SSHException(
|
||||||
|
paramiko.ssh_exception.SSHException: Error reading SSH protocol banner
|
||||||
|
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2369, in _check_banner
|
||||||
|
buf = self.packetizer.readline(timeout)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 395, in readline
|
||||||
|
buf += self._read_timeout(timeout)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 665, in _read_timeout
|
||||||
|
raise EOFError()
|
||||||
|
EOFError
|
||||||
|
|
||||||
|
During handling of the above exception, another exception occurred:
|
||||||
|
|
||||||
|
Traceback (most recent call last):
|
||||||
|
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 251, in <module>
|
||||||
|
main()
|
||||||
|
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 218, in main
|
||||||
|
res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"])
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 85, in check_remote_disk
|
||||||
|
out = ssh_command(host, user, cmd)
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 76, in ssh_command
|
||||||
|
ssh.connect(hostname=host, username=user, timeout=10)
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/client.py", line 451, in connect
|
||||||
|
t.start_client(timeout=timeout)
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 773, in start_client
|
||||||
|
raise e
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2185, in run
|
||||||
|
self._check_banner()
|
||||||
|
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2373, in _check_banner
|
||||||
|
raise SSHException(
|
||||||
|
paramiko.ssh_exception.SSHException: Error reading SSH protocol banner
|
@ -18,7 +18,7 @@ NODES = [
|
|||||||
"name": "shredder",
|
"name": "shredder",
|
||||||
"host": "38.102.127.171",
|
"host": "38.102.127.171",
|
||||||
"ssh_user": "doc",
|
"ssh_user": "doc",
|
||||||
"services": [],
|
"services": ["minio.service"],
|
||||||
"disks": ["/", "/mnt/raid5"],
|
"disks": ["/", "/mnt/raid5"],
|
||||||
"type": "remote",
|
"type": "remote",
|
||||||
"db": False,
|
"db": False,
|
||||||
@ -48,7 +48,7 @@ NODES = [
|
|||||||
"name": "db2",
|
"name": "db2",
|
||||||
"host": "cluster.db2.genesishostingtechnologies.com",
|
"host": "cluster.db2.genesishostingtechnologies.com",
|
||||||
"ssh_user": "doc",
|
"ssh_user": "doc",
|
||||||
"services": ["postgresql@16-postgresqlreplica.service"],
|
"services": ["postgresql@16-main.service"],
|
||||||
"disks": ["/", "/var/lib/postgresql"],
|
"disks": ["/", "/var/lib/postgresql"],
|
||||||
"type": "remote",
|
"type": "remote",
|
||||||
"db": True,
|
"db": True,
|
||||||
|
131
miscellaneous/dbv1.sh
Executable file
131
miscellaneous/dbv1.sh
Executable file
@ -0,0 +1,131 @@
|
|||||||
|
#!/bin/bash
#
# Dump each database in DBS from the remote PostgreSQL host, restore it into a
# scratch database on the local server to verify it, and upload the verified
# dumps to the backup targets. Failures are reported via mastodon_alert.

# ---- CONFIG ----
PG_REMOTE_USER="postgres"
PG_REMOTE_HOST="cluster.db2.genesishostingtechnologies.com"
PG_REMOTE_PORT="5432"
PG_LOCAL_PORT="5432"
DUMP_DIR="/tmp/pgbackup_verify"
BACKUP_TARGET="root@backup.sshjunkie.com:/mnt/backup/pgdumps"
CC_TARGET="doc@clustercontrol.sshjunkie.com:/home/doc/backups"
DBS=("mastodon_production" "giteaprod" "hostingtootdb" "radiotootdb")

# The dumps are full copies of production databases and DUMP_DIR lives under
# /tmp, so make everything this script creates readable by its owner only.
umask 077

# Without the dump directory nothing below can work (the log file lives there
# too), so stop immediately instead of failing once per database.
if ! mkdir -p "$DUMP_DIR"; then
  echo "[FAIL] Cannot create dump directory $DUMP_DIR" >&2
  exit 1
fi

LOGFILE="$DUMP_DIR/verify_log_$(date +%Y%m%d_%H%M%S).txt"
|
||||||
|
|
||||||
|
# ==== Mastodon DM function ====
#######################################
# Post an alert as a direct-visibility status on the chatwithus.live instance.
# Globals:   MASTODON_TOKEN (read, optional) - API bearer token. When unset,
#            the token that was historically embedded in this script is used.
# Arguments: $1 - message text (URL-encoded by curl)
# Outputs:   nothing on stdout; curl diagnostics go to stderr
# Returns:   curl's exit status (non-zero on network error, timeout or an
#            HTTP error response)
#######################################
mastodon_alert() {
  local msg="$1"
  # NOTE(review): the fallback token below is committed to the repository.
  # Rotate it and provide MASTODON_TOKEN via the environment instead.
  local token="${MASTODON_TOKEN:-rimxBLi-eaJAcwagkmoj6UoW7Lc473tQY0cOM041Euw}"

  # --fail makes HTTP 4xx/5xx visible as a non-zero exit status; the timeouts
  # keep an unreachable instance from stalling the whole backup run.
  curl -sS --fail --connect-timeout 10 --max-time 30 \
    -X POST "https://chatwithus.live/api/v1/statuses" \
    -H "Authorization: Bearer ${token}" \
    --data-urlencode "status=$msg" \
    --data "visibility=direct" \
    --data "in_reply_to_account_id=114386383616633367" >/dev/null
}
|
||||||
|
|
||||||
|
# Overall result of the run and the dumps that passed every check.
ALL_OK=true
UPLOAD_LIST=()

#######################################
# Record a verification failure: log it, mark the run failed, send the alert.
# Globals:   ALL_OK (written), LOGFILE (read)
# Arguments: $1 - line to write to stdout and the log
#            $2 - reason text placed in the alert message
#######################################
fail_db() {
  echo "$1" | tee -a "$LOGFILE"
  ALL_OK=false
  mastodon_alert "🚨 Database backup/verify FAILED: $2 on $(hostname) at $(date). See log: $LOGFILE"
}

#######################################
# Restore a dump into an existing scratch database and run the sanity checks.
# Globals:   PG_LOCAL_PORT, LOGFILE (read); ALL_OK (written via fail_db)
# Arguments: $1 - source database name (for messages)
#            $2 - dump file path
#            $3 - checksum file recorded for the dump
#            $4 - scratch database name
# Returns:   0 when every check passed, 1 otherwise
#######################################
check_restored_dump() {
  local db="$1" dumpfile="$2" checksum_file="$3" testdb="$4"
  local row_count dt_status

  # Restore the dump
  # NOTE(review): psql exits 0 even if individual statements in the dump fail
  # unless ON_ERROR_STOP is set; consider adding `-v ON_ERROR_STOP=1` once the
  # dumps are known to restore cleanly on this server.
  echo "Restoring to $testdb" | tee -a "$LOGFILE"
  if ! sudo -u postgres psql -p "$PG_LOCAL_PORT" -d "$testdb" < "$dumpfile"; then
    fail_db "[FAIL] Restore failed for $db!" "Restore failed for $db"
    return 1
  fi

  # Sanity check: check row count of a critical table (e.g., 'users' table)
  row_count=$(sudo -u postgres psql -p "$PG_LOCAL_PORT" -d "$testdb" -t -c "SELECT count(*) FROM users;")
  row_count="${row_count//[[:space:]]/}" # psql -t pads the value with blanks
  echo "Row count for 'users' table in $testdb: $row_count" | tee -a "$LOGFILE"

  if [[ ! "$row_count" =~ ^[0-9]+$ ]]; then
    # The query produced no number (e.g. the table does not exist in this
    # database). Previously this made `[ -le ]` error out and log a false
    # "[PASS]"; the run still continues, but the log now says what happened.
    echo "[WARN] Could not read row count for 'users' table in $db; skipping row-count check." | tee -a "$LOGFILE"
  elif [ "$row_count" -le 0 ]; then
    fail_db "[FAIL] No rows in 'users' table after restore!" "No rows found in 'users' table after restore for $db"
    return 1
  else
    echo "[PASS] Row count OK for 'users' table in $db." | tee -a "$LOGFILE"
  fi

  # Verify checksum: re-hash the dump and compare with the value recorded right
  # after dumping (comparing the checksum file with itself can never fail).
  if ! sha256sum --status -c "$checksum_file"; then
    fail_db "[FAIL] Checksum mismatch for $db!" "Checksum mismatch for $db"
    return 1
  fi
  echo "[PASS] Checksum verified for $db." | tee -a "$LOGFILE"

  # Quick table listing for sanity. $? after a pipeline is tee's status, so
  # take psql's own exit code from PIPESTATUS.
  sudo -u postgres psql -p "$PG_LOCAL_PORT" -d "$testdb" -c "\dt" | tee -a "$LOGFILE"
  dt_status=${PIPESTATUS[0]}
  if [ "$dt_status" -ne 0 ]; then
    fail_db "[FAIL] $db: Test query failed!" "Test query failed for $db"
    return 1
  fi

  echo "[PASS] $db: Dump and restore OK." | tee -a "$LOGFILE"
  return 0
}

#######################################
# Dump one remote database, verify it in a scratch database and, on success,
# queue the dump for upload.
# Globals:   PG_REMOTE_HOST, PG_REMOTE_PORT, PG_REMOTE_USER, PG_LOCAL_PORT,
#            DUMP_DIR, LOGFILE (read); UPLOAD_LIST, ALL_OK (written)
# Arguments: $1 - database name
# Returns:   0 when the dump was verified, 1 otherwise
#######################################
verify_database() {
  local db="$1"
  local dumpfile checksum_file testdb
  local status=0

  echo "=== [$(date)] Dumping $db from $PG_REMOTE_HOST ===" | tee -a "$LOGFILE"
  dumpfile="$DUMP_DIR/${db}_$(date +%Y%m%d_%H%M%S).sql"

  # Dump from remote
  if ! pg_dump -h "$PG_REMOTE_HOST" -p "$PG_REMOTE_PORT" -U "$PG_REMOTE_USER" -d "$db" > "$dumpfile"; then
    fail_db "[FAIL] Failed to dump $db! Skipping upload." "Could not dump $db from $PG_REMOTE_HOST"
    return 1
  fi

  # Generate checksum for the dump file
  checksum_file="$dumpfile.sha256"
  sha256sum "$dumpfile" > "$checksum_file"

  # Restore/verify on Krang
  testdb="verify_${db}_$RANDOM"
  echo "Creating test database $testdb" | tee -a "$LOGFILE"
  if ! sudo -u postgres createdb -p "$PG_LOCAL_PORT" "$testdb"; then
    fail_db "[FAIL] Failed to create $testdb!" "Could not create test DB $testdb"
    return 1
  fi

  if check_restored_dump "$db" "$dumpfile" "$checksum_file" "$testdb"; then
    UPLOAD_LIST+=("$dumpfile")
  else
    status=1
  fi

  # Single cleanup point: the scratch database is dropped on every path, so a
  # failed check no longer leaves verify_* databases behind.
  sudo -u postgres dropdb -p "$PG_LOCAL_PORT" "$testdb"
  echo "Cleaned up $testdb" | tee -a "$LOGFILE"
  echo "" | tee -a "$LOGFILE"
  return "$status"
}

for DB in "${DBS[@]}"; do
  verify_database "$DB"
done
|
||||||
|
|
||||||
|
# Upload phase: runs only when every database in DBS produced a verified dump.
# The script's exit status reflects the outcome so cron/systemd or a wrapper
# can detect a failed run (previously it always exited 0).
EXIT_CODE=0

if [[ "$ALL_OK" == true ]] && [ "${#UPLOAD_LIST[@]}" -eq "${#DBS[@]}" ]; then
  echo "All dumps verified, sending to $BACKUP_TARGET" | tee -a "$LOGFILE"
  if scp "${UPLOAD_LIST[@]}" "$BACKUP_TARGET"; then
    echo "Uploads to thevault successful." | tee -a "$LOGFILE"
    # --NEW: Also upload to ClusterControl controller
    echo "Uploading to ClusterControl controller at $CC_TARGET" | tee -a "$LOGFILE"
    if scp "${UPLOAD_LIST[@]}" "$CC_TARGET"; then
      echo "Uploads to ClusterControl successful." | tee -a "$LOGFILE"
      # Local copies are only removed once both targets hold the dumps; the
      # sidecar .sha256 files go with them so they do not pile up in DUMP_DIR.
      rm -f -- "${UPLOAD_LIST[@]}" "${UPLOAD_LIST[@]/%/.sha256}"
    else
      echo "[WARN] Upload to ClusterControl controller failed!" | tee -a "$LOGFILE"
      mastodon_alert "⚠️ Database backup verified, but upload to ClusterControl at $CC_TARGET failed on $(hostname) at $(date). See log: $LOGFILE"
      EXIT_CODE=1
    fi
  else
    echo "[FAIL] Upload to thevault failed!" | tee -a "$LOGFILE"
    mastodon_alert "🚨 Database backup/verify FAILED: Upload to $BACKUP_TARGET failed on $(hostname) at $(date). See log: $LOGFILE"
    EXIT_CODE=1
  fi
else
  echo "Not all backups verified! Nothing uploaded." | tee -a "$LOGFILE"
  mastodon_alert "🚨 Database backup/verify FAILED: One or more DBs failed verification on $(hostname) at $(date). See log: $LOGFILE"
  EXIT_CODE=1
fi

echo "DONE. Log: $LOGFILE"
exit "$EXIT_CODE"
|
Loading…
x
Reference in New Issue
Block a user