Auto commit from /home/doc/genesis-tools
This commit is contained in:
parent
4924d79c07
commit
16fd65acc4
File diff suppressed because it is too large
Load Diff
58
miscellaneous/dbcheck.log
Normal file
58
miscellaneous/dbcheck.log
Normal file
@ -0,0 +1,58 @@
|
||||
WARNING: password file "/home/doc/.pgpass" has group or world access; permissions should be u=rw (0600) or less
|
||||
WARNING: password file "/home/doc/.pgpass" has group or world access; permissions should be u=rw (0600) or less
|
||||
🚨 Genesis Radio Multi-Node Healthcheck 2025-04-26 09:00:16 🚨
|
||||
[db1] ERROR: SMART status unknown on /dev/sda. Output:
|
||||
Exception (client): Error reading SSH protocol banner
|
||||
Traceback (most recent call last):
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2369, in _check_banner
|
||||
buf = self.packetizer.readline(timeout)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 395, in readline
|
||||
buf += self._read_timeout(timeout)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 665, in _read_timeout
|
||||
raise EOFError()
|
||||
EOFError
|
||||
|
||||
During handling of the above exception, another exception occurred:
|
||||
|
||||
Traceback (most recent call last):
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2185, in run
|
||||
self._check_banner()
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2373, in _check_banner
|
||||
raise SSHException(
|
||||
paramiko.ssh_exception.SSHException: Error reading SSH protocol banner
|
||||
|
||||
Traceback (most recent call last):
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2369, in _check_banner
|
||||
buf = self.packetizer.readline(timeout)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 395, in readline
|
||||
buf += self._read_timeout(timeout)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/packet.py", line 665, in _read_timeout
|
||||
raise EOFError()
|
||||
EOFError
|
||||
|
||||
During handling of the above exception, another exception occurred:
|
||||
|
||||
Traceback (most recent call last):
|
||||
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 251, in <module>
|
||||
main()
|
||||
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 218, in main
|
||||
res = check_remote_disk(node["host"], node["ssh_user"], disk, node["name"])
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 85, in check_remote_disk
|
||||
out = ssh_command(host, user, cmd)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/doc/genesis-tools/miscellaneous/dbcheck1.py", line 76, in ssh_command
|
||||
ssh.connect(hostname=host, username=user, timeout=10)
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/client.py", line 451, in connect
|
||||
t.start_client(timeout=timeout)
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 773, in start_client
|
||||
raise e
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2185, in run
|
||||
self._check_banner()
|
||||
File "/home/doc/dbcheck/lib/python3.12/site-packages/paramiko/transport.py", line 2373, in _check_banner
|
||||
raise SSHException(
|
||||
paramiko.ssh_exception.SSHException: Error reading SSH protocol banner
|
@ -18,7 +18,7 @@ NODES = [
|
||||
"name": "shredder",
|
||||
"host": "38.102.127.171",
|
||||
"ssh_user": "doc",
|
||||
"services": [],
|
||||
"services": ["minio.service"],
|
||||
"disks": ["/", "/mnt/raid5"],
|
||||
"type": "remote",
|
||||
"db": False,
|
||||
@ -48,7 +48,7 @@ NODES = [
|
||||
"name": "db2",
|
||||
"host": "cluster.db2.genesishostingtechnologies.com",
|
||||
"ssh_user": "doc",
|
||||
"services": ["postgresql@16-postgresqlreplica.service"],
|
||||
"services": ["postgresql@16-main.service"],
|
||||
"disks": ["/", "/var/lib/postgresql"],
|
||||
"type": "remote",
|
||||
"db": True,
|
||||
|
131
miscellaneous/dbv1.sh
Executable file
131
miscellaneous/dbv1.sh
Executable file
@ -0,0 +1,131 @@
|
||||
#!/bin/bash
#
# Dumps each configured database from the remote PostgreSQL host, restores it
# into a scratch database on the local instance as a verification step, and
# uploads the dumps once all of them have been verified.

# ---- CONFIG ----
# Connection details of the remote server that pg_dump reads from.
PG_REMOTE_USER="postgres"
PG_REMOTE_HOST="cluster.db2.genesishostingtechnologies.com"
PG_REMOTE_PORT="5432"
# Port of the local PostgreSQL instance used for the test restore.
PG_LOCAL_PORT="5432"
# Working directory for dump files, checksum files and the run log.
# NOTE(review): this is a fixed path under /tmp that holds full database
# dumps; consider a private directory (mode 0700) outside /tmp.
DUMP_DIR="/tmp/pgbackup_verify"
# scp destinations for verified dumps.
BACKUP_TARGET="root@backup.sshjunkie.com:/mnt/backup/pgdumps"
CC_TARGET="doc@clustercontrol.sshjunkie.com:/home/doc/backups"
# Databases to dump and verify.
DBS=("mastodon_production" "giteaprod" "hostingtootdb" "radiotootdb")
# One log file per run, named after the start time.
LOGFILE="$DUMP_DIR/verify_log_$(date +%Y%m%d_%H%M%S).txt"

# Everything below writes into DUMP_DIR (log, dumps, checksums). Stop right
# away if it cannot be created instead of failing on every later step.
if ! mkdir -p "$DUMP_DIR"; then
  echo "[FAIL] Cannot create dump directory $DUMP_DIR" >&2
  exit 1
fi
|
||||
|
||||
# ==== Mastodon DM function ====
|
||||
#######################################
# Post an alert message as a direct-visibility status through the Mastodon
# statuses API at chatwithus.live.
# Globals:   MASTODON_ALERT_TOKEN (read, optional) - bearer token; when unset
#            or empty the token committed with this script is used.
# Arguments: $1 - message text
# Outputs:   nothing on stdout; curl error messages go to stderr
# Returns:   curl's exit status (non-zero on transport errors, HTTP >= 400,
#            or timeout)
#######################################
mastodon_alert() {
  local msg="$1"
  # NOTE(review): the fallback token below is stored in version control and
  # should be treated as leaked - rotate it and supply the replacement via
  # MASTODON_ALERT_TOKEN. It is kept only so existing invocations keep working.
  local token="${MASTODON_ALERT_TOKEN:-rimxBLi-eaJAcwagkmoj6UoW7Lc473tQY0cOM041Euw}"

  # --fail turns HTTP error responses into a non-zero exit (reported on stderr
  # thanks to -S) instead of a silent success; --max-time keeps an unreachable
  # API from stalling the backup run.
  curl -sS --fail --max-time 30 -X POST "https://chatwithus.live/api/v1/statuses" \
    -H "Authorization: Bearer ${token}" \
    --data-urlencode "status=$msg" \
    --data "visibility=direct" \
    --data "in_reply_to_account_id=114386383616633367" >/dev/null
}
|
||||
|
||||
# Overall result of the run; set to false by the first failed check.
ALL_OK=true
# Dump files that passed every check and are ready to be uploaded.
UPLOAD_LIST=()

# Copy stdin to stdout and append it to the run log.
log_tee() {
  tee -a "$LOGFILE"
}

#######################################
# Record a verification failure: log it, mark the run as failed, send an alert.
# Globals:   ALL_OK (written), LOGFILE (read)
# Arguments: $1 - line to write to the log
#            $2 - reason inserted into the alert message
#######################################
record_failure() {
  echo "$1" | log_tee
  ALL_OK=false
  mastodon_alert "🚨 Database backup/verify FAILED: $2 on $(hostname) at $(date). See log: $LOGFILE"
}

# Drop the scratch database $1 on the local instance and log the cleanup.
drop_test_db() {
  sudo -u postgres dropdb -p "$PG_LOCAL_PORT" "$1"
  echo "Cleaned up $1" | log_tee
}

#######################################
# Dump one database from the remote host, restore it into a scratch database
# on the local instance, run the sanity checks and queue the dump for upload.
# Globals:   PG_REMOTE_HOST, PG_REMOTE_PORT, PG_REMOTE_USER, PG_LOCAL_PORT,
#            DUMP_DIR, LOGFILE (read); ALL_OK, UPLOAD_LIST (written)
# Arguments: $1 - database name
# Outputs:   progress lines on stdout and in LOGFILE
# Returns:   0 when the dump passed every check, 1 otherwise
#######################################
verify_db() {
  local db="$1"
  local dumpfile checksum_file testdb row_count_raw row_count query_status

  echo "=== [$(date)] Dumping $db from $PG_REMOTE_HOST ===" | log_tee
  dumpfile="$DUMP_DIR/${db}_$(date +%Y%m%d_%H%M%S).sql"

  # Dump from remote
  if ! pg_dump -h "$PG_REMOTE_HOST" -p "$PG_REMOTE_PORT" -U "$PG_REMOTE_USER" -d "$db" > "$dumpfile"; then
    record_failure "[FAIL] Failed to dump $db! Skipping upload." \
      "Could not dump $db from $PG_REMOTE_HOST"
    return 1
  fi

  # Record the checksum of the dump as it was written by pg_dump.
  checksum_file="$dumpfile.sha256"
  sha256sum "$dumpfile" > "$checksum_file"

  # Restore/verify on the local instance, in a uniquely named scratch database.
  testdb="verify_${db}_$RANDOM"
  echo "Creating test database $testdb" | log_tee
  if ! sudo -u postgres createdb -p "$PG_LOCAL_PORT" "$testdb"; then
    record_failure "[FAIL] Failed to create $testdb!" \
      "Could not create test DB $testdb"
    return 1
  fi

  # Restore the dump.
  # NOTE(review): without "-v ON_ERROR_STOP=1" psql keeps going after SQL
  # errors and still exits 0, so this check mainly catches connection or
  # startup failures. Left as is because enabling it could fail restores that
  # currently pass (e.g. on missing roles) - confirm before tightening.
  echo "Restoring to $testdb" | log_tee
  if ! sudo -u postgres psql -p "$PG_LOCAL_PORT" -d "$testdb" < "$dumpfile"; then
    record_failure "[FAIL] Restore failed for $db!" "Restore failed for $db"
    drop_test_db "$testdb"
    return 1
  fi

  # Sanity check: row count of the 'users' table.
  row_count_raw=$(sudo -u postgres psql -p "$PG_LOCAL_PORT" -d "$testdb" -t -c "SELECT count(*) FROM users;")
  query_status=$?
  echo "Row count for 'users' table in $testdb: $row_count_raw" | log_tee
  # psql -t pads its output with whitespace; strip it before comparing.
  row_count="${row_count_raw//[[:space:]]/}"

  if (( query_status != 0 )) || ! [[ "$row_count" =~ ^[0-9]+$ ]]; then
    # The query failed or returned no number (for example the database has no
    # 'users' table). This used to be logged as "[PASS]" because the integer
    # test errored out; say what really happened and skip the check.
    echo "[WARN] Could not determine row count of 'users' table in $db; skipping row-count check." | log_tee
  elif (( 10#$row_count == 0 )); then
    record_failure "[FAIL] No rows in 'users' table after restore!" \
      "No rows found in 'users' table after restore for $db"
    drop_test_db "$testdb"
    return 1
  else
    echo "[PASS] Row count OK for 'users' table in $db." | log_tee
  fi

  # Verify checksum: re-hash the dump and compare it with the checksum that
  # was recorded right after pg_dump, so a dump that changed in between is
  # caught before upload.
  if ! sha256sum -c "$checksum_file" >/dev/null; then
    record_failure "[FAIL] Checksum mismatch for $db!" "Checksum mismatch for $db"
    drop_test_db "$testdb"
    return 1
  fi
  echo "[PASS] Checksum verified for $db." | log_tee

  # Quick table listing for sanity. PIPESTATUS[0] is psql's own exit status;
  # plain $? would report the status of the logging side of the pipe.
  sudo -u postgres psql -p "$PG_LOCAL_PORT" -d "$testdb" -c "\dt" | log_tee
  query_status=${PIPESTATUS[0]}
  if (( query_status == 0 )); then
    echo "[PASS] $db: Dump and restore OK." | log_tee
    UPLOAD_LIST+=("$dumpfile")
  else
    record_failure "[FAIL] $db: Test query failed!" "Test query failed for $db"
  fi

  drop_test_db "$testdb"
  echo "" | log_tee

  (( query_status == 0 ))
}

# A failed database is recorded in ALL_OK by verify_db; keep going so every
# database gets checked and reported.
for DB in "${DBS[@]}"; do
  verify_db "$DB"
done
|
||||
|
||||
# ---- Upload phase ----
# Dumps are uploaded only when the run is still marked OK and one dump was
# queued per configured database. The primary target is tried first; the
# ClusterControl copy is attempted only after that succeeded, and the local
# dump files are removed only after both uploads worked.

# Print one message and append it to the run log.
log_msg() {
  echo "$1" | tee -a "$LOGFILE"
}

if ! { $ALL_OK && [ "${#UPLOAD_LIST[@]}" -eq "${#DBS[@]}" ]; }; then
  log_msg "Not all backups verified! Nothing uploaded."
  mastodon_alert "🚨 Database backup/verify FAILED: One or more DBs failed verification on $(hostname) at $(date). See log: $LOGFILE"
else
  log_msg "All dumps verified, sending to $BACKUP_TARGET"
  if ! scp "${UPLOAD_LIST[@]}" "$BACKUP_TARGET"; then
    log_msg "[FAIL] Upload to thevault failed!"
    mastodon_alert "🚨 Database backup/verify FAILED: Upload to $BACKUP_TARGET failed on $(hostname) at $(date). See log: $LOGFILE"
  else
    log_msg "Uploads to thevault successful."
    # Second copy goes to the ClusterControl controller.
    log_msg "Uploading to ClusterControl controller at $CC_TARGET"
    if scp "${UPLOAD_LIST[@]}" "$CC_TARGET"; then
      log_msg "Uploads to ClusterControl successful."
      rm -f "${UPLOAD_LIST[@]}"
    else
      # The dumps stay in place when this second upload fails.
      log_msg "[WARN] Upload to ClusterControl controller failed!"
      mastodon_alert "⚠️ Database backup verified, but upload to ClusterControl at $CC_TARGET failed on $(hostname) at $(date). See log: $LOGFILE"
    fi
  fi
fi

echo "DONE. Log: $LOGFILE"
|
Loading…
x
Reference in New Issue
Block a user