Auto-commit from giteapush.sh at 2025-05-19 16:53:23

This commit is contained in:
DocTator 2025-05-19 16:53:23 -04:00
parent b948144f71
commit 66dd79429b
2 changed files with 364 additions and 0 deletions

View File

@ -0,0 +1,76 @@
# Postmortem: SPL Media Disk Incident and Disaster Recovery Drill
**Date:** 05/19/2025
**Author:** Doc (Genesis Hosting)
---
## Summary
On 05/19/2025, while attempting to remove a deprecated RAID5 drive from the SPL Windows host, the wrong disk was accidentally detached. That disk held the live SPL media volume. Because of the way Windows handles dynamic disks, the volume was marked as "Failed" and became inaccessible, triggering an immediate DR response.
Despite the unintentional nature of the incident, it served as a live test of Genesis Hosting's SPL disaster recovery process. The full restore was completed successfully in under an hour using tarball-based SCP transfer from Shredder, validating both the local snapshot source and DR scripting approach.
---
## Timeline
- **T-0 (Start):** Attempt made to remove deprecated RAID5 disk
- **T+0:** Incorrect disk unplugged (live SPL media)
- **T+2m:** Disk appears in Windows as "Missing/Failed"
- **T+5m:** SCP-based restore initiated from Shredder
- **T+10m:** `.zfs` snapshot artifact detected and ignored
- **T+15m:** Decision made to continue full tarball-based SCP restore
- **T+58m:** Restore completed to `R:\`; SPL resumed normal operation
---
## Impact
- SPL station was temporarily offline (estimated downtime < 1 hour)
- No data was lost
- No external users were affected due to off-air timing
---
## Root Cause
Human error during manual drive removal in a mixed-disk environment where Windows showed multiple 5TB drives.
---
## Resolution
- Restore initiated from validated ZFS source (Shredder)
- SCP-based tarball transfer completed
- Permissions and structure preserved
- SPL fully restored to operational state
---
## Lessons Learned
1. Windows dynamic disks are fragile and easily corrupted by hot-unplug events
2. SCP is reliable but not optimal for large restores
3. `.zfs` snapshot visibility can interfere with SCP unless explicitly excluded
4. Tarball-based transfers dramatically reduce restore time
5. Disaster recovery scripts should log and time every phase
---
## Action Items
- [x] Set up secondary disk on SPL host for test restores
- [x] Begin alternating restore tests from Shredder and Linode Object Storage
- [x] Convert restore flow to tarball-based for faster execution
- [ ] Formalize `genesisctl drill` command for DR testing
- [ ] Add timed logging to all DR scripts
- [ ] Expand approach to AzuraCast and Mastodon (in progress)
---
## Conclusion
While the incident began as a misstep, it evolved into a high-value test of Genesis Hosting's disaster recovery capabilities. The successful, timely restore validated the core backup architecture and highlighted key improvements to be made in automation, speed, and DR testing processes moving forward.
This will serve as Drill #1 in the GenesisOps DR series, codename: **Sterling Forest**.

288
miscellaneous/vps/genesisctlv3.sh Executable file
View File

@ -0,0 +1,288 @@
#!/usr/bin/env bash
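# genesisctl - Genesis VPS provisioning and management CLI (Linode API v4)
# Usage:
# genesisctl provision <label> <region> <type> <image> [root_pass]
# genesisctl reboot <linode-id>
# genesisctl list regions|types|images   (documented, but not handled by the dispatcher in this file)
# genesisctl ultra <label> [root_pass]
# genesisctl safe <label> [root_pass]
# genesisctl micro <label> [root_pass]
# genesisctl mastodon <label> [root_pass]
# genesisctl destroy <label>
# genesisctl backup <label>
# genesisctl disable-backup <label>
# genesisctl status <label>
# genesisctl listvps
# genesisctl resize <label> <new-type>
# genesisctl safe-create <pool>/<dataset>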
# Linode API token. It is read from the environment so that no secret lives in
# the repository; when the variable is unset it falls back to a placeholder
# value, which provision_vps detects and rejects.
# SECURITY: any token that was ever hard-coded on this line must be treated as
# leaked and revoked/rotated in the Linode account.
LINODE_API_TOKEN="${LINODE_API_TOKEN:-REPLACE_WITH_YOUR_LINODE_API_TOKEN}"

# Package presets: region / plan type / image used by the shortcut commands
# (ultra, safe, micro, mastodon).
readonly PACKAGE_ULTRA_REGION="us-east"
readonly PACKAGE_ULTRA_TYPE="g6-dedicated-4"
readonly PACKAGE_ULTRA_IMAGE="linode/ubuntu22.04"

readonly PACKAGE_SAFE_REGION="us-east"
readonly PACKAGE_SAFE_TYPE="g6-standard-2"
readonly PACKAGE_SAFE_IMAGE="linode/ubuntu22.04"

readonly PACKAGE_MICRO_REGION="us-east"
readonly PACKAGE_MICRO_TYPE="g6-nanode-1"
readonly PACKAGE_MICRO_IMAGE="linode/ubuntu22.04"

readonly PACKAGE_MASTODON_REGION="us-east"
readonly PACKAGE_MASTODON_TYPE="g6-standard-4"
readonly PACKAGE_MASTODON_IMAGE="linode/ubuntu22.04"
#######################################
# Create a Linode instance and print its label, IP address and root password.
# Globals:   LINODE_API_TOKEN (read)
# Arguments: $1 - label
#            $2 - region
#            $3 - plan type
#            $4 - image
#            $5 - root password (optional; generated with openssl if omitted)
# Outputs:   progress, the JSON request payload (debug), the raw API response
#            and a summary, all on stdout. Note the payload and the summary
#            both contain the root password.
# Exits:     1 if the token is still the placeholder, the temp file cannot be
#            created, or the API answers with anything other than 200/201.
#######################################
provision_vps() {
  local label="$1"
  local region="$2"
  local type="$3"
  local image="$4"
  local root_pass user_data json_payload tmp_file http_status ip

  if [[ "$LINODE_API_TOKEN" == "REPLACE_WITH_YOUR_LINODE_API_TOKEN" ]]; then
    echo "❌ Error: You must set your LINODE_API_TOKEN at the top of this script."
    exit 1
  fi

  root_pass="${5:-$(openssl rand -base64 16)}"

  # cloud-init user data, sent base64-encoded on a single line.
  user_data=$(printf '%s\n' \
    '#cloud-config' \
    'packages:' \
    '- rsync' \
    '- fail2ban' \
    'runcmd:' \
    '- curl -s https://help.sshjunkie.com/scripts/genesis-bootstrap.sh | bash' \
    | base64 -w 0)

  echo "Provisioning VPS '$label' in $region with type $type and image $image..."
  echo "[DEBUG] Using API token prefix: ${LINODE_API_TOKEN:0:8}********"

  # Let jq build the request body so quotes/backslashes in any argument are
  # escaped correctly instead of corrupting the JSON.
  json_payload=$(jq -n \
    --arg label "$label" \
    --arg region "$region" \
    --arg type "$type" \
    --arg image "$image" \
    --arg root_pass "$root_pass" \
    --arg user_data "$user_data" \
    '{
      label: $label,
      region: $region,
      type: $type,
      image: $image,
      authorized_users: [],
      root_pass: $root_pass,
      booted: true,
      metadata: { user_data: $user_data }
    }')

  # Printed only after the payload exists (it used to be echoed while empty).
  echo "[DEBUG] JSON Payload to send:"
  echo "$json_payload"

  tmp_file=$(mktemp) || {
    echo "❌ Error: could not create a temporary file."
    exit 1
  }

  http_status=$(curl -s -o "$tmp_file" -w "%{http_code}" -X POST \
    https://api.linode.com/v4/linode/instances \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $LINODE_API_TOKEN" \
    -d "$json_payload")

  printf '\n--- HTTP STATUS: %s ---\n' "$http_status"
  echo "--- RAW RESPONSE: ---"
  cat "$tmp_file"

  if [[ "$http_status" != "200" && "$http_status" != "201" ]]; then
    printf '\n❌ Failed to provision VPS (HTTP %s)\n' "$http_status"
    jq . "$tmp_file"
    # Clean up on the failure path too, so the response file is not leaked.
    rm -f -- "$tmp_file"
    exit 1
  fi

  printf '\n✅ VPS provisioned:\n'
  ip=$(jq -r '.ipv4[0]' "$tmp_file")
  rm -f -- "$tmp_file"

  echo "Label: $label"
  echo "IP Address: $ip"
  echo "Root Password: $root_pass"
}
#######################################
# Reboot a Linode instance and pretty-print the API response.
# Globals:   LINODE_API_TOKEN (read)
# Arguments: $1 - numeric Linode ID
# Outputs:   progress line and the API response (via jq) on stdout
# Exits:     1 if the ID is missing or not numeric (no request is sent)
#######################################
reboot_vps() {
  local linode_id="$1"

  # Refuse to call the API with an empty/garbled ID, which would otherwise
  # produce a malformed URL such as .../instances//reboot.
  if [[ ! "$linode_id" =~ ^[0-9]+$ ]]; then
    echo "❌ Usage: genesisctl reboot <linode-id> (a numeric Linode ID is required)"
    exit 1
  fi

  echo "Rebooting Linode VPS ID $linode_id..."
  curl -s -X POST "https://api.linode.com/v4/linode/instances/${linode_id}/reboot" \
    -H "Authorization: Bearer $LINODE_API_TOKEN" \
    | jq .
}
#######################################
# Look up a Linode by label and, after interactive confirmation, delete it.
# Globals:   LINODE_API_TOKEN (read)
# Arguments: $1 - instance label
# Input:     one line on stdin; only "y" or "Y" confirms the deletion
# Outputs:   progress and result messages on stdout
# Returns:   0 when the instance was destroyed or the user cancelled
# Exits:     1 when the label is not found, the lookup result is not a single
#            numeric ID, or the DELETE request does not answer 204
#######################################
destroy_vps_by_label() {
  local label="$1"
  local linode_id http_status confirm

  echo "Looking for VPS with label '$label'..."
  # NOTE(review): the list request carries no pagination parameters; confirm
  # the account never has more instances than one page of results.
  linode_id=$(curl -s -H "Authorization: Bearer $LINODE_API_TOKEN" \
    https://api.linode.com/v4/linode/instances \
    | jq -r --arg LABEL "$label" '.data[] | select(.label == $LABEL) | .id')

  if [[ -z "$linode_id" ]]; then
    echo "Error: No Linode found with label '$label'"
    exit 1
  fi

  # This is a destructive call: insist on exactly one numeric ID so a
  # multi-line or garbled lookup result can never end up in the DELETE URL.
  if [[ ! "$linode_id" =~ ^[0-9]+$ ]]; then
    echo "Error: Lookup for label '$label' did not return a single numeric ID"
    exit 1
  fi

  read -r -p "Are you sure you want to destroy VPS '$label' (ID: $linode_id)? [y/N] " confirm
  if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
    echo "Cancelled. VPS '$label' not destroyed."
    return 0
  fi

  echo "Destroying Linode with ID $linode_id (label: $label)..."
  http_status=$(curl -s -o /dev/null -w "%{http_code}" -X DELETE \
    "https://api.linode.com/v4/linode/instances/${linode_id}" \
    -H "Authorization: Bearer $LINODE_API_TOKEN")

  if [[ "$http_status" != "204" ]]; then
    echo "❌ Failed to destroy VPS. HTTP status: $http_status"
    # Propagate the failure instead of finishing with status 0.
    exit 1
  fi
  echo "✅ Linode $label (ID $linode_id) has been destroyed."
}
#######################################
# Enable the Linode backup service for the instance with the given label.
# Globals:   LINODE_API_TOKEN (read)
# Arguments: $1 - instance label
# Outputs:   progress and result messages on stdout
# Exits:     1 when the label is not found or the API does not answer 200
#######################################
enable_backups_by_label() {
  local label="$1"
  local linode_id http_status

  linode_id=$(curl -s -H "Authorization: Bearer $LINODE_API_TOKEN" \
    https://api.linode.com/v4/linode/instances \
    | jq -r --arg LABEL "$label" '.data[] | select(.label == $LABEL) | .id')

  if [[ -z "$linode_id" ]]; then
    echo "❌ No Linode found with label '$label'"
    exit 1
  fi

  echo "Enabling backups for Linode '$label' (ID: $linode_id)..."
  http_status=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
    "https://api.linode.com/v4/linode/instances/${linode_id}/backups/enable" \
    -H "Authorization: Bearer $LINODE_API_TOKEN")

  if [[ "$http_status" != "200" ]]; then
    echo "❌ Failed to enable backups (HTTP $http_status)"
    # Report the failure to the caller instead of finishing with status 0.
    exit 1
  fi
  echo "✅ Backups enabled for Linode $label."
}
#######################################
# Disable (cancel) the Linode backup service for the instance with the given
# label.
# Globals:   LINODE_API_TOKEN (read)
# Arguments: $1 - instance label
# Outputs:   progress and result messages on stdout
# Exits:     1 when the label is not found or the API does not answer 200
#######################################
disable_backups_by_label() {
  local label="$1"
  local linode_id http_status

  linode_id=$(curl -s -H "Authorization: Bearer $LINODE_API_TOKEN" \
    https://api.linode.com/v4/linode/instances \
    | jq -r --arg LABEL "$label" '.data[] | select(.label == $LABEL) | .id')

  if [[ -z "$linode_id" ]]; then
    echo "❌ No Linode found with label '$label'"
    exit 1
  fi

  echo "Disabling backups for Linode '$label' (ID: $linode_id)..."
  # NOTE(review): the endpoint path is kept exactly as in the original script;
  # confirm it against the current Linode API reference.
  http_status=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
    "https://api.linode.com/v4/linode/instances/${linode_id}/backups/disable" \
    -H "Authorization: Bearer $LINODE_API_TOKEN")

  if [[ "$http_status" != "200" ]]; then
    echo "❌ Failed to disable backups (HTTP $http_status)"
    # Report the failure to the caller instead of finishing with status 0.
    exit 1
  fi
  echo "✅ Backups disabled for Linode $label."
}
#######################################
# Print label, ID, region, type, status, first IPv4 address and creation time
# of the instance with the given label.
# Globals:   LINODE_API_TOKEN (read)
# Arguments: $1 - instance label
# Outputs:   one "Field: value" line per attribute on stdout
# Exits:     1 when no instance with that label is returned (previously the
#            function printed nothing and succeeded)
#######################################
status_vps() {
  local label="$1"
  local details

  details=$(curl -s -H "Authorization: Bearer $LINODE_API_TOKEN" \
    https://api.linode.com/v4/linode/instances \
    | jq -r --arg LABEL "$label" '
        .data[] | select(.label == $LABEL) |
        "Label: \(.label)\nID: \(.id)\nRegion: \(.region)\nType: \(.type)\nStatus: \(.status)\nIP: \(.ipv4[0])\nCreated: \(.created)"')

  if [[ -z "$details" ]]; then
    echo "❌ No Linode found with label '$label'"
    exit 1
  fi

  printf '%s\n' "$details"
}
#######################################
# Print a table of all instances returned by the Linode instance list call:
# label, ID, region, type, first IPv4 address and status.
# Globals:   LINODE_API_TOKEN (read)
# Arguments: none
# Outputs:   a header line followed by one formatted row per instance
#######################################
list_all_vps() {
  local -r instances_url="https://api.linode.com/v4/linode/instances"

  # The header is emitted unconditionally, even if the API returns no rows.
  printf '%s\n' "LABEL ID REGION TYPE IP STATUS"

  # jq emits tab-separated rows, column aligns them, awk applies fixed widths.
  curl -s -H "Authorization: Bearer $LINODE_API_TOKEN" "$instances_url" \
    | jq -r '.data[] | [.label, .id, .region, .type, .ipv4[0], .status] | @tsv' \
    | column -t -s $'\t' \
    | awk '{ printf "%-11s %-10s %-10s %-16s %-15s %s\n", $1, $2, $3, $4, $5, $6 }'
}
#######################################
# Resize the instance with the given label to a new plan type.
# Globals:   LINODE_API_TOKEN (read)
# Arguments: $1 - instance label
#            $2 - new plan type
# Outputs:   progress and result messages on stdout
# Exits:     1 when an argument is missing, the label is not found, or the
#            API does not answer 200
#######################################
resize_vps() {
  local label="$1"
  local new_type="$2"
  local linode_id payload http_status

  if [[ -z "$label" || -z "$new_type" ]]; then
    echo "❌ Usage: genesisctl resize <label> <new-type>"
    exit 1
  fi

  linode_id=$(curl -s -H "Authorization: Bearer $LINODE_API_TOKEN" \
    https://api.linode.com/v4/linode/instances \
    | jq -r --arg LABEL "$label" '.data[] | select(.label == $LABEL) | .id')

  if [[ -z "$linode_id" ]]; then
    echo "❌ No Linode found with label '$label'"
    exit 1
  fi

  echo "Resizing Linode '$label' to type '$new_type'..."

  # Build the body with jq so the type value is always valid, escaped JSON.
  payload=$(jq -n --arg type "$new_type" '{type: $type}')

  http_status=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $LINODE_API_TOKEN" \
    -d "$payload" \
    "https://api.linode.com/v4/linode/instances/${linode_id}/resize")

  if [[ "$http_status" != "200" ]]; then
    echo "❌ Failed to resize VPS. HTTP status: $http_status"
    # Report the failure to the caller instead of finishing with status 0.
    exit 1
  fi
  echo "✅ Linode $label resized to $new_type."
}
#######################################
# Ask the host "shredder" (over ssh) to run genesis-safe-zfs.sh for a dataset.
# Arguments: $1 - "<pool>/<dataset>" path; one trailing slash is ignored and
#                 everything after the first slash is the dataset name
# Outputs:   progress line on stdout, plus whatever the remote script prints
# Returns:   the exit status of ssh
# Exits:     1 when the path has no pool or no dataset part (previously a bare
#            "tank" was silently turned into pool "tank", dataset "tank")
#######################################
safe_create_dataset() {
  # Remove any trailing slash
  local fullpath="${1%/}"
  local pool dataset remote_cmd

  if [[ "$fullpath" != */* ]]; then
    echo "❌ Usage: genesisctl safe-create <pool>/<dataset>"
    exit 1
  fi

  pool="${fullpath%%/*}"
  dataset="${fullpath#*/}"
  if [[ -z "$pool" || -z "$dataset" ]]; then
    echo "❌ Usage: genesisctl safe-create <pool>/<dataset>"
    exit 1
  fi

  echo "🛰 Connecting to Shredder to safely create '${pool}/${dataset}'..."

  # ssh hands the command string to the remote shell, so each argument is
  # shell-quoted here to stop spaces or metacharacters being re-interpreted.
  printf -v remote_cmd '/usr/local/bin/genesis-safe-zfs.sh %q %q' "$pool" "$dataset"
  ssh shredder "$remote_cmd"
}
# Command dispatcher: the first CLI argument selects the action and the
# remaining arguments are forwarded to the matching function. The preset
# commands (safe, ultra, micro, mastodon) take <label> [root_pass] and fill in
# region/type/image from the PACKAGE_* variables. Any unknown or missing
# command prints the usage text and exits with status 1.
case "$1" in
  provision)
    provision_vps "$2" "$3" "$4" "$5" "$6"
    ;;
  reboot)
    reboot_vps "$2"
    ;;
  destroy)
    destroy_vps_by_label "$2"
    ;;
  safe)
    provision_vps "$2" "$PACKAGE_SAFE_REGION" "$PACKAGE_SAFE_TYPE" "$PACKAGE_SAFE_IMAGE" "$3"
    ;;
  ultra)
    provision_vps "$2" "$PACKAGE_ULTRA_REGION" "$PACKAGE_ULTRA_TYPE" "$PACKAGE_ULTRA_IMAGE" "$3"
    ;;
  micro)
    provision_vps "$2" "$PACKAGE_MICRO_REGION" "$PACKAGE_MICRO_TYPE" "$PACKAGE_MICRO_IMAGE" "$3"
    ;;
  mastodon)
    provision_vps "$2" "$PACKAGE_MASTODON_REGION" "$PACKAGE_MASTODON_TYPE" "$PACKAGE_MASTODON_IMAGE" "$3"
    ;;
  backup)
    enable_backups_by_label "$2"
    ;;
  disable-backup)
    disable_backups_by_label "$2"
    ;;
  status)
    status_vps "$2"
    ;;
  listvps)
    list_all_vps
    ;;
  resize)
    resize_vps "$2" "$3"
    ;;
  safe-create)
    safe_create_dataset "$2" "$3"
    ;;
  *)
    echo "Usage: $0 <command> [...]"
    # List every command the dispatcher handles (six were missing before).
    echo "Available commands: provision, reboot, destroy, safe, ultra, micro, mastodon, backup, disable-backup, status, listvps, resize, safe-create"
    exit 1
    ;;
esac