From b40dd0828a55044378e86e99f4ee7ed64dffa319 Mon Sep 17 00:00:00 2001 From: DocTator Date: Wed, 30 Apr 2025 08:55:37 -0400 Subject: [PATCH] Auto-commit from giteapush.sh at 2025-04-30 08:55:37 --- postmortem/genesisradiozfsmigration.md | 77 ++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 postmortem/genesisradiozfsmigration.md diff --git a/postmortem/genesisradiozfsmigration.md b/postmortem/genesisradiozfsmigration.md new file mode 100644 index 0000000..08e2676 --- /dev/null +++ b/postmortem/genesisradiozfsmigration.md @@ -0,0 +1,77 @@ +from datetime import datetime + +# Define the content of the post-mortem +post_mortem_content = f"""# 🔧 Post-Mortem: Genesis Radio Storage Migration +**Date:** April 30, 2025 +**Prepared by:** Doc +**Systems Affected:** StationPlaylist (SPL), Voice Tracker, Genesis Radio media backend + +--- + +## 🧠 Executive Summary + +Genesis Radio’s backend was migrated from a legacy MinIO instance using local disk (ext4) to a new **ZFS-based, encrypted MinIO deployment on `shredderv2`**. This change was driven by a need for more stable performance, improved security, and a cleaner storage architecture with proper bucket separation. + +This migration was completed **without touching production** until final validation, and all critical services remained online throughout the transition. We also revamped the rclone caching strategy to reduce freeze-ups and playback hiccups. 
+ +--- + +## ✅ What We Did + +- Created **three new secure buckets**: `genesislibrary-secure`, `genesisassets-secure`, and `genesisshows-secure` +- Migrated data from the backup server using `rclone sync`: + - `genesislibrary` came directly from backup + - `genesisassets` and `genesisshows` were pulled from the same bucket, with de-duping and cleanup to be completed post-migration +- Retained **original SPL drive letters** (`Q:\\`, `R:\\`) to avoid changes to the playout config +- Switched rclone mounts to point to the new secure buckets, with **aggressive VFS caching** using SSD-backed cache directories +- Took a clean **ZFS snapshot** (`@pre-s3-switch`) before switching over +- Confirmed no regression in SPL, Voice Tracker, or streaming audio + +--- + +## ⚙️ Technical Improvements + +- **VFS caching overhaul**: + - Increased read-ahead (`1G`), lowered write-back wait + - Split cache between `X:\\librarycache` and `L:\\assetcache` + - No more rclone choking on large files or freezing during transitions +- **Encrypted S3 storage** with isolated buckets per functional role +- **TLS-secured** Console and MinIO endpoints with automated renewal +- Mounted buckets at startup via batch script (future systemd equivalents to be implemented) +- Snapshot-based rollback in ZFS enabled post-deployment resilience + +--- + +## 🩹 What Went Weird (and We Fixed It) + +- SPL froze during initial `mc mirror` attempts — solution: switched to `rclone`, which performed significantly faster +- Some hiccups during early cache tuning, including sparse file support issues — solved by switching to ZFS +- Missing media files in Mastodon were traced to uploads during sync; resolved with staged sync + retry before final switch +- Certbot automation wasn’t configured — resolved with a systemd timer that stops nginx, renews, and restarts nginx automatically + +--- + +## 🧯 What We Learned + +- MinIO is solid, but **rclone wins for bulk sync performance** +- VFS cache settings **make or break** 
media-heavy workloads like SPL +- ZFS is a game-changer: no sparse file errors, reliable snapshots, clean rollback +- Planning matters: pre-syncing from backup avoided downtime +- Not touching prod until ready keeps stress and screwups to a minimum + +--- + +## 📦 Next Steps + +- [ ] Clean `genesisassets-secure` of misplaced show files +- [ ] Sync `azuracast` from live system (no backup copy yet) +- [ ] Build automated snapshot send-to-backup workflow (`zfs send | ssh backup zfs recv`) +- [ ] Stage full failover simulation (optional but fun) +""" + +# Save it as a Markdown file +file_path = "/mnt/data/genesis_radio_migration_postmortem.md" +with open(file_path, "w") as f: + f.write(post_mortem_content) + +file_path