Created
February 7, 2026 04:27
-
-
Save detain/7d2f7aa0577e831d03df68faa048292f to your computer and use it in GitHub Desktop.
Looks for settings in mysql, host os, and hardware config that are current bottlenecks and suggest how to fix them.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| set -euo pipefail | |
| ############################################ | |
| # VISUALS (gum optional) | |
| ############################################ | |
| if command -v gum >/dev/null 2>&1; then | |
| H() { gum style --bold --foreground 212 "$1"; } | |
| I() { gum style --foreground 5 "[INFO] $1"; } | |
| W() { gum style --bold --foreground 214 "[WARN] $1"; } | |
| E() { gum style --bold --foreground 196 "[ERROR] $1"; } | |
| else | |
| H() { echo "==== $1 ===="; } | |
| I() { echo "[INFO] $1"; } | |
| W() { echo "[WARN] $1"; } | |
| E() { echo "[ERROR] $1"; } | |
| fi | |
| WARNINGS=0 | |
| ERRORS=0 | |
| MYSQL="mysql --protocol=socket -N -B" | |
| DATE=$(date) | |
| warn() { W "$1"; WARNINGS=$((WARNINGS+1)); } | |
| error() { E "$1"; ERRORS=$((ERRORS+1)); } | |
| ############################################ | |
| # PRECHECKS | |
| ############################################ | |
| [[ $EUID -eq 0 ]] || { error "Must run as root"; exit 1; } | |
| if ! mysqladmin ping >/dev/null 2>&1; then | |
| error "Cannot connect to MySQL (check credentials/socket)" | |
| exit 1 | |
| fi | |
| ############################################ | |
| # SYSTEM MEMORY & CPU CONTEXT | |
| ############################################ | |
| H "System Context" | |
| TOTAL_MB=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo) | |
| CORES=$(nproc) | |
| I "System RAM: ${TOTAL_MB} MB" | |
| I "CPU cores: $CORES" | |
| ############################################ | |
| # SYSCTL BOTTLENECK ANALYSIS (WITH EXPLANATION) | |
| ############################################ | |
| H "Kernel / sysctl Bottleneck Analysis" | |
| explain_sysctl() { | |
| echo " Why this matters:" | |
| echo " $1" | |
| echo " Effect if tuned correctly:" | |
| echo " $2" | |
| } | |
| check_sysctl() { | |
| local key=$1 | |
| local recommended=$2 | |
| local explanation=$3 | |
| local effect=$4 | |
| local value | |
| value=$(sysctl -n "$key" 2>/dev/null || echo "N/A") | |
| I "$key = $value" | |
| explain_sysctl "$explanation" "$effect" | |
| if [[ "$value" != "N/A" && "$value" -gt "$recommended" ]]; then | |
| warn "$key is too high for MySQL workloads" | |
| echo " How to fix:" | |
| echo " Runtime: sysctl -w $key=$recommended" | |
| echo " Persist: echo '$key=$recommended' >> /etc/sysctl.d/mysql.conf" | |
| fi | |
| echo | |
| } | |
| check_sysctl vm.swappiness 1 \ | |
| "Swapping MySQL memory causes random latency spikes and stalls applier threads." \ | |
| "MySQL buffer pool remains resident; replication apply becomes stable." | |
| check_sysctl vm.dirty_ratio 10 \ | |
| "High dirty ratios cause sudden blocking flushes, freezing commits." \ | |
| "Smoother background flushing and predictable fsync latency." | |
| check_sysctl vm.dirty_background_ratio 5 \ | |
| "Late background flushing allows dirty pages to pile up." \ | |
| "Continuous flushing prevents I/O storms." | |
| ############################################ | |
| # ZFS ARC FULL ANALYSIS | |
| ############################################ | |
| H "ZFS ARC Deep Analysis & Recommendations" | |
| if command -v zfs >/dev/null 2>&1; then | |
| I "ZFS detected, checking ARC limits and usage..." | |
| # Read ZFS ARC max/min | |
| ARC_MAX=$(cat /sys/module/zfs/parameters/zfs_arc_max 2>/dev/null || echo 0) | |
| ARC_MIN=$(cat /sys/module/zfs/parameters/zfs_arc_min 2>/dev/null || echo 0) | |
| # Convert to MB for readability | |
| ARC_MAX_MB=$((ARC_MAX / 1024 / 1024)) | |
| ARC_MIN_MB=$((ARC_MIN / 1024 / 1024)) | |
| I "ZFS ARC max: ${ARC_MAX_MB} MB" | |
| I "ZFS ARC min: ${ARC_MIN_MB} MB" | |
| # Current ARC size from /proc/kstat | |
| ARC_CUR=$(awk '/^size/ {print $3}' /proc/spl/kstat/zfs/arcstats) | |
| ARC_CUR_MB=$((ARC_CUR / 1024 / 1024)) | |
| I "ZFS ARC current usage: ${ARC_CUR_MB} MB" | |
| # ARC hit/miss stats | |
| ARC_HITS=$(awk '/^hits/ {print $3}' /proc/spl/kstat/zfs/arcstats) | |
| ARC_MISSES=$(awk '/^misses/ {print $3}' /proc/spl/kstat/zfs/arcstats) | |
| TOTAL_ACCESS=$((ARC_HITS + ARC_MISSES)) | |
| if [[ $TOTAL_ACCESS -gt 0 ]]; then | |
| HIT_RATIO=$(( 100 * ARC_HITS / TOTAL_ACCESS )) | |
| else | |
| HIT_RATIO=0 | |
| fi | |
| I "ARC hit ratio: ${HIT_RATIO}%" | |
| # Compare ARC + InnoDB buffer pool vs RAM | |
| BP_BYTES=$($MYSQL -e "SHOW VARIABLES LIKE 'innodb_buffer_pool_size';" | awk '{print $2}') | |
| BP_MB=$((BP_BYTES/1024/1024)) | |
| I "InnoDB buffer pool: ${BP_MB} MB" | |
| TOTAL_RAM_MB=$((TOTAL_MB)) | |
| COMBINED=$((ARC_CUR_MB + BP_MB)) | |
| if [[ $COMBINED -gt $((TOTAL_RAM_MB * 80 / 100)) ]]; then | |
| warn "ARC + InnoDB buffer pool may exceed 80% of RAM → risk of swapping" | |
| echo " Why this matters:" | |
| echo " • MySQL buffer pool and ARC compete for RAM" | |
| echo " • Excessive ARC can evict MySQL pages → replication stalls" | |
| echo " Recommended fix:" | |
| echo " • Reduce zfs_arc_max to ~25% of RAM" | |
| echo " • Ensure InnoDB buffer pool ~70–75% of RAM" | |
| echo " Effect:" | |
| echo " • Better buffer pool hit rate" | |
| echo " • More predictable replication apply latency" | |
| else | |
| I "ARC + buffer pool sizing is within safe limits." | |
| fi | |
| # Check ARC efficiency | |
| if [[ $HIT_RATIO -lt 85 ]]; then | |
| warn "ARC hit ratio <85% → consider increasing ARC size if memory allows or tune dataset" | |
| echo " Effect of increase:" | |
| echo " • More reads served from cache → faster replication apply and queries" | |
| echo " • Less disk I/O" | |
| fi | |
| # Recommended ARC sizing based on RAM and buffer pool | |
| RECOMMENDED_ARC_MB=$((TOTAL_RAM_MB*25/100)) | |
| echo " Recommended max ARC size: ${RECOMMENDED_ARC_MB} MB (25% of RAM)" | |
| echo " How to change:" | |
| echo " Runtime: echo $((RECOMMENDED_ARC_MB*1024*1024)) > /sys/module/zfs/parameters/zfs_arc_max" | |
| echo " Persistent: add 'options zfs zfs_arc_max=$((RECOMMENDED_ARC_MB*1024*1024))' to /etc/modprobe.d/zfs.conf" | |
| else | |
| I "ZFS not detected → skipping ARC checks" | |
| fi | |
| ############################################ | |
| # ZFS ANALYSIS (DETAILED) | |
| ############################################ | |
| H "ZFS Storage Analysis" | |
| if command -v zfs >/dev/null 2>&1; then | |
| I "ZFS detected" | |
| ARC_MAX=$(cat /sys/module/zfs/parameters/zfs_arc_max 2>/dev/null || echo 0) | |
| ARC_MB=$((ARC_MAX/1024/1024)) | |
| I "ZFS ARC max: ${ARC_MB} MB" | |
| echo " Why this matters:" | |
| echo " ZFS ARC competes with InnoDB buffer pool for memory." | |
| echo " Double caching reduces effective MySQL cache size." | |
| warn "Potential double caching (ARC + InnoDB buffer pool)" | |
| echo " Recommended approach:" | |
| echo " • ARC ≈ 20–25% of RAM" | |
| echo " • InnoDB buffer pool ≈ 70–75% of RAM" | |
| echo " Effect:" | |
| echo " • Higher buffer hit rate" | |
| echo " • Faster replication apply" | |
| echo " • Lower memory pressure" | |
| zfs get recordsize,atime,sync -H -o property,value 2>/dev/null | while read p v; do | |
| I "ZFS $p = $v" | |
| done | |
| else | |
| I "ZFS not detected" | |
| fi | |
| ############################################ | |
| # GROUP REPLICATION – MEMBER STATS | |
| ############################################ | |
| H "Group Replication – Member Apply Backlog" | |
| $MYSQL -e " | |
| SELECT | |
| MEMBER_ID, | |
| COUNT_TRANSACTIONS_REMOTE_IN_APPLIER_QUEUE AS txns_pending, | |
| COUNT_CONFLICTS_DETECTED AS conflicts, | |
| COUNT_TRANSACTIONS_ROWS_VALIDATING AS validating | |
| FROM performance_schema.replication_group_member_stats; | |
| " | |
| ############################################ | |
| # MAX BACKLOG & TREND | |
| ############################################ | |
| MAX_LAG=$($MYSQL -e " | |
| SELECT COALESCE(MAX(COUNT_TRANSACTIONS_REMOTE_IN_APPLIER_QUEUE),0) | |
| FROM performance_schema.replication_group_member_stats; | |
| ") | |
| I "Max pending transactions (apply backlog): $MAX_LAG" | |
| echo " What this means:" | |
| echo " This is NOT time-based lag." | |
| echo " It is certified transactions waiting to be applied locally." | |
| if [[ $MAX_LAG -gt 0 ]]; then | |
| warn "Replication apply backlog detected" | |
| fi | |
| ############################################ | |
| # LAG TREND SAMPLING | |
| ############################################ | |
| H "Apply Backlog Trend (30s)" | |
| L1=$MAX_LAG | |
| sleep 30 | |
| L2=$($MYSQL -e " | |
| SELECT COALESCE(MAX(COUNT_TRANSACTIONS_REMOTE_IN_APPLIER_QUEUE),0) | |
| FROM performance_schema.replication_group_member_stats; | |
| ") | |
| I "Backlog start: $L1" | |
| I "Backlog end: $L2" | |
| ############################################ | |
| # AUTOMATIC BOTTLENECK CLASSIFICATION | |
| ############################################ | |
| H "Automatic Bottleneck Classification" | |
| CLASS="Healthy" | |
| if [[ $L2 -gt $L1 && $L2 -gt 50 ]]; then | |
| CLASS="I/O bound" | |
| elif [[ $L2 -gt 0 && $L2 -le 50 ]]; then | |
| CLASS="Memory pressure / flush lag" | |
| fi | |
| I "Detected primary bottleneck: $CLASS" | |
| case "$CLASS" in | |
| "I/O bound") | |
| echo " Why:" | |
| echo " Applier threads are blocked waiting for disk flush." | |
| echo " Expected symptoms:" | |
| echo " • Commit latency spikes" | |
| echo " • Flow control activation" | |
| echo " Recommended fixes:" | |
| echo " • Increase innodb_io_capacity" | |
| echo " • Reduce dirty ratios" | |
| echo " • Fix ZFS sync / fsync latency" | |
| ;; | |
| "Memory pressure / flush lag") | |
| echo " Why:" | |
| echo " Dirty pages accumulate faster than they are flushed." | |
| echo " Effect:" | |
| echo " • Periodic stalls" | |
| echo " • Slow replication apply" | |
| ;; | |
| "Healthy") | |
| echo " Apply rate matches commit rate." | |
| ;; | |
| esac | |
| ############################################ | |
| # PERFORMANCE SCHEMA – WAITS | |
| ############################################ | |
| H "Performance Schema – Top Global Waits" | |
| $MYSQL -e " | |
| SELECT EVENT_NAME, | |
| COUNT_STAR, | |
| ROUND(SUM_TIMER_WAIT/1e12,2) AS seconds | |
| FROM performance_schema.events_waits_summary_global_by_event_name | |
| ORDER BY SUM_TIMER_WAIT DESC | |
| LIMIT 8; | |
| " | |
| echo | |
| echo "How to read this:" | |
| echo "• wait/io/* → storage bottleneck" | |
| echo "• wait/synch/* → mutex / latch contention" | |
| echo "• wait/lock/* → schema or index issues" | |
| ############################################ | |
| # INNODB CONTENTION SIGNALS | |
| ############################################ | |
| H "InnoDB Internal Pressure Signals" | |
| $MYSQL -e " | |
| SELECT VARIABLE_NAME, VARIABLE_VALUE | |
| FROM performance_schema.global_status | |
| WHERE VARIABLE_NAME IN ( | |
| 'Innodb_buffer_pool_wait_free', | |
| 'Innodb_row_lock_waits', | |
| 'Innodb_row_lock_time' | |
| ); | |
| " | |
| echo | |
| echo "Interpretation:" | |
| echo "• buffer_pool_wait_free > 0 → buffer pool too small" | |
| echo "• row_lock_waits increasing → hot rows or missing indexes" | |
| ############################################ | |
| # MYSQL MEMORY SIZING (WITH EXPLANATION) | |
| ############################################ | |
| H "MySQL Memory Allocation" | |
| BP_BYTES=$($MYSQL -e " | |
| SHOW VARIABLES LIKE 'innodb_buffer_pool_size'; | |
| " | awk '{print $2}') | |
| BP_MB=$((BP_BYTES/1024/1024)) | |
| I "InnoDB buffer pool: ${BP_MB} MB" | |
| if [[ $BP_MB -lt $((TOTAL_MB*70/100)) ]]; then | |
| warn "InnoDB buffer pool undersized" | |
| echo " Why this matters:" | |
| echo " • More disk reads" | |
| echo " • Slower apply threads" | |
| echo " • Increased fsync pressure" | |
| echo " Effect of increasing it:" | |
| echo " • Higher cache hit rate" | |
| echo " • Faster replication apply" | |
| echo " • Lower disk latency" | |
| echo " How to fix (persistent, safe):" | |
| echo " SET PERSIST innodb_buffer_pool_size=$((TOTAL_MB*75/100))*1024*1024;" | |
| fi | |
| ############################################ | |
| # FINAL SUMMARY | |
| ############################################ | |
| H "Final Summary" | |
| echo "Scan time: $DATE" | |
| echo "Warnings: $WARNINGS" | |
| echo "Errors: $ERRORS" | |
| if [[ $ERRORS -gt 0 ]]; then | |
| echo "❌ Critical issues detected" | |
| elif [[ $WARNINGS -gt 0 ]]; then | |
| echo "⚠️ Performance tuning recommended" | |
| else | |
| echo "✅ System is well-aligned for InnoDB Cluster" | |
| fi |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment