Skip to content

Instantly share code, notes, and snippets.

@detain
Created February 7, 2026 04:27
Show Gist options
  • Select an option

  • Save detain/7d2f7aa0577e831d03df68faa048292f to your computer and use it in GitHub Desktop.

Select an option

Save detain/7d2f7aa0577e831d03df68faa048292f to your computer and use it in GitHub Desktop.
Looks for settings in mysql, host os, and hardware config that are current bottlenecks and suggest how to fix them.
#!/usr/bin/env bash
set -euo pipefail
############################################
# VISUALS (gum optional)
############################################
if command -v gum >/dev/null 2>&1; then
H() { gum style --bold --foreground 212 "$1"; }
I() { gum style --foreground 5 "[INFO] $1"; }
W() { gum style --bold --foreground 214 "[WARN] $1"; }
E() { gum style --bold --foreground 196 "[ERROR] $1"; }
else
H() { echo "==== $1 ===="; }
I() { echo "[INFO] $1"; }
W() { echo "[WARN] $1"; }
E() { echo "[ERROR] $1"; }
fi
WARNINGS=0
ERRORS=0
MYSQL="mysql --protocol=socket -N -B"
DATE=$(date)
warn() { W "$1"; WARNINGS=$((WARNINGS+1)); }
error() { E "$1"; ERRORS=$((ERRORS+1)); }
############################################
# PRECHECKS
############################################
[[ $EUID -eq 0 ]] || { error "Must run as root"; exit 1; }
if ! mysqladmin ping >/dev/null 2>&1; then
error "Cannot connect to MySQL (check credentials/socket)"
exit 1
fi
############################################
# SYSTEM MEMORY & CPU CONTEXT
############################################
H "System Context"
TOTAL_MB=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo)
CORES=$(nproc)
I "System RAM: ${TOTAL_MB} MB"
I "CPU cores: $CORES"
############################################
# SYSCTL BOTTLENECK ANALYSIS (WITH EXPLANATION)
############################################
H "Kernel / sysctl Bottleneck Analysis"
explain_sysctl() {
echo " Why this matters:"
echo " $1"
echo " Effect if tuned correctly:"
echo " $2"
}
check_sysctl() {
local key=$1
local recommended=$2
local explanation=$3
local effect=$4
local value
value=$(sysctl -n "$key" 2>/dev/null || echo "N/A")
I "$key = $value"
explain_sysctl "$explanation" "$effect"
if [[ "$value" != "N/A" && "$value" -gt "$recommended" ]]; then
warn "$key is too high for MySQL workloads"
echo " How to fix:"
echo " Runtime: sysctl -w $key=$recommended"
echo " Persist: echo '$key=$recommended' >> /etc/sysctl.d/mysql.conf"
fi
echo
}
check_sysctl vm.swappiness 1 \
"Swapping MySQL memory causes random latency spikes and stalls applier threads." \
"MySQL buffer pool remains resident; replication apply becomes stable."
check_sysctl vm.dirty_ratio 10 \
"High dirty ratios cause sudden blocking flushes, freezing commits." \
"Smoother background flushing and predictable fsync latency."
check_sysctl vm.dirty_background_ratio 5 \
"Late background flushing allows dirty pages to pile up." \
"Continuous flushing prevents I/O storms."
############################################
# ZFS ARC FULL ANALYSIS
############################################
H "ZFS ARC Deep Analysis & Recommendations"
if command -v zfs >/dev/null 2>&1; then
I "ZFS detected, checking ARC limits and usage..."
# Read ZFS ARC max/min
ARC_MAX=$(cat /sys/module/zfs/parameters/zfs_arc_max 2>/dev/null || echo 0)
ARC_MIN=$(cat /sys/module/zfs/parameters/zfs_arc_min 2>/dev/null || echo 0)
# Convert to MB for readability
ARC_MAX_MB=$((ARC_MAX / 1024 / 1024))
ARC_MIN_MB=$((ARC_MIN / 1024 / 1024))
I "ZFS ARC max: ${ARC_MAX_MB} MB"
I "ZFS ARC min: ${ARC_MIN_MB} MB"
# Current ARC size from /proc/kstat
ARC_CUR=$(awk '/^size/ {print $3}' /proc/spl/kstat/zfs/arcstats)
ARC_CUR_MB=$((ARC_CUR / 1024 / 1024))
I "ZFS ARC current usage: ${ARC_CUR_MB} MB"
# ARC hit/miss stats
ARC_HITS=$(awk '/^hits/ {print $3}' /proc/spl/kstat/zfs/arcstats)
ARC_MISSES=$(awk '/^misses/ {print $3}' /proc/spl/kstat/zfs/arcstats)
TOTAL_ACCESS=$((ARC_HITS + ARC_MISSES))
if [[ $TOTAL_ACCESS -gt 0 ]]; then
HIT_RATIO=$(( 100 * ARC_HITS / TOTAL_ACCESS ))
else
HIT_RATIO=0
fi
I "ARC hit ratio: ${HIT_RATIO}%"
# Compare ARC + InnoDB buffer pool vs RAM
BP_BYTES=$($MYSQL -e "SHOW VARIABLES LIKE 'innodb_buffer_pool_size';" | awk '{print $2}')
BP_MB=$((BP_BYTES/1024/1024))
I "InnoDB buffer pool: ${BP_MB} MB"
TOTAL_RAM_MB=$((TOTAL_MB))
COMBINED=$((ARC_CUR_MB + BP_MB))
if [[ $COMBINED -gt $((TOTAL_RAM_MB * 80 / 100)) ]]; then
warn "ARC + InnoDB buffer pool may exceed 80% of RAM → risk of swapping"
echo " Why this matters:"
echo " • MySQL buffer pool and ARC compete for RAM"
echo " • Excessive ARC can evict MySQL pages → replication stalls"
echo " Recommended fix:"
echo " • Reduce zfs_arc_max to ~25% of RAM"
echo " • Ensure InnoDB buffer pool ~70–75% of RAM"
echo " Effect:"
echo " • Better buffer pool hit rate"
echo " • More predictable replication apply latency"
else
I "ARC + buffer pool sizing is within safe limits."
fi
# Check ARC efficiency
if [[ $HIT_RATIO -lt 85 ]]; then
warn "ARC hit ratio <85% → consider increasing ARC size if memory allows or tune dataset"
echo " Effect of increase:"
echo " • More reads served from cache → faster replication apply and queries"
echo " • Less disk I/O"
fi
# Recommended ARC sizing based on RAM and buffer pool
RECOMMENDED_ARC_MB=$((TOTAL_RAM_MB*25/100))
echo " Recommended max ARC size: ${RECOMMENDED_ARC_MB} MB (25% of RAM)"
echo " How to change:"
echo " Runtime: echo $((RECOMMENDED_ARC_MB*1024*1024)) > /sys/module/zfs/parameters/zfs_arc_max"
echo " Persistent: add 'options zfs zfs_arc_max=$((RECOMMENDED_ARC_MB*1024*1024))' to /etc/modprobe.d/zfs.conf"
else
I "ZFS not detected → skipping ARC checks"
fi
############################################
# ZFS ANALYSIS (DETAILED)
############################################
H "ZFS Storage Analysis"
if command -v zfs >/dev/null 2>&1; then
I "ZFS detected"
ARC_MAX=$(cat /sys/module/zfs/parameters/zfs_arc_max 2>/dev/null || echo 0)
ARC_MB=$((ARC_MAX/1024/1024))
I "ZFS ARC max: ${ARC_MB} MB"
echo " Why this matters:"
echo " ZFS ARC competes with InnoDB buffer pool for memory."
echo " Double caching reduces effective MySQL cache size."
warn "Potential double caching (ARC + InnoDB buffer pool)"
echo " Recommended approach:"
echo " • ARC ≈ 20–25% of RAM"
echo " • InnoDB buffer pool ≈ 70–75% of RAM"
echo " Effect:"
echo " • Higher buffer hit rate"
echo " • Faster replication apply"
echo " • Lower memory pressure"
zfs get recordsize,atime,sync -H -o property,value 2>/dev/null | while read p v; do
I "ZFS $p = $v"
done
else
I "ZFS not detected"
fi
############################################
# GROUP REPLICATION – MEMBER STATS
############################################
H "Group Replication – Member Apply Backlog"
$MYSQL -e "
SELECT
MEMBER_ID,
COUNT_TRANSACTIONS_REMOTE_IN_APPLIER_QUEUE AS txns_pending,
COUNT_CONFLICTS_DETECTED AS conflicts,
COUNT_TRANSACTIONS_ROWS_VALIDATING AS validating
FROM performance_schema.replication_group_member_stats;
"
############################################
# MAX BACKLOG & TREND
############################################
MAX_LAG=$($MYSQL -e "
SELECT COALESCE(MAX(COUNT_TRANSACTIONS_REMOTE_IN_APPLIER_QUEUE),0)
FROM performance_schema.replication_group_member_stats;
")
I "Max pending transactions (apply backlog): $MAX_LAG"
echo " What this means:"
echo " This is NOT time-based lag."
echo " It is certified transactions waiting to be applied locally."
if [[ $MAX_LAG -gt 0 ]]; then
warn "Replication apply backlog detected"
fi
############################################
# LAG TREND SAMPLING
############################################
H "Apply Backlog Trend (30s)"
L1=$MAX_LAG
sleep 30
L2=$($MYSQL -e "
SELECT COALESCE(MAX(COUNT_TRANSACTIONS_REMOTE_IN_APPLIER_QUEUE),0)
FROM performance_schema.replication_group_member_stats;
")
I "Backlog start: $L1"
I "Backlog end: $L2"
############################################
# AUTOMATIC BOTTLENECK CLASSIFICATION
############################################
H "Automatic Bottleneck Classification"
CLASS="Healthy"
if [[ $L2 -gt $L1 && $L2 -gt 50 ]]; then
CLASS="I/O bound"
elif [[ $L2 -gt 0 && $L2 -le 50 ]]; then
CLASS="Memory pressure / flush lag"
fi
I "Detected primary bottleneck: $CLASS"
case "$CLASS" in
"I/O bound")
echo " Why:"
echo " Applier threads are blocked waiting for disk flush."
echo " Expected symptoms:"
echo " • Commit latency spikes"
echo " • Flow control activation"
echo " Recommended fixes:"
echo " • Increase innodb_io_capacity"
echo " • Reduce dirty ratios"
echo " • Fix ZFS sync / fsync latency"
;;
"Memory pressure / flush lag")
echo " Why:"
echo " Dirty pages accumulate faster than they are flushed."
echo " Effect:"
echo " • Periodic stalls"
echo " • Slow replication apply"
;;
"Healthy")
echo " Apply rate matches commit rate."
;;
esac
############################################
# PERFORMANCE SCHEMA – WAITS
############################################
H "Performance Schema – Top Global Waits"
$MYSQL -e "
SELECT EVENT_NAME,
COUNT_STAR,
ROUND(SUM_TIMER_WAIT/1e12,2) AS seconds
FROM performance_schema.events_waits_summary_global_by_event_name
ORDER BY SUM_TIMER_WAIT DESC
LIMIT 8;
"
echo
echo "How to read this:"
echo "• wait/io/* → storage bottleneck"
echo "• wait/synch/* → mutex / latch contention"
echo "• wait/lock/* → schema or index issues"
############################################
# INNODB CONTENTION SIGNALS
############################################
H "InnoDB Internal Pressure Signals"
$MYSQL -e "
SELECT VARIABLE_NAME, VARIABLE_VALUE
FROM performance_schema.global_status
WHERE VARIABLE_NAME IN (
'Innodb_buffer_pool_wait_free',
'Innodb_row_lock_waits',
'Innodb_row_lock_time'
);
"
echo
echo "Interpretation:"
echo "• buffer_pool_wait_free > 0 → buffer pool too small"
echo "• row_lock_waits increasing → hot rows or missing indexes"
############################################
# MYSQL MEMORY SIZING (WITH EXPLANATION)
############################################
H "MySQL Memory Allocation"
BP_BYTES=$($MYSQL -e "
SHOW VARIABLES LIKE 'innodb_buffer_pool_size';
" | awk '{print $2}')
BP_MB=$((BP_BYTES/1024/1024))
I "InnoDB buffer pool: ${BP_MB} MB"
if [[ $BP_MB -lt $((TOTAL_MB*70/100)) ]]; then
warn "InnoDB buffer pool undersized"
echo " Why this matters:"
echo " • More disk reads"
echo " • Slower apply threads"
echo " • Increased fsync pressure"
echo " Effect of increasing it:"
echo " • Higher cache hit rate"
echo " • Faster replication apply"
echo " • Lower disk latency"
echo " How to fix (persistent, safe):"
echo " SET PERSIST innodb_buffer_pool_size=$((TOTAL_MB*75/100))*1024*1024;"
fi
############################################
# FINAL SUMMARY
############################################
H "Final Summary"
echo "Scan time: $DATE"
echo "Warnings: $WARNINGS"
echo "Errors: $ERRORS"
if [[ $ERRORS -gt 0 ]]; then
echo "❌ Critical issues detected"
elif [[ $WARNINGS -gt 0 ]]; then
echo "⚠️ Performance tuning recommended"
else
echo "✅ System is well-aligned for InnoDB Cluster"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment