Skip to content

Instantly share code, notes, and snippets.

@Guiorgy
Forked from petervanderdoes/zfs_health.sh
Last active December 20, 2025 14:19
Show Gist options
  • Select an option

  • Save Guiorgy/08ead139a34481a6c3a93a86b7bb4769 to your computer and use it in GitHub Desktop.

Select an option

Save Guiorgy/08ead139a34481a6c3a93a86b7bb4769 to your computer and use it in GitHub Desktop.
ZFS Health Check Script
#!/usr/bin/env sh
# Copyright 2025 Guiorgy
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Abort on any unhandled command failure (-e) and on any use of an unset
# variable (-u).
set -eu

# Exit status codes
# Avoid those codes:
# - Standard: 2, 126-128, 130, 137, 139, 143, 255
# - Convention: 3, 5-6, 64-78
#
# Declared readonly so that a later assignment cannot silently change the
# exit-status contract of the script.
readonly CODE_OK=0        # All ZFS volumes are healthy
readonly CODE_ERROR=1     # General script error
readonly CODE_UNHEALTHY=4 # One of the ZFS volumes is unhealthy
readonly CODE_RWC=7       # One of the ZFS volumes has READ/WRITE/CKSUM errors
readonly CODE_CAPACITY=8  # One of the ZFS volumes is reaching its capacity
# NOTE(review): no check visible in this file exits with CODE_SCRUB yet.
readonly CODE_SCRUB=9     # One of the ZFS volumes requires a scrub
# Print the usage line, the supported options and the meaning of each exit
# status code to stdout. The exit codes shown are expanded from the CODE_*
# variables at call time.
print_help() {
  cat <<EOF
Usage $(basename "$0") [OPTIONS]
Options:
-v, --verbose Print details instead of executing silently
-h, --help Display this help message and exit
Exit codes:
$CODE_OK All ZFS volumes are healthy
$CODE_ERROR General script error
$CODE_UNHEALTHY One of the ZFS volumes is unhealthy
$CODE_RWC One of the ZFS volumes has READ/WRITE/CKSUM errors
$CODE_CAPACITY One of the ZFS volumes is reaching its capacity
$CODE_SCRUB One of the ZFS volumes requires a scrub
EOF
}
# Command-line handling. Arguments are consumed from left to right; only the
# verbose flag lets the walk continue, every other argument ends the script.
VERBOSE=0
until [ $# -eq 0 ]; do
  case "$1" in
    -v | --verbose)
      VERBOSE=1
      shift
      ;;
    -h | --help)
      # NOTE(review): a help request exits with the general error status
      # rather than 0 - confirm this is intended.
      print_help
      exit $CODE_ERROR
      ;;
    -*)
      # Unknown dash-prefixed flag: complain and show the help on stderr.
      {
        echo "Unrecognized option '$1'"
        echo ''
        print_help
      } >&2
      exit $CODE_ERROR
      ;;
    *)
      # The script accepts no positional arguments.
      {
        echo "Unrecognized argument '$1'"
        echo ''
        print_help
      } >&2
      exit $CODE_ERROR
      ;;
  esac
done
# Snapshot of "zpool status" for all pools. The health check below parses
# this text.
STATUS="$(/sbin/zpool status)"

# Read "zpool status" text from stdin and print one
# "One of the pools is <KEYWORD>" line for every bad-state keyword found.
# Prints nothing when no keyword is present.
zpool_bad_states() {
  grep -oE '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED)' \
    | awk '{print "One of the pools is", $0}'
}

# Read "zpool status" text from stdin and return success when it mentions a
# corrupt, "cannot" or unrecoverable condition.
# grep -q keeps the check silent: the match is only a yes/no signal, so the
# matched keywords must not leak to stdout when the script runs non-verbosely.
zpool_reports_damage() {
  grep -qE '(corrupt|cannot|unrecover)'
}

# Health - Check if all zfs volumes are in good condition.
# We are looking for any keyword signifying a degraded or broken array.
bad_conditions="$(printf '%s\n' "$STATUS" | zpool_bad_states)"
if [ -n "$bad_conditions" ]; then
  if [ "$VERBOSE" -ne 0 ]; then
    printf '%s\n' "$bad_conditions" >&2
  fi
  exit "$CODE_UNHEALTHY"
fi

if printf '%s\n' "$STATUS" | zpool_reports_damage; then
  if [ "$VERBOSE" -ne 0 ]; then
    echo "One of the pools is unhealthy" >&2
  fi
  exit "$CODE_UNHEALTHY"
fi
# Errors - Check the READ, WRITE and CKSUM (checksum) error columns of every
# ONLINE vdev reported by "zpool status". If any counter is non-zero the
# script exits with CODE_RWC. You should then look to replace the faulty
# drive and run "zpool scrub" on the affected volume after resilvering.

# Read "zpool status" text from stdin and print one line per ONLINE vdev
# whose READ, WRITE or CKSUM counter is not zero. Prints nothing when every
# counter is zero.
#
# A row is a vdev row when it has at least five fields and its second field
# is exactly ONLINE (NAME STATE READ WRITE CKSUM). The counters are compared
# field by field: filtering with a substring match on "0 0 0" would also
# discard rows such as "sda 10 0 0" or "tank0 0 0 5" and hide real errors.
# Counters are printed as text so a non-integer value is shown as-is.
find_rwc_errors() {
  awk '
    NF >= 5 && $2 == "ONLINE" && ($3 != 0 || $4 != 0 || $5 != 0) {
      msg = "Vdev " $1 " has errors:"
      printf "%-80s READ %-5s WRITE %-5s CKSUM %-5s\n", msg, $3, $4, $5
    }
  '
}

rwc_errors="$(printf '%s\n' "$STATUS" | find_rwc_errors)"
if [ -n "$rwc_errors" ]; then
  if [ "$VERBOSE" -ne 0 ]; then
    printf '%s\n' "$rwc_errors" >&2
  fi
  exit "$CODE_RWC"
fi
# Capacity
#
# ZFS uses a copy-on-write scheme. The file system writes new data to
# sequential free blocks first and when the uberblock has been updated the new
# inode pointers become valid. This method is true only when the pool has
# enough free sequential blocks. If the pool is at capacity and space limited,
# ZFS will have to randomly write blocks. This means ZFS cannot create an
# optimal set of sequential writes and write performance is severely impacted.
#
# As such, it is recommended to set a quota of 80%-95% of the total capacity.
# The percentage really depends on how large your volume is. If you have a
# 128GB SSD then 80% is reasonable. If you have a 60TB raid-z2 array then you
# can probably set the warning closer to 95%. 90% is a good default.

# Percentage of the pool treated as the limit when the reported quota is 0.
defaultQuotaCapacity=90
# Exit with CODE_CAPACITY once fewer than this many percentage points remain
# between the used capacity and the limit.
minRemainingCapacity=5

zfsVolumes=$(/sbin/zpool list -H -o name)
# Deliberately unquoted: the list of pool names is split on whitespace.
for volume in ${zfsVolumes}; do
  size=$(/sbin/zpool list -H -p -o size "$volume")
  if [ "$VERBOSE" -ne 0 ]; then
    echo "Pool $volume size: $size Bytes"
  fi

  quota=$(/sbin/zfs get -H -p -o value quota "$volume")
  if [ "$VERBOSE" -ne 0 ]; then
    echo "Pool $volume quota: $quota Bytes"
  fi

  # Express the quota as a percentage of the pool size; a quota of 0 falls
  # back to the default limit.
  # NOTE(review): presumably zfs reports 0 when no quota is set - confirm.
  if [ "$quota" -ne 0 ]; then
    quotaCapacity=$(printf '%s\n' "$quota / $size * 100" | bc -l)
  else
    quotaCapacity=$defaultQuotaCapacity
  fi
  if [ "$VERBOSE" -ne 0 ]; then
    printf 'Pool %s quota capacity: %.1f%%\n' "$volume" "$quotaCapacity"
  fi

  capacity=$(/sbin/zpool list -H -p -o capacity "$volume")
  if [ "$VERBOSE" -ne 0 ]; then
    printf 'Pool %s used capacity: %.1f%%\n' "$volume" "$capacity"
  fi

  # Percentage points left before the pool reaches its limit.
  capacityLeft=$(printf '%s\n' "$quotaCapacity - $capacity" | bc -l)
  if [ "$VERBOSE" -ne 0 ]; then
    printf 'Pool %s remaining capacity: %.1f%%\n' "$volume" "$capacityLeft"
  fi

  # bc prints 1 when the comparison holds and 0 otherwise.
  if [ "$(printf '%s\n' "$capacityLeft < $minRemainingCapacity" | bc -l)" -eq 1 ]; then
    if [ "$VERBOSE" -ne 0 ]; then
      echo "Pool $volume is low on capacity" >&2
    fi
    exit "$CODE_CAPACITY"
  fi
done
# No check above ended the script early: report all ZFS volumes as healthy.
exit $CODE_OK
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment