@clementnuss
Created November 20, 2025 15:14
netapp trident nas-economy snapshot export policy bug
#!/bin/bash
# Script to reproduce the read-only clone export policy bug in Trident nas-economy driver
#
# Bug Description:
# When a read-only clone is deleted on the same node as its source volume, the unpublish
# operation incorrectly removes export policy rules that are still needed by the source.
# This is NOT a race condition - it's a deterministic logic bug where the code fails to
# check for remaining publications of the source volume before removing export rules.
#
# Test Scenario:
# - Create a source PVC with a persistent read-only pod running throughout all iterations
# - Each iteration creates a new VolumeSnapshot and read-only clone PVC on the same node
# - Create a short-lived clone pod, then delete it to trigger unpublish
# - The bug causes the source pod to lose NFS access with "Stale file handle" errors
#
# Known Issues:
# - There's a separate Trident bug where VolumeSnapshots may incorrectly reference a
# TridentVolume in 'Deleting' phase instead of the active volume with the same name.
# This can prevent this test from triggering the export policy bug.
# Workaround: Ensure no stale TridentVolumes exist before running this test.
#
# Usage:
# ./test-readonly-clone-export-policy-bug.sh # Run the test
# ./test-readonly-clone-export-policy-bug.sh cleanup # Clean up test resources
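#
# Pre-flight check (sketch): the Known Issues section above asks for no stale
# TridentVolumes before the run. Assuming a standard Trident install in the
# 'trident' namespace, you can list the TridentVolume CRs and look for any entry
# reported in a deleting state (field names can differ between Trident versions,
# so treat this as a starting point, not a definitive check):
#
#   kubectl get tridentvolumes.trident.netapp.io -n trident
#   kubectl get tridentvolumes.trident.netapp.io -n trident -o yaml | grep -i -B2 'deleting'
#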
set -e
# Configuration
NAMESPACE="${NAMESPACE:-default}"
PVC_NAME="${PVC_NAME:-test-export-bug-pvc}"
STORAGE_CLASS="${STORAGE_CLASS:-nfs}" # Adjust to your nas-economy storage class
ITERATIONS="${ITERATIONS:-10}" # Number of times to repeat the test
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
log() {
    echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
}
error() {
    echo -e "${RED}[$(date '+%Y-%m-%d %H:%M:%S')] ERROR:${NC} $*"
}
warn() {
    echo -e "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')] WARN:${NC} $*"
}
# Detect nodes
detect_nodes() {
    log "Auto-detecting worker nodes..."
    AVAILABLE_NODES=($(kubectl get nodes --selector='!node-role.kubernetes.io/master,!node-role.kubernetes.io/control-plane' -o jsonpath='{.items[*].metadata.name}'))
    if [[ ${#AVAILABLE_NODES[@]} -lt 1 ]]; then
        error "Need at least 1 worker node. Found: ${#AVAILABLE_NODES[@]}"
        exit 1
    fi
    log "Available nodes: ${AVAILABLE_NODES[*]}"
    log "Will use node ${AVAILABLE_NODES[0]} for all pods"
}
# Create PVC
create_pvc() {
    log "Creating PVC: $PVC_NAME"
    cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $PVC_NAME
  namespace: $NAMESPACE
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: $STORAGE_CLASS
  resources:
    requests:
      storage: 1Gi
EOF
    log "Waiting for PVC to be bound..."
    kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/$PVC_NAME -n $NAMESPACE --timeout=60s
    log "PVC is bound"
}
# Wait for pod to be running
wait_for_pod() {
    local pod_name=$1
    local timeout=${2:-30}
    log "Waiting for pod $pod_name to be running..."
    if kubectl wait --for=condition=Ready pod/$pod_name -n $NAMESPACE --timeout=${timeout}s 2>/dev/null; then
        log "Pod $pod_name is running"
        return 0
    else
        warn "Pod $pod_name failed to start within ${timeout}s"
        return 1
    fi
}
# Delete pod
delete_pod() {
    local pod_name=$1
    log "Deleting pod $pod_name"
    kubectl delete pod $pod_name -n $NAMESPACE --grace-period=1 2>/dev/null || true
}
# Check for tvolpub vs export policy mismatch
check_publication_state() {
    local pv_name=$(kubectl get pvc $PVC_NAME -n $NAMESPACE -o jsonpath='{.spec.volumeName}' 2>/dev/null)
    if [[ -z "$pv_name" ]]; then
        return 0
    fi
    # Get volume publications for this PVC
    local tvolpubs=$(kubectl get tvp -A -o jsonpath="{.items[?(@.spec.volumeID=='$pv_name')].metadata.name}" 2>/dev/null || true)
    if [[ -n "$tvolpubs" ]]; then
        log "Active TridentVolumePublications for $pv_name: $tvolpubs"
    fi
}
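# Optional ONTAP-side check (sketch, not called by the test): with the nas-economy
# driver each PVC maps to a qtree, and the bug manifests as missing rules in the
# export policy attached to the source volume's qtree. Assuming you have SSH access
# to the ONTAP management LIF, a helper like the one below can dump the rules of a
# given policy after a failed iteration. ONTAP_HOST, ONTAP_SVM, the 'admin' user and
# the policy name argument are assumptions - adapt them to your environment (the
# relevant policy name usually appears in the Trident CSI logs printed on failure).
show_export_policy_rules() {
    local policy_name=$1
    # No-op when the ONTAP connection details are not provided
    if [[ -z "${ONTAP_HOST:-}" || -z "${ONTAP_SVM:-}" ]]; then
        return 0
    fi
    ssh "admin@${ONTAP_HOST}" \
        "vserver export-policy rule show -vserver ${ONTAP_SVM} -policyname ${policy_name}" || true
}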
# Long-running source pod name (created once, runs throughout all iterations)
SOURCE_POD_NAME="test-pod-source-persistent"
# Create the persistent source pod before iterations start
create_persistent_source_pod() {
    local current_node="${AVAILABLE_NODES[0]}"
    log "Creating persistent read-only source pod on $current_node..."
    # This pod runs indefinitely, just reading /data - exits with an error if it fails
    cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: $SOURCE_POD_NAME
  namespace: $NAMESPACE
spec:
  nodeSelector:
    kubernetes.io/hostname: $current_node
  containers:
    - name: test
      image: busybox
      command:
        - sh
        - -c
        - "while true; do if ! ls -la /data > /dev/null 2>&1; then echo 'ERROR: Cannot access /data - Stale file handle or access denied' >&2; exit 1; fi; echo Read at \$(date); sleep 2; done"
      volumeMounts:
        - name: data
          mountPath: /data
          readOnly: true
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: $PVC_NAME
  restartPolicy: Never
EOF
    if ! wait_for_pod $SOURCE_POD_NAME 60; then
        error "Persistent source pod failed to start!"
        kubectl describe pod $SOURCE_POD_NAME -n $NAMESPACE
        return 1
    fi
    log "Persistent source pod is running"
    return 0
}
# Main test loop - creates snapshots and clones while source pod runs
run_bug_test() {
    local iteration=$1
    local current_node="${AVAILABLE_NODES[0]}"
    log "========== Iteration $iteration/$ITERATIONS (node: $current_node) =========="
    local pod_clone="test-pod-clone-${iteration}"
    local clone_pvc="test-clone-pvc-${iteration}"
    local snapshot_name="test-snapshot-${iteration}"
    # Check if source pod is still running
    local source_phase=$(kubectl get pod $SOURCE_POD_NAME -n $NAMESPACE -o jsonpath='{.status.phase}' 2>/dev/null)
    if [[ "$source_phase" != "Running" ]]; then
        error "Source pod is not running (phase: $source_phase)! Bug may have occurred."
        BUG_FOUND=true
        echo ""
        echo "=== Source Pod Logs ==="
        kubectl logs $SOURCE_POD_NAME -n $NAMESPACE 2>/dev/null || true
        echo ""
        echo "=== TridentVolumePublications ==="
        check_publication_state
        echo ""
        return 1
    fi
    # Step 1: Create a VolumeSnapshot from the source PVC
    log "Creating VolumeSnapshot from source PVC..."
    create_volume_snapshot $snapshot_name $PVC_NAME
    # Wait for snapshot to be ready
    # NOTE: There's a separate Trident bug where if a TridentVolume with the same PVC name
    # exists in 'Deleting' phase, the VolumeSnapshot may incorrectly reference the deleting
    # volume instead of the active one. This prevents the export policy bug from being triggered.
    # Workaround: Ensure no stale TridentVolumes exist before running this test.
    if ! kubectl wait --for=jsonpath='{.status.readyToUse}'=true volumesnapshot/$snapshot_name -n $NAMESPACE --timeout=120s 2>/dev/null; then
        warn "VolumeSnapshot failed to become ready, skipping this iteration"
        kubectl delete volumesnapshot $snapshot_name -n $NAMESPACE 2>/dev/null || true
        return 0
    fi
    # Step 2: Create a read-only clone PVC from the snapshot
    log "Creating read-only clone PVC from snapshot..."
    create_readonly_clone_pvc $clone_pvc $snapshot_name
    # Wait for clone to be bound
    if ! kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/$clone_pvc -n $NAMESPACE --timeout=120s 2>/dev/null; then
        warn "Clone PVC failed to bind, skipping this iteration"
        kubectl delete pvc $clone_pvc -n $NAMESPACE 2>/dev/null || true
        kubectl delete volumesnapshot $snapshot_name -n $NAMESPACE 2>/dev/null || true
        return 0
    fi
    # Step 3: Create short-lived clone pod on the SAME node (read-only)
    log "Creating short-lived clone pod on $current_node..."
    create_pod_with_pvc $pod_clone $current_node $clone_pvc "ro"
    if ! wait_for_pod $pod_clone 60; then
        error "Clone pod failed to start!"
        kubectl describe pod $pod_clone -n $NAMESPACE
        check_publication_state
        delete_pod $pod_clone
        kubectl delete pvc $clone_pvc -n $NAMESPACE 2>/dev/null || true
        kubectl delete volumesnapshot $snapshot_name -n $NAMESPACE 2>/dev/null || true
        read -p "Continue testing? (y/n) " -n 1 -r
        echo
        [[ ! $REPLY =~ ^[Yy]$ ]] && return 1
        return 0
    fi
    # Step 4: Wait for clone pod to complete (it reads for 5 seconds)
    log "Waiting for clone pod to complete..."
    for i in $(seq 1 15); do
        local phase=$(kubectl get pod $pod_clone -n $NAMESPACE -o jsonpath='{.status.phase}' 2>/dev/null)
        if [[ "$phase" == "Succeeded" ]] || [[ "$phase" == "Failed" ]]; then
            break
        fi
        sleep 1
    done
    # Step 5: Delete the clone pod (triggers unpublish - THIS IS WHERE THE BUG HAPPENS)
    log "Deleting clone pod (triggering unpublish)..."
    delete_pod $pod_clone
    # Step 6: Check if source pod can still read (it should still have access!)
    log "Checking if source pod still has access after clone pod unpublish..."
    # Give the unpublish a moment to complete and the failure to manifest
    sleep 2
    # Check source pod status
    source_phase=$(kubectl get pod $SOURCE_POD_NAME -n $NAMESPACE -o jsonpath='{.status.phase}' 2>/dev/null)
    if [[ "$source_phase" != "Running" ]]; then
        error "✗ Source pod FAILED after clone pod unpublish! EXPORT POLICY BUG REPRODUCED!"
        BUG_FOUND=true
        echo ""
        echo "=== Source Pod Logs ==="
        kubectl logs $SOURCE_POD_NAME -n $NAMESPACE 2>/dev/null || true
        echo ""
        echo "=== TridentVolumePublications ==="
        check_publication_state
        echo ""
        echo "=== Trident CSI Logs ==="
        kubectl logs -n trident daemonset/trident-csi --all-containers=true --tail=200 2>/dev/null | grep -i "export\|policy\|publish\|$PVC_NAME\|$clone_pvc" || true
        echo ""
        echo "=== Resources preserved for investigation ==="
        echo "  - Clone PVC: $clone_pvc"
        echo "  - Snapshot: $snapshot_name"
        echo ""
        return 1
    fi
    log "✓ Source pod still running - bug not triggered this iteration"
    # Wait between iterations (nothing is deleted here - clone PVCs and snapshots accumulate until cleanup)
    log "Waiting 5s between iterations..."
    sleep 5
}
# Create a VolumeSnapshot from source PVC
create_volume_snapshot() {
    local snapshot_name=$1
    local source_pvc=$2
    log "Creating VolumeSnapshot $snapshot_name from $source_pvc"
    cat <<EOF | kubectl apply -f -
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
  name: $snapshot_name
  namespace: $NAMESPACE
spec:
  volumeSnapshotClassName: csi-snapclass
  source:
    persistentVolumeClaimName: $source_pvc
EOF
}
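# Note: the VolumeSnapshotClass name 'csi-snapclass' above is hardcoded and is an
# assumption about the target cluster - adjust it if your Trident snapshot class is
# named differently. A quick way to check what is available:
#
#   kubectl get volumesnapshotclass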
# Create a read-only clone PVC from a VolumeSnapshot
create_readonly_clone_pvc() {
    local clone_name=$1
    local snapshot_name=$2
    log "Creating read-only clone PVC $clone_name from snapshot $snapshot_name"
    cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $clone_name
  namespace: $NAMESPACE
  annotations:
    trident.netapp.io/readOnlyClone: "true"
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: $STORAGE_CLASS
  resources:
    requests:
      storage: 1Gi
  dataSource:
    kind: VolumeSnapshot
    apiGroup: snapshot.storage.k8s.io
    name: $snapshot_name
EOF
}
# Create pod with specific PVC
create_pod_with_pvc() {
    local pod_name=$1
    local node_name=$2
    local pvc_name=$3
    local mode=$4
    local command
    local readonly_mount="false"
    if [[ "$mode" == "ro" ]]; then
        # Read-only pod just lists /data repeatedly for 5 seconds
        command="for i in 1 2 3 4 5; do ls -la /data && echo Read \$i done && sleep 1; done; echo All reads completed"
        readonly_mount="true"
    else
        # RW pod writes for 15 seconds
        command="for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do date >> /data/writes-${pod_name}.log && echo Write \$i done && sleep 1; done; echo All writes completed"
    fi
    log "Creating pod $pod_name on node $node_name with PVC $pvc_name (mode: $mode)"
    cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: $pod_name
  namespace: $NAMESPACE
spec:
  nodeSelector:
    kubernetes.io/hostname: $node_name
  containers:
    - name: test
      image: busybox
      command:
        - sh
        - -c
        - "$command"
      volumeMounts:
        - name: data
          mountPath: /data
          readOnly: $readonly_mount
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: $pvc_name
  restartPolicy: Never
EOF
}
# Cleanup function
cleanup() {
    log "Cleaning up..."
    # Delete the persistent source pod
    kubectl delete pod $SOURCE_POD_NAME -n $NAMESPACE --grace-period=1 2>/dev/null || true
    # Delete all clone pods (test-pod-clone-*)
    log "Deleting clone pods..."
    kubectl get pods -n $NAMESPACE -o name 2>/dev/null | grep "test-pod-clone-" | xargs -r kubectl delete -n $NAMESPACE --grace-period=1 2>/dev/null || true
    # Delete all clone PVCs (test-clone-pvc-*)
    log "Deleting clone PVCs..."
    kubectl get pvc -n $NAMESPACE -o name 2>/dev/null | grep "test-clone-pvc-" | xargs -r kubectl delete -n $NAMESPACE 2>/dev/null || true
    # Delete all VolumeSnapshots (test-snapshot-*)
    log "Deleting VolumeSnapshots..."
    kubectl get volumesnapshot -n $NAMESPACE -o name 2>/dev/null | grep "test-snapshot-" | xargs -r kubectl delete -n $NAMESPACE 2>/dev/null || true
    # Delete the source PVC
    log "Deleting source PVC..."
    kubectl delete pvc $PVC_NAME -n $NAMESPACE 2>/dev/null || true
    log "Cleanup finished"
}
# Main execution
main() {
    log "=== Trident NAS-Economy Read-Only Clone Export Policy Bug Test ==="
    log "This test reproduces a deterministic bug (NOT a race condition) where"
    log "read-only clone unpublish incorrectly removes export rules needed by the source."
    log ""
    log "Test scenario:"
    log "  1. Create source PVC with persistent read-only pod"
    log "  2. Each iteration: create snapshot + clone PVC + clone pod on same node"
    log "  3. Delete clone pod (triggers unpublish)"
    log "  4. Check if source pod loses access (bug is reproduced if it does)"
    log ""
    log "Configuration:"
    log "  Namespace: $NAMESPACE"
    log "  PVC: $PVC_NAME"
    log "  Storage Class: $STORAGE_CLASS"
    log "  Iterations: $ITERATIONS"
    log ""
    detect_nodes
    log ""
    log "Prerequisites check..."
    if ! kubectl get storageclass $STORAGE_CLASS > /dev/null 2>&1; then
        error "Storage class '$STORAGE_CLASS' not found"
        exit 1
    fi
    # Cleanup any existing resources
    cleanup
    sleep 2
    # Create PVC
    create_pvc
    # Create the persistent source pod that will run throughout all iterations
    if ! create_persistent_source_pod; then
        error "Failed to create persistent source pod"
        exit 1
    fi
    local success_count=0
    local failure_count=0
    # Run iterations
    for i in $(seq 1 $ITERATIONS); do
        if run_bug_test $i; then
            # Avoid ((var++)): under 'set -e' it aborts the script when the pre-increment value is 0
            success_count=$((success_count + 1))
        else
            failure_count=$((failure_count + 1))
            error "Test failed at iteration $i"
            break
        fi
        # Small delay between iterations
        sleep 1
    done
    log ""
    log "=== Test Summary ==="
    log "Successful iterations: $success_count"
    log "Failed iterations: $failure_count"
    if [[ $failure_count -gt 0 ]]; then
        error "Bug was triggered - export policy rule incorrectly removed!"
        exit 1
    else
        log "Bug not reproduced in $ITERATIONS iterations"
        log "Note: This may indicate the fix is working correctly"
        # Final cleanup only if no bug found
        cleanup
    fi
}
# Global flag to track if bug was found
BUG_FOUND=false
# Trap cleanup on exit (only if bug not found)
cleanup_on_exit() {
    if [[ "$BUG_FOUND" == "false" ]]; then
        cleanup
    else
        log "Bug was found - skipping cleanup to preserve state for investigation"
        log "Resources left for investigation:"
        log "  - Source PVC: $PVC_NAME"
        log "  - Source Pod: $SOURCE_POD_NAME"
        log "Run '$0 cleanup' manually when done investigating"
    fi
}
trap cleanup_on_exit EXIT
# Check for cleanup mode
if [[ "$1" == "cleanup" ]]; then
    log "Running in cleanup mode - removing all test resources"
    cleanup
    log "Cleanup complete"
    exit 0
fi
main "$@"