netapp trident nas-economy snapshot export policy bug
#!/bin/bash
# Script to reproduce the read-only clone export policy bug in Trident nas-economy driver
#
# Bug Description:
# When a read-only clone is deleted on the same node as its source volume, the unpublish
# operation incorrectly removes export policy rules that are still needed by the source.
# This is NOT a race condition - it's a deterministic logic bug where the code fails to
# check for remaining publications of the source volume before removing export rules.
#
# Test Scenario:
# - Create a source PVC with a persistent read-only pod running throughout all iterations
# - Each iteration creates a new VolumeSnapshot and read-only clone PVC on the same node
# - Create a short-lived clone pod, then delete it to trigger unpublish
# - The bug causes the source pod to lose NFS access with "Stale file handle" errors
#
# Known Issues:
# - There's a separate Trident bug where VolumeSnapshots may incorrectly reference a
#   TridentVolume in 'Deleting' phase instead of the active volume with the same name.
#   This can prevent this test from triggering the export policy bug.
#   Workaround: Ensure no stale TridentVolumes exist before running this test.
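#   A quick pre-flight check for this workaround (a sketch only; it assumes
#   Trident's CRDs live in the 'trident' namespace and that the deleting state
#   is visible somewhere in the CR output, which may vary between versions):
#     kubectl get tridentvolumes.trident.netapp.io -n trident -o yaml | grep -i deleting
#   Any matches point at stale volumes to resolve before starting the test.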
#
# Usage:
#   ./test-readonly-clone-export-policy-bug.sh          # Run the test
#   ./test-readonly-clone-export-policy-bug.sh cleanup  # Clean up test resources
set -e

# Configuration
NAMESPACE="${NAMESPACE:-default}"
PVC_NAME="${PVC_NAME:-test-export-bug-pvc}"
STORAGE_CLASS="${STORAGE_CLASS:-nfs}" # Adjust to your nas-economy storage class
ITERATIONS="${ITERATIONS:-10}"        # Number of times to repeat the test

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

log() {
  echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
}

error() {
  echo -e "${RED}[$(date '+%Y-%m-%d %H:%M:%S')] ERROR:${NC} $*"
}

warn() {
  echo -e "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')] WARN:${NC} $*"
}

# Detect nodes
detect_nodes() {
  log "Auto-detecting worker nodes..."
  AVAILABLE_NODES=($(kubectl get nodes --selector='!node-role.kubernetes.io/master,!node-role.kubernetes.io/control-plane' -o jsonpath='{.items[*].metadata.name}'))
  if [[ ${#AVAILABLE_NODES[@]} -lt 1 ]]; then
    error "Need at least 1 worker node. Found: ${#AVAILABLE_NODES[@]}"
    exit 1
  fi
  log "Available nodes: ${AVAILABLE_NODES[*]}"
  log "Will use node ${AVAILABLE_NODES[0]} for all pods"
}

# Create PVC
create_pvc() {
  log "Creating PVC: $PVC_NAME"
  cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $PVC_NAME
  namespace: $NAMESPACE
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: $STORAGE_CLASS
  resources:
    requests:
      storage: 1Gi
EOF
  log "Waiting for PVC to be bound..."
  kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/$PVC_NAME -n $NAMESPACE --timeout=60s
  log "PVC is bound"
}

# Wait for pod to be running
wait_for_pod() {
  local pod_name=$1
  local timeout=${2:-30}
  log "Waiting for pod $pod_name to be running..."
  if kubectl wait --for=condition=Ready pod/$pod_name -n $NAMESPACE --timeout=${timeout}s 2>/dev/null; then
    log "Pod $pod_name is running"
    return 0
  else
    warn "Pod $pod_name failed to start within ${timeout}s"
    return 1
  fi
}

# Delete pod
delete_pod() {
  local pod_name=$1
  log "Deleting pod $pod_name"
  kubectl delete pod $pod_name -n $NAMESPACE --grace-period=1 2>/dev/null || true
}

# Check for tvolpub vs export policy mismatch
check_publication_state() {
  local pv_name=$(kubectl get pvc $PVC_NAME -n $NAMESPACE -o jsonpath='{.spec.volumeName}' 2>/dev/null)
  if [[ -z "$pv_name" ]]; then
    return 0
  fi
  # Get volume publications for this PVC
  local tvolpubs=$(kubectl get tvp -A -o jsonpath="{.items[?(@.spec.volumeID=='$pv_name')].metadata.name}" 2>/dev/null || true)
  if [[ -n "$tvolpubs" ]]; then
    log "Active TridentVolumePublications for $pv_name: $tvolpubs"
  fi
}
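# Optional deeper check - a sketch only, assuming CLI access to the ONTAP cluster and
# that you have already looked up the export policy Trident assigned to the qtree
# (e.g. from the Trident controller logs); <svm> and <trident-policy> are placeholders:
#   ssh admin@<cluster-mgmt-lif> "vserver export-policy rule show -vserver <svm> -policyname <trident-policy>"
# When the bug fires, the node's client-match rule disappears from that policy while
# the source pod still has the volume mounted.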
# Long-running source pod name (created once, runs throughout all iterations)
SOURCE_POD_NAME="test-pod-source-persistent"

# Create the persistent source pod before iterations start
create_persistent_source_pod() {
  local current_node="${AVAILABLE_NODES[0]}"
  log "Creating persistent read-only source pod on $current_node..."
  # This pod runs indefinitely, just reading /data - exits with error if it fails
  cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: $SOURCE_POD_NAME
  namespace: $NAMESPACE
spec:
  nodeSelector:
    kubernetes.io/hostname: $current_node
  containers:
    - name: test
      image: busybox
      command:
        - sh
        - -c
        - "while true; do if ! ls -la /data > /dev/null 2>&1; then echo 'ERROR: Cannot access /data - Stale file handle or access denied' >&2; exit 1; fi; echo Read at \$(date); sleep 2; done"
      volumeMounts:
        - name: data
          mountPath: /data
          readOnly: true
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: $PVC_NAME
  restartPolicy: Never
EOF
  if ! wait_for_pod $SOURCE_POD_NAME 60; then
    error "Persistent source pod failed to start!"
    kubectl describe pod $SOURCE_POD_NAME -n $NAMESPACE
    return 1
  fi
  log "Persistent source pod is running"
  return 0
}

# Main test loop - creates snapshots and clones while source pod runs
run_bug_test() {
  local iteration=$1
  local current_node="${AVAILABLE_NODES[0]}"
  log "========== Iteration $iteration/$ITERATIONS (node: $current_node) =========="
  local pod_clone="test-pod-clone-${iteration}"
  local clone_pvc="test-clone-pvc-${iteration}"
  local snapshot_name="test-snapshot-${iteration}"

  # Check if source pod is still running
  local source_phase=$(kubectl get pod $SOURCE_POD_NAME -n $NAMESPACE -o jsonpath='{.status.phase}' 2>/dev/null)
  if [[ "$source_phase" != "Running" ]]; then
    error "Source pod is not running (phase: $source_phase)! Bug may have occurred."
    BUG_FOUND=true
    echo ""
    echo "=== Source Pod Logs ==="
    kubectl logs $SOURCE_POD_NAME -n $NAMESPACE 2>/dev/null || true
    echo ""
    echo "=== TridentVolumePublications ==="
    check_publication_state
    echo ""
    return 1
  fi

  # Step 1: Create a VolumeSnapshot from the source PVC
  log "Creating VolumeSnapshot from source PVC..."
  create_volume_snapshot $snapshot_name $PVC_NAME

  # Wait for snapshot to be ready
  # NOTE: There's a separate Trident bug where if a TridentVolume with the same PVC name
  # exists in 'Deleting' phase, the VolumeSnapshot may incorrectly reference the deleting
  # volume instead of the active one. This prevents the export policy bug from being triggered.
  # Workaround: Ensure no stale TridentVolumes exist before running this test.
  if ! kubectl wait --for=jsonpath='{.status.readyToUse}'=true volumesnapshot/$snapshot_name -n $NAMESPACE --timeout=120s 2>/dev/null; then
    warn "VolumeSnapshot failed to become ready, skipping this iteration"
    kubectl delete volumesnapshot $snapshot_name -n $NAMESPACE 2>/dev/null || true
    return 0
  fi

  # Step 2: Create a read-only clone PVC from the snapshot
  log "Creating read-only clone PVC from snapshot..."
  create_readonly_clone_pvc $clone_pvc $snapshot_name

  # Wait for clone to be bound
  if ! kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/$clone_pvc -n $NAMESPACE --timeout=120s 2>/dev/null; then
    warn "Clone PVC failed to bind, skipping this iteration"
    kubectl delete pvc $clone_pvc -n $NAMESPACE 2>/dev/null || true
    kubectl delete volumesnapshot $snapshot_name -n $NAMESPACE 2>/dev/null || true
    return 0
  fi

  # Step 3: Create short-lived clone pod on the SAME node (read-only)
  log "Creating short-lived clone pod on $current_node..."
  create_pod_with_pvc $pod_clone $current_node $clone_pvc "ro"
  if ! wait_for_pod $pod_clone 60; then
    error "Clone Pod failed to start!"
    kubectl describe pod $pod_clone -n $NAMESPACE
    check_publication_state
    delete_pod $pod_clone
    kubectl delete pvc $clone_pvc -n $NAMESPACE 2>/dev/null || true
    kubectl delete volumesnapshot $snapshot_name -n $NAMESPACE 2>/dev/null || true
    read -p "Continue testing? (y/n) " -n 1 -r
    echo
    [[ ! $REPLY =~ ^[Yy]$ ]] && return 1
    return 0
  fi

  # Step 4: Wait for clone pod to complete (it reads for 5 seconds)
  log "Waiting for clone pod to complete..."
  local i # keep the poll counter local so it doesn't clobber main's iteration variable
  for i in $(seq 1 15); do
    local phase=$(kubectl get pod $pod_clone -n $NAMESPACE -o jsonpath='{.status.phase}' 2>/dev/null)
    if [[ "$phase" == "Succeeded" ]] || [[ "$phase" == "Failed" ]]; then
      break
    fi
    sleep 1
  done

  # Step 5: Delete the clone pod (triggers unpublish - THIS IS WHERE THE BUG HAPPENS)
  log "Deleting clone pod (triggering unpublish)..."
  delete_pod $pod_clone

  # Step 6: Check if source pod can still read (it should still have access!)
  log "Checking if source pod still has access after clone pod unpublish..."
  # Give the unpublish operation a moment to take effect on the export policy
  sleep 2
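  # Optional, more immediate probe (a sketch; left as a comment rather than wired
  # into the flow): exec into the source pod and read /data directly - a
  # "Stale file handle" error here confirms the missing export rule without
  # waiting for the pod's own read loop to fail:
  #   kubectl exec $SOURCE_POD_NAME -n $NAMESPACE -- ls /data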
  # Check source pod status
  source_phase=$(kubectl get pod $SOURCE_POD_NAME -n $NAMESPACE -o jsonpath='{.status.phase}' 2>/dev/null)
  if [[ "$source_phase" != "Running" ]]; then
    error "✗ Source pod FAILED after clone pod unpublish! EXPORT POLICY BUG REPRODUCED!"
    BUG_FOUND=true
    echo ""
    echo "=== Source Pod Logs ==="
    kubectl logs $SOURCE_POD_NAME -n $NAMESPACE 2>/dev/null || true
    echo ""
    echo "=== TridentVolumePublications ==="
    check_publication_state
    echo ""
    echo "=== Trident CSI Logs ==="
    kubectl logs -n trident daemonset/trident-csi --all-containers=true --tail=200 2>/dev/null | grep -i "export\|policy\|publish\|$PVC_NAME\|$clone_pvc" || true
    echo ""
    echo "=== Resources preserved for investigation ==="
    echo "  - Clone PVC: $clone_pvc"
    echo "  - Snapshot: $snapshot_name"
    echo ""
    return 1
  fi
  log "✓ Source pod still running - bug not triggered this iteration"

  # Wait between iterations (nothing is deleted here - clone PVCs and snapshots accumulate)
  log "Waiting 5s between iterations..."
  sleep 5
}
# Create a VolumeSnapshot from source PVC
create_volume_snapshot() {
  local snapshot_name=$1
  local source_pvc=$2
  log "Creating VolumeSnapshot $snapshot_name from $source_pvc"
  cat <<EOF | kubectl apply -f -
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
  name: $snapshot_name
  namespace: $NAMESPACE
spec:
  volumeSnapshotClassName: csi-snapclass
  source:
    persistentVolumeClaimName: $source_pvc
EOF
}

# Create a read-only clone PVC from a VolumeSnapshot
create_readonly_clone_pvc() {
  local clone_name=$1
  local snapshot_name=$2
  log "Creating read-only clone PVC $clone_name from snapshot $snapshot_name"
  cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $clone_name
  namespace: $NAMESPACE
  annotations:
    trident.netapp.io/readOnlyClone: "true"
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: $STORAGE_CLASS
  resources:
    requests:
      storage: 1Gi
  dataSource:
    kind: VolumeSnapshot
    apiGroup: snapshot.storage.k8s.io
    name: $snapshot_name
EOF
}
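# Why a read-only clone exercises the buggy path (hedged note, based on the
# documented behavior of the trident.netapp.io/readOnlyClone annotation): the
# clone PV does not provision new storage but exposes the source volume's
# snapshot contents, so it shares the source's NFS export. Unpublishing the
# clone therefore touches the very export policy the source pod depends on,
# which is where the nas-economy driver removes rules without first checking
# for remaining publications of the source volume.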
# Create pod with specific PVC
create_pod_with_pvc() {
  local pod_name=$1
  local node_name=$2
  local pvc_name=$3
  local mode=$4
  local command
  local readonly_mount="false"
  if [[ "$mode" == "ro" ]]; then
    # Read-only pod just lists /data repeatedly for 5 seconds
    command="for i in 1 2 3 4 5; do ls -la /data && echo Read \$i done && sleep 1; done; echo All reads completed"
    readonly_mount="true"
  else
    # RW pod writes for 15 seconds
    command="for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do date >> /data/writes-${pod_name}.log && echo Write \$i done && sleep 1; done; echo All writes completed"
  fi
  log "Creating pod $pod_name on node $node_name with PVC $pvc_name (mode: $mode)"
  cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: $pod_name
  namespace: $NAMESPACE
spec:
  nodeSelector:
    kubernetes.io/hostname: $node_name
  containers:
    - name: test
      image: busybox
      command:
        - sh
        - -c
        - "$command"
      volumeMounts:
        - name: data
          mountPath: /data
          readOnly: $readonly_mount
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: $pvc_name
  restartPolicy: Never
EOF
}
# Cleanup function
cleanup() {
  log "Cleaning up..."
  # Delete the persistent source pod
  kubectl delete pod $SOURCE_POD_NAME -n $NAMESPACE --grace-period=1 2>/dev/null || true
  # Delete all clone pods (test-pod-clone-*)
  log "Deleting clone pods..."
  kubectl get pod -n $NAMESPACE -o name 2>/dev/null | grep "test-pod-clone-" | xargs -r kubectl delete -n $NAMESPACE --grace-period=1 2>/dev/null || true
  # Delete all clone PVCs (test-clone-pvc-*)
  log "Deleting clone PVCs..."
  kubectl get pvc -n $NAMESPACE -o name 2>/dev/null | grep "test-clone-pvc-" | xargs -r kubectl delete -n $NAMESPACE 2>/dev/null || true
  # Delete all VolumeSnapshots (test-snapshot-*)
  log "Deleting VolumeSnapshots..."
  kubectl get volumesnapshot -n $NAMESPACE -o name 2>/dev/null | grep "test-snapshot-" | xargs -r kubectl delete -n $NAMESPACE 2>/dev/null || true
  # Delete the source PVC
  log "Deleting source PVC..."
  kubectl delete pvc $PVC_NAME -n $NAMESPACE 2>/dev/null || true
  log "Cleanup finished"
}
# Main execution
main() {
  log "=== Trident NAS-Economy Read-Only Clone Export Policy Bug Test ==="
  log "This test reproduces a deterministic bug (NOT a race condition) where"
  log "read-only clone unpublish incorrectly removes export rules needed by the source."
  log ""
  log "Test scenario:"
  log "  1. Create source PVC with persistent read-only pod"
  log "  2. Each iteration: create snapshot + clone PVC + clone pod on same node"
  log "  3. Delete clone pod (triggers unpublish)"
  log "  4. Check if source pod loses access (bug is reproduced if it does)"
  log ""
  log "Configuration:"
  log "  Namespace: $NAMESPACE"
  log "  PVC: $PVC_NAME"
  log "  Storage Class: $STORAGE_CLASS"
  log "  Iterations: $ITERATIONS"
  log ""
  detect_nodes
  log ""
  log "Prerequisites check..."
  if ! kubectl get storageclass $STORAGE_CLASS > /dev/null 2>&1; then
    error "Storage class '$STORAGE_CLASS' not found"
    exit 1
  fi

  # Cleanup any existing resources
  cleanup
  sleep 2

  # Create PVC
  create_pvc

  # Create the persistent source pod that will run throughout all iterations
  if ! create_persistent_source_pod; then
    error "Failed to create persistent source pod"
    exit 1
  fi

  local success_count=0
  local failure_count=0

  # Run iterations
  for i in $(seq 1 $ITERATIONS); do
    if run_bug_test $i; then
      # Plain arithmetic assignment here: ((var++)) returns a non-zero status when
      # the old value is 0, which would abort the script under 'set -e'
      success_count=$((success_count + 1))
    else
      failure_count=$((failure_count + 1))
      error "Test failed at iteration $i"
      break
    fi
    # Small delay between iterations
    sleep 1
  done
  log ""
  log "=== Test Summary ==="
  log "Successful iterations: $success_count"
  log "Failed iterations: $failure_count"
  if [[ $failure_count -gt 0 ]]; then
    error "Bug was triggered - export policy rule incorrectly removed!"
    exit 1
  else
    log "Bug not reproduced in $ITERATIONS iterations"
    log "Note: This may indicate the fix is working correctly"
    # Final cleanup only if no bug found
    cleanup
  fi
}

# Global flag to track if bug was found
BUG_FOUND=false

# Trap cleanup on exit (only if bug not found)
cleanup_on_exit() {
  if [[ "$BUG_FOUND" == "false" ]]; then
    cleanup
  else
    log "Bug was found - skipping cleanup to preserve state for investigation"
    log "Resources left for investigation:"
    log "  - Source PVC: $PVC_NAME"
    log "  - Source Pod: $SOURCE_POD_NAME"
    log "Run 'cleanup' manually when done investigating"
  fi
}
trap cleanup_on_exit EXIT

# Check for cleanup mode
if [[ "$1" == "cleanup" ]]; then
  log "Running in cleanup mode - removing all test resources"
  cleanup
  log "Cleanup complete"
  exit 0
fi

main "$@"