Skip to content

Instantly share code, notes, and snippets.

@liweinan
Created January 29, 2026 15:11
Show Gist options
  • Select an option

  • Save liweinan/9d65abf9759370f141d4eec93fa7de17 to your computer and use it in GitHub Desktop.

Select an option

Save liweinan/9d65abf9759370f141d4eec93fa7de17 to your computer and use it in GitHub Desktop.
quick-test.sh
#!/bin/bash
# Quick test script for OCPBUGS-69923
# Usage: ./quick-test.sh [pull-secret-file] [iterations]
#
# Convenience wrapper: resolves the directory this script lives in, prints the
# effective settings, then replaces itself (exec) with
# test-zone-consistency-local.sh from that same directory, passing the
# installer path, pull secret, a fixed base domain, a fixed region and the
# iteration count.

# Directory containing this script; the installer binary and the real test
# script are looked up next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Positional arguments with their fallbacks.
PULL_SECRET="${1:-$HOME/works/oc-swarm/openshift-versions/auth.json}"
ITERATIONS="${2:-5}"

printf '%s\n' \
  "Quick Zone Consistency Test" \
  "============================" \
  "Installer: $SCRIPT_DIR/openshift-install" \
  "Pull Secret: $PULL_SECRET" \
  "Iterations: $ITERATIONS" \
  ""

# Arguments for the delegated script, in the order it expects them:
# installer-path, pull-secret-file, base-domain, region, iterations.
delegate_args=(
  "$SCRIPT_DIR/openshift-install"
  "$PULL_SECRET"
  "qe.devcluster.openshift.com"
  "us-east-1"
  "$ITERATIONS"
)

exec "$SCRIPT_DIR/test-zone-consistency-local.sh" "${delegate_args[@]}"
@liweinan
Copy link
Author

#!/bin/bash
# OCPBUGS-69923 - Local test script for the zone consistency bug.
# Tests zone consistency of the manifests generated by the OpenShift installer.
# Runs multiple iterations to verify that CAPI and MAPI are assigned the same control-plane zones in the same order.

# Strict mode: stop on the first failing command, treat unset variables as
# errors, and let a pipeline fail when any of its stages fails.
set -o errexit -o nounset -o pipefail

# Positional arguments; every one of them falls back to a default when it is
# missing or empty (the pull secret default is empty and is rejected later).
INSTALLER="${1:-/Users/weli/works/oc-swarm/installer/bin/openshift-install}"
PULL_SECRET_FILE="${2:-}"
BASE_DOMAIN="${3:-qe.devcluster.openshift.com}"
REGION="${4:-us-east-1}"
ITERATIONS="${5:-10}"

# Scratch directory that is wiped for every iteration, and the directory where
# manifests of failed iterations are kept.
WORK_DIR="/tmp/test-zone-consistency"
ARTIFACT_DIR="/tmp/test-zone-consistency-artifacts"

# ANSI color sequences, stored as literal backslash escapes and rendered by
# `echo -e` at the point of use.
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Print the command-line help to stdout and terminate the script with
# exit status 1. Takes no arguments.
usage() {
    local self=$0

    printf '%s\n' \
        "Usage: ${self} [installer-path] [pull-secret-file] [base-domain] [region] [iterations]" \
        "" \
        "Arguments:" \
        "  installer-path    Path to openshift-install binary (default: /Users/weli/works/oc-swarm/installer/bin/openshift-install)" \
        "  pull-secret-file  Path to pull secret JSON file (required)" \
        "  base-domain       Base domain for cluster (default: qe.devcluster.openshift.com)" \
        "  region            AWS region (default: us-east-1)" \
        "  iterations        Number of iterations (default: 10)" \
        "" \
        "Example:" \
        "  ${self} ./openshift-install ~/works/oc-swarm/openshift-versions/auth.json qe.devcluster.openshift.com us-east-1 10" \
        "" \
        "  # Quick test with 5 iterations" \
        "  ${self} ./openshift-install ~/auth.json \"\" \"\" 5"

    exit 1
}

# Check prerequisites
#
# Every check below aborts the script with a non-zero status before any
# temporary file or directory has been created.

# The pull secret has no usable default, so a missing argument and a path that
# does not point to a regular file are reported separately.
if [[ -z "$PULL_SECRET_FILE" ]]; then
    echo -e "${RED}Error: Pull secret file is required${NC}"
    usage
fi

if [[ ! -f "$PULL_SECRET_FILE" ]]; then
    echo -e "${RED}Error: Pull secret file not found: $PULL_SECRET_FILE${NC}"
    usage
fi

# The iteration count drives the main loop; reject anything that is not a
# positive decimal integer (0, negative numbers, fractions, text) so the loop
# can never run zero times or a surprising number of times.
if [[ ! "$ITERATIONS" =~ ^[1-9][0-9]*$ ]]; then
    echo -e "${RED}Error: iterations must be a positive integer, got: '$ITERATIONS'${NC}"
    usage
fi

if [[ ! -x "$INSTALLER" ]]; then
    echo -e "${RED}Error: Installer not found or not executable: $INSTALLER${NC}"
    exit 1
fi

# yq is used to read fields out of the generated manifests.
if ! command -v yq &> /dev/null; then
    echo -e "${RED}Error: yq is required. Install with: brew install yq${NC}"
    exit 1
fi

# Read pull secret. jq -c rewrites the JSON in compact form; when jq is
# missing or rejects the file, the raw file contents are used instead.
PULL_SECRET=$(jq -c '.' "$PULL_SECRET_FILE" 2>/dev/null || cat "$PULL_SECRET_FILE")

# Generate temporary SSH key
TEMP_SSH_DIR=$(mktemp -d)

# Remove the temporary SSH key directory. Registered as the EXIT handler
# immediately after the directory is created, so it is also removed when a
# later command (for example ssh-keygen) aborts the script. The path is read
# when the handler runs, not spliced into the trap string, so unusual
# characters in the temp path cannot break the command.
cleanup_temp_ssh_dir() {
  rm -rf -- "${TEMP_SSH_DIR:?}"
}
trap cleanup_temp_ssh_dir EXIT

ssh-keygen -t ed25519 -f "$TEMP_SSH_DIR/temp_key" -N "" -q
SSH_PUB_KEY=$(< "$TEMP_SSH_DIR/temp_key.pub")

# Clean up old artifact directory so it only ever holds this run's results
rm -rf -- "$ARTIFACT_DIR"
mkdir -p "$ARTIFACT_DIR"

# Print the run header: title, installer path and version, and the effective
# settings for this run.
banner_rule="=========================================="

printf '%s\n' \
  "$banner_rule" \
  "OCPBUGS-69923 Zone Consistency Test" \
  "$banner_rule" \
  "Installer: $INSTALLER"

# The installer's own version output goes straight to stdout; its stderr is
# discarded and a placeholder is printed when the command fails.
if ! "$INSTALLER" version 2>/dev/null; then
  printf '%s\n' "(version info unavailable)"
fi

printf '%s\n' \
  "" \
  "Region: $REGION" \
  "Base Domain: $BASE_DOMAIN" \
  "Iterations: $ITERATIONS" \
  "Work directory: $WORK_DIR" \
  "Artifact directory: $ARTIFACT_DIR" \
  "$banner_rule" \
  ""

# Main test loop and final report.
#
# Each iteration generates manifests from a fresh install-config.yaml that
# does not specify zones, reads the control-plane zones from the CAPI machine
# manifests and from the ControlPlaneMachineSet, and compares the two lists.
#
# An iteration ends in exactly one of three states:
#   PASS          - both zone lists are identical
#   FAIL          - the lists differ (counted in TOTAL_FAILURES)
#   inconclusive  - manifests could not be generated, or no CAPI zones could
#                   be extracted, so there is nothing to compare
#
# Exit status: number of failed iterations (capped at 255 because exit codes
# wrap modulo 256), 1 when no iteration produced a verdict, 0 otherwise.

TOTAL_FAILURES=0           # iterations whose CAPI and MAPI zone lists differ
COMPARED_ITERATIONS=0      # iterations that reached a PASS/FAIL verdict
INCONCLUSIVE_ITERATIONS=0  # iterations without comparable zone data

#######################################
# Copy the files relevant to zone placement for one iteration into
# $ARTIFACT_DIR/iteration-<n> so they survive the per-iteration cleanup.
# Globals:   ARTIFACT_DIR, WORK_DIR, YELLOW, NC (read)
# Arguments: $1 - iteration number
#            $2 - path of the ControlPlaneMachineSet manifest
# Outputs:   one progress line on stdout
#######################################
save_iteration_artifacts() {
  local iteration_artifact_dir="${ARTIFACT_DIR}/iteration-$1"
  local cpms_manifest=$2

  mkdir -p "${iteration_artifact_dir}"

  echo -e "  ${YELLOW}Saving manifests to ${iteration_artifact_dir} for debugging...${NC}"

  # Copy CAPI machine manifests
  if [[ -d "$WORK_DIR/cluster-api/machines" ]]; then
    cp -r "$WORK_DIR/cluster-api/machines" "${iteration_artifact_dir}/capi-machines"
  fi

  # Copy ControlPlaneMachineSet
  if [[ -f "$cpms_manifest" ]]; then
    cp "$cpms_manifest" "${iteration_artifact_dir}/control-plane-machine-set.yaml"
  fi

  # Copy install-config for reference
  if [[ -f "$WORK_DIR/install-config.yaml" ]]; then
    cp "$WORK_DIR/install-config.yaml" "${iteration_artifact_dir}/install-config.yaml"
  fi
}

# Arithmetic loop instead of `seq`: it simply runs zero times for a count
# below 1 rather than depending on how the local seq handles that case.
for ((iteration = 1; iteration <= ITERATIONS; iteration++)); do
  echo "=========================================="
  echo "Iteration $iteration/$ITERATIONS"
  echo "=========================================="

  # Clean up everything from previous iteration. The directory is created
  # private (mode 700) because install-config.yaml embeds the pull secret.
  rm -rf -- "${WORK_DIR:?}"
  mkdir -p -m 700 "$WORK_DIR"

  # Create install-config.yaml (without specifying zones - triggers the bug path)
  cat > "$WORK_DIR/install-config.yaml" << EOF
apiVersion: v1
baseDomain: ${BASE_DOMAIN}
metadata:
  name: test-zone-${iteration}
controlPlane:
  architecture: amd64
  hyperthreading: Enabled
  name: master
  replicas: 3
compute:
- architecture: amd64
  hyperthreading: Enabled
  name: worker
  replicas: 3
platform:
  aws:
    region: ${REGION}
pullSecret: '${PULL_SECRET}'
sshKey: '${SSH_PUB_KEY}'
EOF

  cpms_file="$WORK_DIR/openshift/99_openshift-machine-api_master-control-plane-machine-set.yaml"
  capi_zones=""
  capi_count=0
  mapi_zones=""
  manifests_generated=true

  # Generate manifests
  echo "  Generating manifests..."
  if ! "$INSTALLER" create manifests --dir "$WORK_DIR" 2>&1; then
    echo -e "${YELLOW}Warning: Iteration $iteration manifest generation failed${NC}"
    manifests_generated=false
  fi

  if [[ "$manifests_generated" == true ]]; then
    # Extract CAPI zones from cluster-api/machines/10_inframachine_*-master-*.yaml
    # These are the REAL CAPI AWSMachine objects (not the misleadingly named openshift/99_openshift-cluster-api_* files)
    # The file list is read line by line on fd 3 so paths are never
    # word-split and stdin stays untouched for the commands in the loop body.
    while IFS= read -r -u 3 file; do
      # Extract zone from subnet filter name (e.g., "cluster-subnet-private-us-east-1a" -> "us-east-1a")
      subnet_name=$(yq eval '.spec.subnet.filters[0].values[0]' "$file" 2>/dev/null || echo "")
      # Extract zone from subnet name using region pattern
      zone=$(grep -oE "${REGION}[a-z]$" <<< "$subnet_name" || echo "")
      if [[ -n "$zone" && "$zone" != "null" ]]; then
        capi_zones="${capi_zones} ${zone}"
      fi
    done 3< <(find "$WORK_DIR"/cluster-api/machines -name "10_inframachine_*-master-*.yaml" -type f 2>/dev/null | sort)
    capi_zones=$(echo "$capi_zones" | xargs)
    capi_count=$(echo "$capi_zones" | wc -w | xargs)
  fi

  if [[ "$manifests_generated" != true ]]; then
    # Nothing was generated, so there is nothing to compare.
    INCONCLUSIVE_ITERATIONS=$((INCONCLUSIVE_ITERATIONS + 1))
  elif [[ $capi_count -eq 0 ]]; then
    # Without CAPI zones the MAPI list below would be empty as well (it is
    # bounded by capi_count) and two empty lists would compare equal, so this
    # case must not be reported as PASS.
    echo -e "  ${RED}ERROR: no CAPI control-plane zones could be extracted; iteration is inconclusive${NC}"
    save_iteration_artifacts "$iteration" "$cpms_file"
    INCONCLUSIVE_ITERATIONS=$((INCONCLUSIVE_ITERATIONS + 1))
  else
    COMPARED_ITERATIONS=$((COMPARED_ITERATIONS + 1))

    # Extract MAPI zones from ControlPlaneMachineSet failureDomains
    # File: openshift/99_openshift-machine-api_master-control-plane-machine-set.yaml
    # Only the first capi_count entries are read, so both lists cover the
    # same number of positions.
    mapi_count=0
    if [[ -f "$cpms_file" ]]; then
      while [[ $mapi_count -lt $capi_count ]]; do
        zone=$(yq eval ".spec.template.machines_v1beta1_machine_openshift_io.failureDomains.aws[$mapi_count].placement.availabilityZone" "$cpms_file" 2>/dev/null || echo "")
        if [[ -z "$zone" || "$zone" == "null" ]]; then
          break
        fi
        mapi_zones="${mapi_zones} ${zone}"
        mapi_count=$((mapi_count + 1))
      done
    else
      echo -e "  ${RED}ERROR: ControlPlaneMachineSet file not found!${NC}"
    fi
    mapi_zones=$(echo "$mapi_zones" | xargs)

    # Compare
    echo -e "  ${BLUE}CAPI zones${NC} (from cluster-api/machines/10_inframachine_*): $capi_zones"
    echo -e "  ${BLUE}MAPI zones${NC} (from ControlPlaneMachineSet failureDomains): $mapi_zones"

    if [[ "$capi_zones" == "$mapi_zones" ]]; then
      echo -e "  ${GREEN}✓ PASS${NC}"
    else
      echo -e "  ${RED}✗ FAIL: zones mismatch - CAPI and MAPI have different zone assignments${NC}"

      # Print detailed zone differences
      echo -e "  ${RED}ERROR DETAILS:${NC}"
      IFS=' ' read -ra capi_array <<< "$capi_zones"
      IFS=' ' read -ra mapi_array <<< "$mapi_zones"

      for i in "${!capi_array[@]}"; do
        capi_zone="${capi_array[$i]:-}"
        mapi_zone="${mapi_array[$i]:-}"
        if [[ "$capi_zone" != "$mapi_zone" ]]; then
          echo -e "    ${RED}Position $((i+1)): CAPI has '$capi_zone' but MAPI has '$mapi_zone'${NC}"
        fi
      done

      # Handle case where arrays have different lengths
      if [[ ${#capi_array[@]} -ne ${#mapi_array[@]} ]]; then
        echo -e "    ${RED}Zone count mismatch: CAPI has ${#capi_array[@]} zones, MAPI has ${#mapi_array[@]} zones${NC}"
      fi

      # Save manifests to ARTIFACT_DIR for debugging
      save_iteration_artifacts "$iteration" "$cpms_file"

      TOTAL_FAILURES=$((TOTAL_FAILURES + 1))
    fi
  fi

  # Delete all generated files for next iteration; this also runs for
  # inconclusive iterations so the pull secret never lingers in WORK_DIR.
  rm -rf -- "${WORK_DIR:?}"
  echo ""
done

echo ""
echo "=========================================="
if [[ $INCONCLUSIVE_ITERATIONS -eq 0 ]]; then
  echo "Final Result: $ITERATIONS iterations completed"
else
  echo "Final Result: $COMPARED_ITERATIONS of $ITERATIONS iterations completed ($INCONCLUSIVE_ITERATIONS inconclusive)"
fi

if [[ $TOTAL_FAILURES -gt 0 ]]; then
  echo -e "${RED}FAIL: $TOTAL_FAILURES/$COMPARED_ITERATIONS iterations had zone mismatches (OCPBUGS-69923)${NC}"
  # Percentage with two decimals, computed in hundredths of a percent with
  # integer arithmetic so no external calculator is needed.
  rate_hundredths=$((TOTAL_FAILURES * 10000 / COMPARED_ITERATIONS))
  printf -v failure_rate '%d.%02d' $((rate_hundredths / 100)) $((rate_hundredths % 100))
  echo -e "${RED}Failure rate: ${failure_rate}%${NC}"
  echo ""
  echo -e "${YELLOW}Manifest artifacts saved to ${ARTIFACT_DIR} for debugging${NC}"
  echo ""
  echo "To inspect failed manifests:"
  echo "  ls -la ${ARTIFACT_DIR}/iteration-*/"
  echo "  cat ${ARTIFACT_DIR}/iteration-*/control-plane-machine-set.yaml"
  echo "  ls ${ARTIFACT_DIR}/iteration-*/capi-machines/"
  # Exit codes wrap modulo 256; cap the count so many failures can never
  # turn into exit status 0.
  exit_code=$((TOTAL_FAILURES > 255 ? 255 : TOTAL_FAILURES))
elif [[ $COMPARED_ITERATIONS -eq 0 ]]; then
  # Not a single iteration produced a verdict: do not claim the bug is absent.
  echo -e "${RED}INCONCLUSIVE: no iteration produced comparable CAPI and MAPI zone data${NC}"
  exit_code=1
else
  if [[ $INCONCLUSIVE_ITERATIONS -eq 0 ]]; then
    echo -e "${GREEN}PASS: All iterations have consistent zone allocation between CAPI and MAPI${NC}"
  else
    echo -e "${GREEN}PASS: All $COMPARED_ITERATIONS compared iterations have consistent zone allocation between CAPI and MAPI${NC}"
  fi
  echo -e "${GREEN}This installer version does NOT have the bug.${NC}"
  exit_code=0
fi

if [[ $INCONCLUSIVE_ITERATIONS -gt 0 ]]; then
  echo -e "${YELLOW}Warning: $INCONCLUSIVE_ITERATIONS/$ITERATIONS iterations were inconclusive (manifest generation failed or no CAPI zones found)${NC}"
fi
echo "=========================================="

exit "$exit_code"

@liweinan
Copy link
Author

weli@tower ~/works/oc-swarm/installer/bin (release-4.20) 
❯ cd /Users/weli/works/oc-swarm/installer/bin && ./quick-test.sh ~/works/oc-swarm/openshift-versions/auth.json 2
Quick Zone Consistency Test
============================
Installer: /Users/weli/works/oc-swarm/installer/bin/openshift-install
Pull Secret: /Users/weli/works/oc-swarm/openshift-versions/auth.json
Iterations: 2

==========================================
OCPBUGS-69923 Zone Consistency Test
==========================================
Installer: /Users/weli/works/oc-swarm/installer/bin/openshift-install
/Users/weli/works/oc-swarm/installer/bin/openshift-install unreleased-master-12138-g4068682841f807383c1ada67691f53cd1f2022bc-dirty
built from commit 4068682841f807383c1ada67691f53cd1f2022bc
release image registry.ci.openshift.org/origin/release:4.20
release architecture unknown
default architecture amd64

Region: us-east-1
Base Domain: qe.devcluster.openshift.com
Iterations: 2
Work directory: /tmp/test-zone-consistency
Artifact directory: /tmp/test-zone-consistency-artifacts
==========================================

==========================================
Iteration 1/2
==========================================
  Generating manifests...
WARNING Release Image Architecture not detected. Release Image Architecture is unknown 
INFO Credentials loaded from the "default" profile in file "/Users/weli/.aws/credentials" 
INFO Credentials loaded from the AWS config using "SharedConfigCredentials: /Users/weli/.aws/credentials" provider 
INFO Consuming Install Config from target directory 
INFO Successfully populated MCS CA cert information: root-ca 2036-01-27T15:00:04Z 2026-01-29T15:00:04Z 
INFO Successfully populated MCS TLS cert information: root-ca 2036-01-27T15:00:04Z 2026-01-29T15:00:04Z 
INFO Adding clusters...                           
INFO Manifests created in: /tmp/test-zone-consistency/cluster-api, /tmp/test-zone-consistency/manifests and /tmp/test-zone-consistency/openshift 
  CAPI zones (from cluster-api/machines/10_inframachine_*): us-east-1f us-east-1a us-east-1b
  MAPI zones (from ControlPlaneMachineSet failureDomains): us-east-1c us-east-1d us-east-1f
  ✗ FAIL: zones mismatch - CAPI and MAPI have different zone assignments
  ERROR DETAILS:
    Position 1: CAPI has 'us-east-1f' but MAPI has 'us-east-1c'
    Position 2: CAPI has 'us-east-1a' but MAPI has 'us-east-1d'
    Position 3: CAPI has 'us-east-1b' but MAPI has 'us-east-1f'
  Saving manifests to /tmp/test-zone-consistency-artifacts/iteration-1 for debugging...

==========================================
Iteration 2/2
==========================================
  Generating manifests...
WARNING Release Image Architecture not detected. Release Image Architecture is unknown 
INFO Credentials loaded from the "default" profile in file "/Users/weli/.aws/credentials" 
INFO Credentials loaded from the AWS config using "SharedConfigCredentials: /Users/weli/.aws/credentials" provider 
INFO Consuming Install Config from target directory 
INFO Successfully populated MCS CA cert information: root-ca 2036-01-27T15:00:19Z 2026-01-29T15:00:19Z 
INFO Successfully populated MCS TLS cert information: root-ca 2036-01-27T15:00:19Z 2026-01-29T15:00:19Z 
INFO Adding clusters...                           
INFO Manifests created in: /tmp/test-zone-consistency/cluster-api, /tmp/test-zone-consistency/manifests and /tmp/test-zone-consistency/openshift 
  CAPI zones (from cluster-api/machines/10_inframachine_*): us-east-1b us-east-1c us-east-1d
  MAPI zones (from ControlPlaneMachineSet failureDomains): us-east-1f us-east-1a us-east-1b
  ✗ FAIL: zones mismatch - CAPI and MAPI have different zone assignments
  ERROR DETAILS:
    Position 1: CAPI has 'us-east-1b' but MAPI has 'us-east-1f'
    Position 2: CAPI has 'us-east-1c' but MAPI has 'us-east-1a'
    Position 3: CAPI has 'us-east-1d' but MAPI has 'us-east-1b'
  Saving manifests to /tmp/test-zone-consistency-artifacts/iteration-2 for debugging...


==========================================
Final Result: 2 iterations completed
FAIL: 2/2 iterations had zone mismatches (OCPBUGS-69923)
Failure rate: 100.00%

Manifest artifacts saved to /tmp/test-zone-consistency-artifacts for debugging

To inspect failed manifests:
  ls -la /tmp/test-zone-consistency-artifacts/iteration-*/
  cat /tmp/test-zone-consistency-artifacts/iteration-*/control-plane-machine-set.yaml
  ls /tmp/test-zone-consistency-artifacts/iteration-*/capi-machines/
==========================================

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment