Last active
February 24, 2025 17:32
-
-
Save martinkennelly/ca62a7eebabe5ffd715a98c86ceaf764 to your computer and use it in GitHub Desktop.
Check OCP Nodes with SDN CNI for stale IPs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Iterate over all Nodes and check for stale IP allocations. Retry problematic Nodes until success.
# Ctrl + C to cancel

# -e: exit on unhandled errors, -u: error on unset variables,
# -o pipefail: a pipeline fails if any stage fails.
# The `-o` is required: a bare `set -eu pipefail` only assigns the word
# "pipefail" to $1 and leaves the pipefail option switched off.
set -euo pipefail

# Directory searched for IP allocation files on each Node.
readonly CNI_NETWORK_DIR="/host/var/lib/cni/networks/openshift-sdn"
#######################################
# Report whether the argument is a (possibly signed) decimal integer.
# Arguments: $1 - candidate value; a missing argument is treated as empty
# Returns:   0 if $1 is an integer, 1 otherwise
#######################################
function is_number {
  # ${1-} keeps a call without arguments from aborting the script under `set -u`.
  local candidate="${1-}"

  [ -n "$candidate" ] || return 1
  # `-eq` only accepts integer operands; for anything else `[` fails with an
  # error message, which is intentionally silenced here.
  [ "$candidate" -eq "$candidate" ] 2>/dev/null || return 1
  return 0
}
#######################################
# Compare the number of running, non-hostNetwork pods scheduled on a Node
# with the number of IPv4-named files found under CNI_NETWORK_DIR on it.
# Globals:   CNI_NETWORK_DIR (read)
# Arguments: $1 - Node name
# Outputs:   diagnostics on stdout when the check does not pass; nothing on
#            success. Messages stay on stdout so that a caller capturing the
#            output still sees non-empty text on failure.
# Returns:   0 if both counts match, 1 if they differ,
#            2 if a count could not be determined
#######################################
function check_node {
  local node_name=$1
  local pods_json expected_num_ips actual_num_ips

  # The listing runs on its own so that a failed API call is detected rather
  # than being counted as "0 pods" by the trailing wc.
  if ! pods_json="$(oc get pods -A \
    --field-selector="spec.nodeName=${node_name},status.phase=Running" \
    -o json --no-headers)"; then
    echo "failed to list pods for Node $node_name"
    return 2
  fi

  # A field selector for hostNetwork doesn't work, so filter the JSON with jq.
  expected_num_ips="$(printf '%s\n' "$pods_json" \
    | jq -r '.items[] | select(.spec.hostNetwork | not).metadata.name' \
    | wc -l)"
  # wc may pad its count with blanks; strip them before validating.
  expected_num_ips="${expected_num_ips//[[:space:]]/}"
  if ! is_number "$expected_num_ips"; then
    echo "expected integer: $expected_num_ips"
    return 2
  fi

  # Count files whose name looks like a dotted-quad IPv4 address
  # (presumably one file per allocated IP - confirm against the CNI plugin).
  # NOTE(review): only wc's status is observed here, so a failed `oc debug`
  # yields a count of 0 - confirm whether that should be reported as 2.
  actual_num_ips="$(oc debug --quiet "node/${node_name}" -- \
    find "$CNI_NETWORK_DIR" -type f -regextype posix-extended \
    -regex '.*/([0-9]{1,3}\.){3}[0-9]{1,3}$' \
    | wc -l)"
  actual_num_ips="${actual_num_ips//[[:space:]]/}"
  if ! is_number "$actual_num_ips"; then
    echo "expected integer: $actual_num_ips"
    return 2
  fi

  if ! [ "$expected_num_ips" -eq "$actual_num_ips" ]; then
    echo "Mismatch in IPs. ONLY, if this persists, please gather must-gather, sosreport for Node $node_name and also zip the CNI directory /var/lib/cni."
    echo "Expected $expected_num_ips IP(s) but found $actual_num_ips IP(s)"
    echo "If possible, please also save a snapshot of your prometheus database for the last two hours"
    return 1
  fi
  return 0
}
# Space-separated names of every Node in the cluster.
NODE_NAMES=$(oc get nodes -o 'jsonpath={.items[*].metadata.name}' --no-headers)

# Word-splitting of NODE_NAMES is intentional: the jsonpath output is one
# space-separated list.
# shellcheck disable=SC2086
for node_name in $NODE_NAMES; do
  echo "$(date '+%d/%m/%Y %H:%M:%S') checking Node $node_name for stale IPs (warning that there maybe a race between this script checking IPs in kapi and whats written on disk - rerun to ensure consistent fail signature)"
  # Loop on check_node's exit status. Running it directly (instead of
  # capturing its output in a command substitution) also lets its
  # diagnostics reach the terminal.
  until check_node "$node_name"; do
    echo "Retrying Node $node_name in 2 seconds..."
    sleep 2
  done
done
#######################################
# Report whether the argument is a (possibly signed) decimal integer.
# is_number is also defined earlier in this file; this later definition
# takes over from the point where the script reaches it.
# Arguments: $1 - candidate value; a missing argument is treated as empty
# Returns:   0 if $1 is an integer, 1 otherwise
#######################################
function is_number {
  # ${1-} keeps a call without arguments from aborting the script under `set -u`.
  local candidate="${1-}"

  [ -n "$candidate" ] || return 1
  # `-eq` only accepts integer operands; for anything else `[` fails with an
  # error message, which is intentionally silenced here.
  [ "$candidate" -eq "$candidate" ] 2>/dev/null || return 1
  return 0
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment