@irizzant
Last active February 6, 2026 16:07
#!/usr/bin/env bash
# Runbook: reproduce the vcluster endpoint sync problem locally with Kind
# This script replicates the production environment in order to test the bug
set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration
CLUSTER_NAME="vcluster-test"
NAMESPACE="test-env-development"
VCLUSTER_NAME="development-vcluster"

log_info() {
  echo -e "${GREEN}[INFO]${NC} $1"
}

log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
  echo -e "${RED}[ERROR]${NC} $1"
}
# Step 1: Create a Kind cluster with 3 worker nodes
create_kind_cluster() {
  log_info "Creating Kind cluster with 3 worker nodes..."
  cat <<EOF | kind create cluster --name ${CLUSTER_NAME} --config=-
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
networking:
  disableDefaultCNI: true # disable the default CNI so we can install Cilium
  podSubnet: "10.244.0.0/16"
  serviceSubnet: "10.96.0.0/12"
nodes:
  - role: control-plane
    kubeadmConfigPatches:
      - |
        kind: InitConfiguration
        nodeRegistration:
          kubeletExtraArgs:
            node-labels: "ingress-ready=true"
  - role: worker
    labels:
      team: "test-project"
      eng.it/project: "test-project"
      eng.it/karpenter-role: "project-test-project-env"
  - role: worker
    labels:
      team: "test-project"
      eng.it/project: "test-project"
      eng.it/karpenter-role: "project-test-project-env"
  - role: worker
    labels:
      team: "test-project"
      eng.it/project: "test-project"
      eng.it/karpenter-role: "project-test-project-env"
EOF
  log_info "Kind cluster created successfully"
}
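
# Optional sanity check (not invoked by main): confirms that the worker labels declared in
# the Kind config above actually landed on the nodes. Minimal sketch using plain kubectl.
verify_node_labels() {
  log_info "Worker nodes carrying the test-project labels:"
  kubectl get nodes -l team=test-project -L eng.it/project -L eng.it/karpenter-role
}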
# Step 2: Install Cilium (configuration adapted for Kind)
install_cilium() {
  log_info "Installing Cilium..."
  # Add the Helm repo (ignore if it already exists)
  helm repo add cilium https://helm.cilium.io/ 2>/dev/null || true
  helm repo update
  # Install Cilium with a configuration similar to EKS but adapted for Kind
  # Note: we do not use ENI mode on Kind, we use tunnel mode instead
  helm install cilium cilium/cilium \
    --namespace kube-system \
    --version 1.18.5 \
    --set ipam.mode=kubernetes \
    --set hubble.relay.enabled=true \
    --set hubble.ui.enabled=true \
    --set kubeProxyReplacement=true \
    --set k8sServiceHost=${CLUSTER_NAME}-control-plane \
    --set k8sServicePort=6443 \
    --set socketLB.hostNamespaceOnly=true \
    --set localRedirectPolicy=true \
    --set loadBalancer.serviceTopology=true \
    --wait
  log_info "Cilium installed successfully"
  # Wait for Cilium to be ready
  log_info "Waiting for Cilium to become ready..."
  kubectl wait --for=condition=ready pod -l k8s-app=cilium -n kube-system --timeout=300s
}
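
# Optional check (not invoked by main): if the cilium CLI happens to be installed on the
# host it can confirm agent and operator health before moving on; otherwise we just list
# the agent pods. Hedged sketch, not required by the runbook.
verify_cilium_status() {
  if command -v cilium >/dev/null 2>&1; then
    cilium status --wait
  else
    kubectl -n kube-system get pods -l k8s-app=cilium -o wide
  fi
}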
# Step 3: Create the namespace and taint the workers (simulates the Karpenter NodePool)
prepare_environment() {
  log_info "Preparing environment..."
  # Create the namespace with the label used by the nodeSelector
  kubectl create namespace ${NAMESPACE}
  kubectl label namespace ${NAMESPACE} team=test-project
  # Taint the worker nodes (simulates Karpenter behaviour).
  # Select them via the team label set in the Kind config: Kind does not add a role=worker label.
  log_info "Applying taints to the worker nodes..."
  for node in $(kubectl get nodes -l team=test-project -o jsonpath='{.items[*].metadata.name}'); do
    kubectl taint nodes ${node} eng.it/karpenter-role=project-test-project-env:NoSchedule --overwrite
  done
  log_info "Environment ready"
}
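
# Optional check (not invoked by main): prints the taints actually applied to the workers,
# so a misfiring node selector in prepare_environment is easy to spot. Minimal jsonpath sketch.
verify_node_taints() {
  kubectl get nodes -l team=test-project \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.taints}{"\n"}{end}'
}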
# Step 4: Install vcluster with a production-like configuration
install_vcluster() {
  log_info "Installing vcluster with the production configuration..."
  # Write the values file
  cat > /tmp/vcluster-values.yaml <<EOF
# Simplified configuration for Kind - no external hostname
exportKubeConfig:
  server: "https://localhost:8443"
controlPlane:
  distro:
    k8s:
      enabled: true
      version: "v1.32.2"
  coredns:
    deployment:
      replicas: 2
  backingStore:
    etcd:
      deploy:
        enabled: true
        statefulSet:
          highAvailability:
            replicas: 3
          extraArgs:
            - "--auto-compaction-retention=30m"
            - "--auto-compaction-mode=periodic"
            - "--quota-backend-bytes=8589934592"
          scheduling:
            tolerations:
              - effect: "NoSchedule"
                key: "eng.it/karpenter-role"
                operator: "Equal"
                value: "project-test-project-env"
            nodeSelector:
              team: "test-project"
              "eng.it/project": "test-project"
              "eng.it/karpenter-role": "project-test-project-env"
            affinity:
              podAffinity:
                preferredDuringSchedulingIgnoredDuringExecution:
                  - weight: 100
                    podAffinityTerm:
                      labelSelector:
                        matchExpressions:
                          - key: "app"
                            operator: "In"
                            values: ["vcluster-etcd"]
                          - key: "release"
                            operator: "In"
                            values: ["${VCLUSTER_NAME}"]
                      topologyKey: "topology.kubernetes.io/zone"
              podAntiAffinity:
                requiredDuringSchedulingIgnoredDuringExecution:
                  - labelSelector:
                      matchExpressions:
                        - key: "app"
                          operator: "In"
                          values: ["vcluster-etcd"]
                        - key: "release"
                          operator: "In"
                          values: ["${VCLUSTER_NAME}"]
                    topologyKey: "kubernetes.io/hostname"
  statefulSet:
    highAvailability:
      replicas: 3
    scheduling:
      tolerations:
        - effect: "NoSchedule"
          key: "eng.it/karpenter-role"
          operator: "Equal"
          value: "project-test-project-env"
      nodeSelector:
        team: "test-project"
        "eng.it/project": "test-project"
        "eng.it/karpenter-role": "project-test-project-env"
      affinity:
        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: "app"
                      operator: "In"
                      values: ["vcluster"]
                    - key: "release"
                      operator: "In"
                      values: ["${VCLUSTER_NAME}"]
                topologyKey: "topology.kubernetes.io/zone"
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: "app"
                    operator: "In"
                    values: ["vcluster"]
                  - key: "release"
                    operator: "In"
                    values: ["${VCLUSTER_NAME}"]
              topologyKey: "kubernetes.io/hostname"
sync:
  toHost:
    podDisruptionBudgets:
      enabled: true
    pods:
      enabled: true
      enforceTolerations:
        - "eng.it/karpenter-role=project-test-project-env:NoSchedule"
    serviceAccounts:
      enabled: true
    ingresses:
      enabled: true
  fromHost:
    nodes:
      enabled: true
      selector:
        labels:
          team: "test-project"
          "eng.it/project": "test-project"
          "eng.it/karpenter-role": "project-test-project-env"
integrations:
  metricsServer:
    enabled: true
    nodes: true
    pods: true
EOF
  # Add the vcluster Helm repo (ignore if it already exists)
  helm repo add loft-sh https://charts.loft.sh 2>/dev/null || true
  helm repo update
  # Install vcluster
  helm install ${VCLUSTER_NAME} loft-sh/vcluster \
    --namespace ${NAMESPACE} \
    --version 0.31.0 \
    --values /tmp/vcluster-values.yaml \
    --wait \
    --timeout 600s
  log_info "vcluster installed successfully"
  # Wait for all pods to be ready
  log_info "Waiting for vcluster to become ready..."
  kubectl wait --for=condition=ready pod -l app=vcluster -n ${NAMESPACE} --timeout=300s
  kubectl wait --for=condition=ready pod -l app=vcluster-etcd -n ${NAMESPACE} --timeout=300s
}
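
# Optional alternative (not invoked by main): if the vcluster CLI is installed it can
# produce a kubeconfig without the manual port-forward used below. Hedged sketch; the
# runbook itself keeps using kubectl port-forward, and the output path here is arbitrary.
connect_with_vcluster_cli() {
  command -v vcluster >/dev/null 2>&1 || { log_warn "vcluster CLI not installed"; return 0; }
  vcluster connect ${VCLUSTER_NAME} --namespace ${NAMESPACE} --print > /tmp/vcluster-kubeconfig-cli.yaml
}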
# Step 5: Create the test workload
create_test_workload() {
  log_info "Creating test workload..."
  # Fetch the vcluster kubeconfig
  kubectl get secret vc-${VCLUSTER_NAME} -n ${NAMESPACE} -o jsonpath='{.data.config}' | base64 -d > /tmp/vcluster-kubeconfig.yaml
  # Make sure the secret actually existed
  if [ ! -s /tmp/vcluster-kubeconfig.yaml ]; then
    log_error "Could not read the kubeconfig from secret vc-${VCLUSTER_NAME}"
    exit 1
  fi
  log_info "Kubeconfig saved to /tmp/vcluster-kubeconfig.yaml"
  # Point the kubeconfig at the port-forward instead of the external hostname:
  # replace the server with localhost:8443
  sed -i "s|server: https://.*|server: https://localhost:8443|" /tmp/vcluster-kubeconfig.yaml
  # Start the port-forward in the background
  log_info "Starting port-forward to the vcluster..."
  kubectl port-forward -n ${NAMESPACE} service/${VCLUSTER_NAME} 8443:443 &
  PORT_FORWARD_PID=$!
  # Give the port-forward time to come up
  log_info "Waiting for port-forward (10s)..."
  sleep 10
  # Use the patched kubeconfig
  export KUBECONFIG=/tmp/vcluster-kubeconfig.yaml
  # Create the test-app namespace
  log_info "Creating namespace test-app..."
  kubectl create namespace test-app 2>/dev/null || log_info "Namespace test-app already exists"
  # Create the nginx deployment
  log_info "Creating nginx deployment..."
  kubectl apply -f - <<'DEPLOYMENT'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-test
  namespace: test-app
spec:
  replicas: 2
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
        - name: nginx
          image: nginx:alpine
          ports:
            - containerPort: 80
          readinessProbe:
            httpGet:
              path: /
              port: 80
            initialDelaySeconds: 5
            periodSeconds: 5
DEPLOYMENT
  # Create the nginx service
  log_info "Creating nginx service..."
  kubectl apply -f - <<'SERVICE'
apiVersion: v1
kind: Service
metadata:
  name: nginx
  namespace: test-app
spec:
  selector:
    app: nginx
  ports:
    - port: 80
      targetPort: 80
  type: ClusterIP
SERVICE
  log_info "Test workload created"
  # Wait for the pods to become ready
  log_info "Waiting for pods (30s)..."
  sleep 30
  # Check status
  log_info "Checking workload status..."
  kubectl get pods -n test-app
  log_info "Endpoint status:"
  kubectl get endpoints -n test-app
  # Save the port-forward PID for later cleanup
  echo ${PORT_FORWARD_PID} > /tmp/vcluster-port-forward.pid
}
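
# Optional check (not invoked by main): the syncer copies virtual pods into the host
# namespace under translated names, so they can also be inspected from the host side.
# Minimal sketch; it temporarily clears KUBECONFIG so kubectl falls back to the host
# cluster's default kubeconfig instead of the vcluster one exported above.
show_synced_pods_on_host() {
  KUBECONFIG="" kubectl get pods -n ${NAMESPACE} -o wide
}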
# Step 6: Simulate the problem (delete syncer + etcd, then restart the test workload)
simulate_problem() {
  log_warn "SIMULATION: deleting syncer + etcd and restarting the test workload..."
  ORIGINAL_KUBECONFIG=${KUBECONFIG:-}
  unset KUBECONFIG
  SYNCER_POD=$(kubectl get pods -n ${NAMESPACE} -l app=vcluster -o jsonpath='{.items[0].metadata.name}')
  ETCD_POD=$(kubectl get pods -n ${NAMESPACE} -l app=vcluster-etcd -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
  if [ -f /tmp/vcluster-port-forward.pid ]; then
    OLD_PID=$(cat /tmp/vcluster-port-forward.pid)
    kill ${OLD_PID} 2>/dev/null || true
    sleep 2
  fi
  # STEP 1: Delete syncer + etcd
  if [ -n "$ETCD_POD" ]; then
    log_info "DEPLOYED etcd mode detected"
    log_info "Deleting pods: ${SYNCER_POD} and ${ETCD_POD}"
    kubectl delete pod ${SYNCER_POD} -n ${NAMESPACE} --grace-period=0 --force &
    kubectl delete pod ${ETCD_POD} -n ${NAMESPACE} --grace-period=0 --force &
    wait
  else
    log_info "EMBEDDED etcd mode detected"
    log_info "Deleting pod: ${SYNCER_POD}"
    kubectl delete pod ${SYNCER_POD} -n ${NAMESPACE} --grace-period=0 --force
  fi
  # STEP 2: Restart the port-forward BEFORE deleting the nginx pods
  log_info "Restarting port-forward..."
  kubectl port-forward -n ${NAMESPACE} service/${VCLUSTER_NAME} 8443:443 &
  PORT_FORWARD_PID=$!
  echo ${PORT_FORWARD_PID} > /tmp/vcluster-port-forward.pid
  sleep 10
  # STEP 3: IMMEDIATELY restart the test pods
  log_info "Restarting test workload..."
  export KUBECONFIG=/tmp/vcluster-kubeconfig.yaml
  kubectl delete pod -l app=nginx -n test-app --grace-period=0 --force
  unset KUBECONFIG
  # STEP 4: Wait for the vcluster to recover
  log_info "Waiting for syncer + etcd to recover..."
  if [ -n "$ETCD_POD" ]; then
    kubectl wait --for=condition=ready pod -l app=vcluster -n ${NAMESPACE} --timeout=300s
    kubectl wait --for=condition=ready pod -l app=vcluster-etcd -n ${NAMESPACE} --timeout=300s
  else
    kubectl wait --for=condition=ready pod -l app=vcluster -n ${NAMESPACE} --timeout=300s
  fi
  # STEP 5: Let things settle before the first check
  sleep 15
  verify_state
}
# Step 7: Check the state and detect the problem
verify_state() {
  log_info "Checking post-simulation state..."
  # The vcluster kubeconfig must exist
  if [ ! -f /tmp/vcluster-kubeconfig.yaml ]; then
    log_error "vcluster kubeconfig not found"
    return 1
  fi
  # Use the port-forwarded kubeconfig
  export KUBECONFIG=/tmp/vcluster-kubeconfig.yaml
  # Make sure the port-forward is still running
  if [ -f /tmp/vcluster-port-forward.pid ]; then
    PORT_FORWARD_PID=$(cat /tmp/vcluster-port-forward.pid)
    if ! kill -0 ${PORT_FORWARD_PID} 2>/dev/null; then
      log_warn "Port-forward not running, restarting..."
      kubectl port-forward -n ${NAMESPACE} service/${VCLUSTER_NAME} 8443:443 &
      NEW_PID=$!
      echo ${NEW_PID} > /tmp/vcluster-port-forward.pid
      sleep 10
    fi
  fi
  # Connectivity test against the vcluster
  log_info "Testing connectivity to the vcluster..."
  if ! kubectl get nodes &>/dev/null; then
    log_error "Cannot connect to the vcluster"
    return 1
  fi
  # Look for pods whose containers are ready but whose Ready condition is missing
  log_info "Checking pod readiness..."
  kubectl get pods -n test-app -o json 2>/dev/null | jq -r '
    .items[] |
    select(.status.containerStatuses[0].ready == true) |
    select(.status.conditions | map(select(.type == "Ready")) | length == 0) |
    "PROBLEM DETECTED: pod " + .metadata.name + " has a ready container but no Ready condition!"
  ' 2>/dev/null || true
  # Check the endpoints
  log_info "Checking endpoints..."
  kubectl get endpoints -n test-app -o json 2>/dev/null | jq -r '
    .items[] |
    select(.subsets[0].notReadyAddresses) |
    "PROBLEM DETECTED: endpoint " + .metadata.name + " has addresses in notReadyAddresses: " + (.subsets[0].notReadyAddresses | map(.ip) | join(", "))
  ' 2>/dev/null || true
  # Detailed state
  log_info "nginx endpoint state:"
  kubectl get endpoints nginx -n test-app -o yaml 2>/dev/null || log_warn "Could not fetch endpoints"
  log_info "nginx pod state:"
  kubectl get pods -n test-app -o wide 2>/dev/null || log_warn "Could not fetch pods"
  log_info "nginx pod conditions:"
  kubectl get pods -n test-app -o json 2>/dev/null | jq '.items[].status.conditions' || log_warn "Could not fetch conditions"
  # Connectivity test against the service
  log_info "Testing service connectivity..."
  kubectl run test-client --image=curlimages/curl:latest --rm -i --restart=Never -n test-app -- curl -s -m 5 http://nginx 2>&1 || log_error "Connectivity test FAILED!"
}
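
# Optional alternative check (not invoked by main): the same readiness information is also
# visible through EndpointSlices, which is what modern service proxies consume. Minimal jq
# sketch under that assumption.
check_endpointslices() {
  kubectl get endpointslices -n test-app -o json | jq -r '
    .items[] | .metadata.name as $name |
    .endpoints[]? |
    select(.conditions.ready != true) |
    "EndpointSlice " + $name + ": address " + (.addresses | join(",")) + " is not ready"
  '
}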
# Step 8: Cleanup
cleanup() {
  log_warn "Cleaning up the environment..."
  # Stop the port-forward if it is still running
  if [ -f /tmp/vcluster-port-forward.pid ]; then
    PORT_FORWARD_PID=$(cat /tmp/vcluster-port-forward.pid)
    if kill -0 ${PORT_FORWARD_PID} 2>/dev/null; then
      log_info "Stopping port-forward..."
      kill ${PORT_FORWARD_PID} 2>/dev/null || true
    fi
    rm -f /tmp/vcluster-port-forward.pid
  fi
  kind delete cluster --name ${CLUSTER_NAME}
  rm -f /tmp/vcluster-values.yaml /tmp/vcluster-kubeconfig.yaml
  log_info "Cleanup complete"
}
# Main menu
main() {
  echo "================================"
  echo "vcluster Endpoint Sync Test Tool"
  echo "================================"
  echo ""
  echo "This script reproduces the endpoint sync problem locally"
  echo ""
  case "${1:-}" in
    setup)
      create_kind_cluster
      install_cilium
      prepare_environment
      install_vcluster
      create_test_workload
      log_info "Setup complete! Run './runbook.sh test' to simulate the problem"
      ;;
    test)
      simulate_problem
      verify_state
      ;;
    cleanup)
      cleanup
      ;;
    full)
      create_kind_cluster
      install_cilium
      prepare_environment
      install_vcluster
      create_test_workload
      simulate_problem
      verify_state
      log_warn "Test complete. Run './runbook.sh cleanup' to delete the cluster"
      ;;
    *)
      echo "Usage: $0 {setup|test|cleanup|full}"
      echo ""
      echo "Commands:"
      echo "  setup   - Create the Kind cluster and install vcluster"
      echo "  test    - Simulate the problem (requires setup)"
      echo "  cleanup - Delete the Kind cluster"
      echo "  full    - Run setup + test"
      echo ""
      echo "Example:"
      echo "  $0 full"
      exit 1
      ;;
  esac
}
main "$@"
#!/usr/bin/env bash
# Runbook: reproduce the vcluster endpoint sync problem locally with Kind
# This script replicates the production environment in order to test the bug
set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration
CLUSTER_NAME="vcluster-test"
NAMESPACE="test-env-development"
VCLUSTER_NAME="development-vcluster"

log_info() {
  echo -e "${GREEN}[INFO]${NC} $1"
}

log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
  echo -e "${RED}[ERROR]${NC} $1"
}
# Step 1: Create a Kind cluster with 3 worker nodes
create_kind_cluster() {
  log_info "Creating Kind cluster with 3 worker nodes..."
  cat <<EOF | kind create cluster --name ${CLUSTER_NAME} --config=-
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
networking:
  disableDefaultCNI: true # disable the default CNI so we can install Cilium
  podSubnet: "10.244.0.0/16"
  serviceSubnet: "10.96.0.0/12"
nodes:
  - role: control-plane
    kubeadmConfigPatches:
      - |
        kind: InitConfiguration
        nodeRegistration:
          kubeletExtraArgs:
            node-labels: "ingress-ready=true"
  - role: worker
    labels:
      team: "test-project"
      eng.it/project: "test-project"
      eng.it/karpenter-role: "project-test-project-env"
  - role: worker
    labels:
      team: "test-project"
      eng.it/project: "test-project"
      eng.it/karpenter-role: "project-test-project-env"
  - role: worker
    labels:
      team: "test-project"
      eng.it/project: "test-project"
      eng.it/karpenter-role: "project-test-project-env"
EOF
  log_info "Kind cluster created successfully"
}
# Step 2: Install Cilium (configuration adapted for Kind)
install_cilium() {
  log_info "Installing Cilium..."
  # Add the Helm repo (ignore if it already exists)
  helm repo add cilium https://helm.cilium.io/ 2>/dev/null || true
  helm repo update
  # Install Cilium with a configuration similar to EKS but adapted for Kind
  # Note: we do not use ENI mode on Kind, we use tunnel mode instead
  helm install cilium cilium/cilium \
    --namespace kube-system \
    --version 1.18.5 \
    --set ipam.mode=kubernetes \
    --set hubble.relay.enabled=true \
    --set hubble.ui.enabled=true \
    --set kubeProxyReplacement=true \
    --set k8sServiceHost=${CLUSTER_NAME}-control-plane \
    --set k8sServicePort=6443 \
    --set socketLB.hostNamespaceOnly=true \
    --set localRedirectPolicy=true \
    --set loadBalancer.serviceTopology=true \
    --wait
  log_info "Cilium installed successfully"
  # Wait for Cilium to be ready
  log_info "Waiting for Cilium to become ready..."
  kubectl wait --for=condition=ready pod -l k8s-app=cilium -n kube-system --timeout=300s
}
# Step 3: Create the namespace and taint the workers (simulates the Karpenter NodePool)
prepare_environment() {
  log_info "Preparing environment..."
  # Create the namespace with the label used by the nodeSelector
  kubectl create namespace ${NAMESPACE}
  kubectl label namespace ${NAMESPACE} team=test-project
  # Taint the worker nodes (simulates Karpenter behaviour).
  # Select them via the team label set in the Kind config: Kind does not add a role=worker label.
  log_info "Applying taints to the worker nodes..."
  for node in $(kubectl get nodes -l team=test-project -o jsonpath='{.items[*].metadata.name}'); do
    kubectl taint nodes ${node} eng.it/karpenter-role=project-test-project-env:NoSchedule --overwrite
  done
  log_info "Environment ready"
}
# Step 4: Install vcluster with a production-like configuration
install_vcluster() {
  log_info "Installing vcluster with the production configuration..."
  # Write the values file
  cat > /tmp/vcluster-values.yaml <<EOF
# Simplified configuration for Kind - no external hostname
exportKubeConfig:
  server: "https://localhost:8443"
controlPlane:
  distro:
    k8s:
      enabled: true
      version: "v1.32.2"
  coredns:
    deployment:
      replicas: 2
  backingStore:
    etcd:
      deploy:
        enabled: true
        statefulSet:
          highAvailability:
            replicas: 3
          extraArgs:
            - "--auto-compaction-retention=30m"
            - "--auto-compaction-mode=periodic"
            - "--quota-backend-bytes=8589934592"
          scheduling:
            tolerations:
              - effect: "NoSchedule"
                key: "eng.it/karpenter-role"
                operator: "Equal"
                value: "project-test-project-env"
            nodeSelector:
              team: "test-project"
              "eng.it/project": "test-project"
              "eng.it/karpenter-role": "project-test-project-env"
            affinity:
              podAffinity:
                preferredDuringSchedulingIgnoredDuringExecution:
                  - weight: 100
                    podAffinityTerm:
                      labelSelector:
                        matchExpressions:
                          - key: "app"
                            operator: "In"
                            values: ["vcluster-etcd"]
                          - key: "release"
                            operator: "In"
                            values: ["${VCLUSTER_NAME}"]
                      topologyKey: "topology.kubernetes.io/zone"
              podAntiAffinity:
                requiredDuringSchedulingIgnoredDuringExecution:
                  - labelSelector:
                      matchExpressions:
                        - key: "app"
                          operator: "In"
                          values: ["vcluster-etcd"]
                        - key: "release"
                          operator: "In"
                          values: ["${VCLUSTER_NAME}"]
                    topologyKey: "kubernetes.io/hostname"
  statefulSet:
    highAvailability:
      replicas: 3
    scheduling:
      tolerations:
        - effect: "NoSchedule"
          key: "eng.it/karpenter-role"
          operator: "Equal"
          value: "project-test-project-env"
      nodeSelector:
        team: "test-project"
        "eng.it/project": "test-project"
        "eng.it/karpenter-role": "project-test-project-env"
      affinity:
        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: "app"
                      operator: "In"
                      values: ["vcluster"]
                    - key: "release"
                      operator: "In"
                      values: ["${VCLUSTER_NAME}"]
                topologyKey: "topology.kubernetes.io/zone"
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: "app"
                    operator: "In"
                    values: ["vcluster"]
                  - key: "release"
                    operator: "In"
                    values: ["${VCLUSTER_NAME}"]
              topologyKey: "kubernetes.io/hostname"
sync:
  toHost:
    podDisruptionBudgets:
      enabled: true
    pods:
      enabled: true
      enforceTolerations:
        - "eng.it/karpenter-role=project-test-project-env:NoSchedule"
    serviceAccounts:
      enabled: true
    ingresses:
      enabled: true
  fromHost:
    nodes:
      enabled: true
      selector:
        labels:
          team: "test-project"
          "eng.it/project": "test-project"
          "eng.it/karpenter-role": "project-test-project-env"
integrations:
  metricsServer:
    enabled: true
    nodes: true
    pods: true
EOF
  # Add the vcluster Helm repo (ignore if it already exists)
  helm repo add loft-sh https://charts.loft.sh 2>/dev/null || true
  helm repo update
  # Install vcluster
  helm install ${VCLUSTER_NAME} loft-sh/vcluster \
    --namespace ${NAMESPACE} \
    --version 0.31.0 \
    --values /tmp/vcluster-values.yaml \
    --wait \
    --timeout 600s
  log_info "vcluster installed successfully"
  # Wait for all pods to be ready
  log_info "Waiting for vcluster to become ready..."
  kubectl wait --for=condition=ready pod -l app=vcluster -n ${NAMESPACE} --timeout=300s
  kubectl wait --for=condition=ready pod -l app=vcluster-etcd -n ${NAMESPACE} --timeout=300s
}
# Step 5: Create the test workload
create_test_workload() {
  log_info "Creating test workload..."
  # Fetch the vcluster kubeconfig
  kubectl get secret vc-${VCLUSTER_NAME} -n ${NAMESPACE} -o jsonpath='{.data.config}' | base64 -d > /tmp/vcluster-kubeconfig.yaml
  # Make sure the secret actually existed
  if [ ! -s /tmp/vcluster-kubeconfig.yaml ]; then
    log_error "Could not read the kubeconfig from secret vc-${VCLUSTER_NAME}"
    exit 1
  fi
  log_info "Kubeconfig saved to /tmp/vcluster-kubeconfig.yaml"
  # Point the kubeconfig at the port-forward instead of the external hostname:
  # replace the server with localhost:8443
  sed -i "s|server: https://.*|server: https://localhost:8443|" /tmp/vcluster-kubeconfig.yaml
  # Start the port-forward in the background
  log_info "Starting port-forward to the vcluster..."
  kubectl port-forward -n ${NAMESPACE} service/${VCLUSTER_NAME} 8443:443 &
  PORT_FORWARD_PID=$!
  # Give the port-forward time to come up
  log_info "Waiting for port-forward (10s)..."
  sleep 10
  # Use the patched kubeconfig
  export KUBECONFIG=/tmp/vcluster-kubeconfig.yaml
  # Create the test-app namespace
  log_info "Creating namespace test-app..."
  kubectl create namespace test-app 2>/dev/null || log_info "Namespace test-app already exists"
  # Create the nginx deployment
  log_info "Creating nginx deployment..."
  kubectl apply -f - <<'DEPLOYMENT'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-test
  namespace: test-app
spec:
  replicas: 2
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
        - name: nginx
          image: nginx:alpine
          ports:
            - containerPort: 80
          readinessProbe:
            httpGet:
              path: /
              port: 80
            initialDelaySeconds: 5
            periodSeconds: 5
DEPLOYMENT
  # Create the nginx service
  log_info "Creating nginx service..."
  kubectl apply -f - <<'SERVICE'
apiVersion: v1
kind: Service
metadata:
  name: nginx
  namespace: test-app
spec:
  selector:
    app: nginx
  ports:
    - port: 80
      targetPort: 80
  type: ClusterIP
SERVICE
  log_info "Test workload created"
  # Wait for the pods to become ready
  log_info "Waiting for pods (30s)..."
  sleep 30
  # Check status
  log_info "Checking workload status..."
  kubectl get pods -n test-app
  log_info "Endpoint status:"
  kubectl get endpoints -n test-app
  # Save the port-forward PID for later cleanup
  echo ${PORT_FORWARD_PID} > /tmp/vcluster-port-forward.pid
}
# Step 6: Simulate the problem (delete 1 syncer pod + 1 etcd pod at the same time)
simulate_problem() {
  log_warn "PROBLEM SIMULATION: deleting vcluster pods..."
  # Save the original kubeconfig
  ORIGINAL_KUBECONFIG=${KUBECONFIG:-}
  unset KUBECONFIG
  # Pick a syncer pod to delete (uses the host cluster's default kubeconfig)
  SYNCER_POD=$(kubectl get pods -n ${NAMESPACE} -l app=vcluster -o jsonpath='{.items[0].metadata.name}')
  # Check whether etcd runs as separate pods (deployed mode) or embedded
  ETCD_POD=$(kubectl get pods -n ${NAMESPACE} -l app=vcluster-etcd -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
  # Stop the port-forward before deleting the pods
  log_info "Stopping existing port-forward..."
  if [ -f /tmp/vcluster-port-forward.pid ]; then
    OLD_PID=$(cat /tmp/vcluster-port-forward.pid)
    kill ${OLD_PID} 2>/dev/null || true
    sleep 2
  fi
  if [ -n "$ETCD_POD" ]; then
    log_info "DEPLOYED etcd mode detected"
    log_info "Deleting pods: ${SYNCER_POD} and ${ETCD_POD}"
    # Delete both at the same time (simulates a Karpenter eviction)
    kubectl delete pod ${SYNCER_POD} -n ${NAMESPACE} --grace-period=0 --force &
    kubectl delete pod ${ETCD_POD} -n ${NAMESPACE} --grace-period=0 --force &
    wait
    log_warn "Pods deleted. Waiting for recreation..."
    sleep 30
    # Wait for both to come back
    log_info "Waiting for pod recreation..."
    kubectl wait --for=condition=ready pod -l app=vcluster -n ${NAMESPACE} --timeout=300s || true
    kubectl wait --for=condition=ready pod -l app=vcluster-etcd -n ${NAMESPACE} --timeout=300s || true
  else
    log_info "EMBEDDED etcd mode detected"
    log_info "Deleting pod: ${SYNCER_POD} (also contains etcd)"
    # Delete only the vcluster pod (which also runs the embedded etcd)
    kubectl delete pod ${SYNCER_POD} -n ${NAMESPACE} --grace-period=0 --force
    log_warn "Pod deleted. Waiting for recreation..."
    sleep 30
    # Wait for recreation
    log_info "Waiting for pod recreation..."
    kubectl wait --for=condition=ready pod -l app=vcluster -n ${NAMESPACE} --timeout=300s || true
  fi
  # Restart the port-forward once the pods are back
  log_info "Restarting port-forward..."
  kubectl port-forward -n ${NAMESPACE} service/${VCLUSTER_NAME} 8443:443 &
  PORT_FORWARD_PID=$!
  echo ${PORT_FORWARD_PID} > /tmp/vcluster-port-forward.pid
  # Restore the kubeconfig
  if [ -n "$ORIGINAL_KUBECONFIG" ]; then
    export KUBECONFIG=$ORIGINAL_KUBECONFIG
  fi
  # Give the port-forward time to come up
  log_info "Waiting for port-forward (15s)..."
  sleep 15
  # Allow extra time for things to settle
  log_info "Waiting for stabilization (60s)..."
  sleep 60
}
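
# Optional observer (not invoked by main): run this from a second terminal against the
# vcluster kubeconfig to watch how the nginx endpoint addresses move between ready and
# notReady while the syncer restarts. Minimal polling sketch.
watch_nginx_endpoints() {
  export KUBECONFIG=/tmp/vcluster-kubeconfig.yaml
  while true; do
    kubectl get endpoints nginx -n test-app \
      -o jsonpath='ready: {.subsets[*].addresses[*].ip} / notReady: {.subsets[*].notReadyAddresses[*].ip}{"\n"}' || true
    sleep 5
  done
}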
# Step 7: Check the state and detect the problem
verify_state() {
  log_info "Checking post-simulation state..."
  # The vcluster kubeconfig must exist
  if [ ! -f /tmp/vcluster-kubeconfig.yaml ]; then
    log_error "vcluster kubeconfig not found"
    return 1
  fi
  # Use the port-forwarded kubeconfig
  export KUBECONFIG=/tmp/vcluster-kubeconfig.yaml
  # Make sure the port-forward is still running
  if [ -f /tmp/vcluster-port-forward.pid ]; then
    PORT_FORWARD_PID=$(cat /tmp/vcluster-port-forward.pid)
    if ! kill -0 ${PORT_FORWARD_PID} 2>/dev/null; then
      log_warn "Port-forward not running, restarting..."
      kubectl port-forward -n ${NAMESPACE} service/${VCLUSTER_NAME} 8443:443 &
      NEW_PID=$!
      echo ${NEW_PID} > /tmp/vcluster-port-forward.pid
      sleep 10
    fi
  fi
  # Connectivity test against the vcluster
  log_info "Testing connectivity to the vcluster..."
  if ! kubectl get nodes &>/dev/null; then
    log_error "Cannot connect to the vcluster"
    return 1
  fi
  # Look for pods whose containers are ready but whose Ready condition is missing
  log_info "Checking pod readiness..."
  kubectl get pods -n test-app -o json 2>/dev/null | jq -r '
    .items[] |
    select(.status.containerStatuses[0].ready == true) |
    select(.status.conditions | map(select(.type == "Ready")) | length == 0) |
    "PROBLEM DETECTED: pod " + .metadata.name + " has a ready container but no Ready condition!"
  ' 2>/dev/null || true
  # Check the endpoints
  log_info "Checking endpoints..."
  kubectl get endpoints -n test-app -o json 2>/dev/null | jq -r '
    .items[] |
    select(.subsets[0].notReadyAddresses) |
    "PROBLEM DETECTED: endpoint " + .metadata.name + " has addresses in notReadyAddresses: " + (.subsets[0].notReadyAddresses | map(.ip) | join(", "))
  ' 2>/dev/null || true
  # Detailed state
  log_info "nginx endpoint state:"
  kubectl get endpoints nginx -n test-app -o yaml 2>/dev/null || log_warn "Could not fetch endpoints"
  log_info "nginx pod state:"
  kubectl get pods -n test-app -o wide 2>/dev/null || log_warn "Could not fetch pods"
  log_info "nginx pod conditions:"
  kubectl get pods -n test-app -o json 2>/dev/null | jq '.items[].status.conditions' || log_warn "Could not fetch conditions"
  # Connectivity test against the service
  log_info "Testing service connectivity..."
  kubectl run test-client --image=curlimages/curl:latest --rm -i --restart=Never -n test-app -- curl -s -m 5 http://nginx 2>&1 || log_error "Connectivity test FAILED!"
}
# Step 8: Cleanup
cleanup() {
  log_warn "Cleaning up the environment..."
  # Stop the port-forward if it is still running
  if [ -f /tmp/vcluster-port-forward.pid ]; then
    PORT_FORWARD_PID=$(cat /tmp/vcluster-port-forward.pid)
    if kill -0 ${PORT_FORWARD_PID} 2>/dev/null; then
      log_info "Stopping port-forward..."
      kill ${PORT_FORWARD_PID} 2>/dev/null || true
    fi
    rm -f /tmp/vcluster-port-forward.pid
  fi
  kind delete cluster --name ${CLUSTER_NAME}
  rm -f /tmp/vcluster-values.yaml /tmp/vcluster-kubeconfig.yaml
  log_info "Cleanup complete"
}
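
# Optional hardening (not wired into main): installing a trap ensures the background
# port-forward is killed even if the script exits early. Sketch only, under the assumption
# that the PID file written above is the single source of truth.
kill_port_forward_on_exit() {
  trap 'kill $(cat /tmp/vcluster-port-forward.pid 2>/dev/null) 2>/dev/null || true' EXIT
}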
# Main menu
main() {
  echo "================================"
  echo "vcluster Endpoint Sync Test Tool"
  echo "================================"
  echo ""
  echo "This script reproduces the endpoint sync problem locally"
  echo ""
  case "${1:-}" in
    setup)
      create_kind_cluster
      install_cilium
      prepare_environment
      install_vcluster
      create_test_workload
      log_info "Setup complete! Run './runbook.sh test' to simulate the problem"
      ;;
    test)
      simulate_problem
      verify_state
      ;;
    cleanup)
      cleanup
      ;;
    full)
      create_kind_cluster
      install_cilium
      prepare_environment
      install_vcluster
      create_test_workload
      simulate_problem
      verify_state
      log_warn "Test complete. Run './runbook.sh cleanup' to delete the cluster"
      ;;
    *)
      echo "Usage: $0 {setup|test|cleanup|full}"
      echo ""
      echo "Commands:"
      echo "  setup   - Create the Kind cluster and install vcluster"
      echo "  test    - Simulate the problem (requires setup)"
      echo "  cleanup - Delete the Kind cluster"
      echo "  full    - Run setup + test"
      echo ""
      echo "Example:"
      echo "  $0 full"
      exit 1
      ;;
  esac
}
main "$@"
exportKubeConfig:
  server: "https://localhost:8443"
controlPlane:
  distro:
    k8s:
      enabled: true
      version: "v1.32.2"
  coredns:
    deployment:
      replicas: 2
  backingStore:
    etcd:
      deploy:
        enabled: true
        statefulSet:
          highAvailability:
            replicas: 3
          extraArgs:
            - "--auto-compaction-retention=30m"
            - "--auto-compaction-mode=periodic"
            - "--quota-backend-bytes=8589934592"
          scheduling:
            tolerations:
              - effect: "NoSchedule"
                key: "eng.it/karpenter-role"
                operator: "Equal"
                value: "project-test-project-env"
            nodeSelector:
              team: "test-project"
              "eng.it/project": "test-project"
              "eng.it/karpenter-role": "project-test-project-env"
            affinity:
              podAffinity:
                preferredDuringSchedulingIgnoredDuringExecution:
                  - weight: 100
                    podAffinityTerm:
                      labelSelector:
                        matchExpressions:
                          - key: "app"
                            operator: "In"
                            values: ["vcluster-etcd"]
                          - key: "release"
                            operator: "In"
                            values: ["development-vcluster"]
                      topologyKey: "topology.kubernetes.io/zone"
              podAntiAffinity:
                requiredDuringSchedulingIgnoredDuringExecution:
                  - labelSelector:
                      matchExpressions:
                        - key: "app"
                          operator: "In"
                          values: ["vcluster-etcd"]
                        - key: "release"
                          operator: "In"
                          values: ["development-vcluster"]
                    topologyKey: "kubernetes.io/hostname"
  statefulSet:
    probes:
      readinessProbe:
        enabled: true
        failureThreshold: 60
        periodSeconds: 2
        timeoutSeconds: 3
      # Use startupProbe to give the syncer time to warm up its cache
      # 30 attempts * 2 seconds = 60 seconds total warmup time
      startupProbe:
        enabled: true
        failureThreshold: 30
        periodSeconds: 2
        timeoutSeconds: 3
    highAvailability:
      replicas: 3
    scheduling:
      tolerations:
        - effect: "NoSchedule"
          key: "eng.it/karpenter-role"
          operator: "Equal"
          value: "project-test-project-env"
      nodeSelector:
        team: "test-project"
        "eng.it/project": "test-project"
        "eng.it/karpenter-role": "project-test-project-env"
      affinity:
        podAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: "app"
                      operator: "In"
                      values: ["vcluster"]
                    - key: "release"
                      operator: "In"
                      values: ["development-vcluster"]
                topologyKey: "topology.kubernetes.io/zone"
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: "app"
                    operator: "In"
                    values: ["vcluster"]
                  - key: "release"
                    operator: "In"
                    values: ["development-vcluster"]
              topologyKey: "kubernetes.io/hostname"
sync:
  toHost:
    podDisruptionBudgets:
      enabled: true
    pods:
      enabled: true
      enforceTolerations:
        - "eng.it/karpenter-role=project-test-project-env:NoSchedule"
    serviceAccounts:
      enabled: true
    ingresses:
      enabled: true
  fromHost:
    nodes:
      enabled: true
      selector:
        labels:
          team: "test-project"
          "eng.it/project": "test-project"
          "eng.it/karpenter-role": "project-test-project-env"
integrations:
  metricsServer:
    enabled: true
    nodes: true
    pods: true
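
# Usage sketch (assumption: these values are saved as vcluster-values.yaml and the release
# and namespace names match the runbook above; adjust if yours differ):
#
#   helm upgrade --install development-vcluster loft-sh/vcluster \
#     --namespace test-env-development \
#     --version 0.31.0 \
#     --values vcluster-values.yaml
#
# Compared with the values generated by the runbook, the intended difference is the
# controlPlane.statefulSet.probes block above, which gives the syncer a startup grace period.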