@arubis
Last active December 18, 2025 18:33
apex-arena: Fix race condition in test-solution by waiting for entrypoint completion
From: Claude Code
Subject: [PATCH] Fix race condition in test-solution by waiting for entrypoint completion
The test-solution command starts containers with `docker run -d ... sleep infinity`,
which returns immediately, then runs `docker exec setup.sh` before the container's
entrypoint has finished critical initialization (node cleanup, PV recreation, etc.).
This causes Nebula-based tasks to fail because:
1. Snapshot contains stale node ID from when it was created
2. Entrypoint cleans up old nodes at line ~585
3. But setup.sh runs before entrypoint reaches cleanup
4. Pods get scheduled on non-existent old node
5. Deployments show 0/0 replicas
The fix adds a wait loop that monitors container logs for "Fast-boot complete!"
before proceeding with setup.sh execution.
Tested with scale-deployment task:
- Before: 0/0 replicas, grader fails with "Timeline-service has 0 replicas"
- After: 3/3 replicas, scaling works correctly
---
apex_arena/cli.py | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/apex_arena/cli.py b/apex_arena/cli.py
--- a/apex_arena/cli.py
+++ b/apex_arena/cli.py
@@ -3233,6 +3233,46 @@ def test_solution(task_id: str, force: bool = False):
         )
         sys.exit(1)
+    # Wait for container entrypoint to complete (node cleanup, PV recreation, etc.)
+    # The entrypoint prints "Fast-boot complete!" when initialization is done
+    console.print("⏳ Waiting for container entrypoint to complete...")
+    boot_timeout = 300  # 5 minutes for boot
+    boot_start = time.time()
+    entrypoint_complete = False
+
+    while time.time() - boot_start < boot_timeout:
+        # Check container logs for completion marker
+        logs_result = subprocess.run(
+            ["docker", "logs", test_container_name],
+            capture_output=True,
+            text=True,
+        )
+        if "Fast-boot complete!" in logs_result.stdout or "Fast-boot complete!" in logs_result.stderr:
+            entrypoint_complete = True
+            console.print("✅ Container entrypoint completed")
+            break
+
+        # Also check if container exited unexpectedly
+        inspect_result = subprocess.run(
+            ["docker", "inspect", test_container_name, "--format", "{{.State.Running}}"],
+            capture_output=True,
+            text=True,
+        )
+        if inspect_result.stdout.strip() != "true":
+            console.print(f"[red]❌ Container exited unexpectedly[/red]")
+            console.print(f"Logs: {logs_result.stdout}")
+            sys.exit(1)
+
+        # Brief status update every 30 seconds
+        elapsed = int(time.time() - boot_start)
+        if elapsed > 0 and elapsed % 30 == 0:
+            console.print(f"    Still waiting... ({elapsed}s elapsed)")
+
+        time.sleep(5)
+
+    if not entrypoint_complete:
+        console.print(f"[yellow]⚠️  Entrypoint did not complete within {boot_timeout}s, proceeding anyway[/yellow]")
+
     # Apply iptables rules to block internet
     try:
         # Get container IP and gateway

apex-arena Race Condition Fix: Entrypoint Wait

Problem Summary

The apex-arena test-solution command has a race condition that causes Nebula-based tasks to fail intermittently. The container's entrypoint hasn't finished critical initialization (node cleanup, PV recreation) before setup.sh is executed.

Root Cause

When apex-arena runs test-solution, it:

  1. Starts a container with docker run -d ... sleep infinity (returns immediately)
  2. Immediately runs docker exec setup.sh

But the container's entrypoint (docker-entrypoint-fast.sh) needs time to:

  • Clean up stale nodes from the snapshot
  • Recreate PersistentVolumes
  • Wait for k3s to be ready
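
A minimal Python sketch of this pre-fix sequence (illustrative only: the container name, image tag, and setup.sh path below are hypothetical, not the actual cli.py values):

import subprocess

# Pre-fix sequence: "docker run -d" returns as soon as the container is
# created, not when its entrypoint finishes initializing.
container_name = "apex-test-container"            # hypothetical name
image = "apex-arena/task-snapshot:latest"         # hypothetical image

subprocess.run(
    ["docker", "run", "-d", "--name", container_name, image, "sleep", "infinity"],
    check=True,
)

# At this point the entrypoint is still cleaning up stale nodes and
# recreating PVs inside the container, yet setup.sh runs anyway.
subprocess.run(["docker", "exec", container_name, "/setup.sh"], check=True)

Because `docker run -d` detaches as soon as the container exists, nothing in this sequence waits for the entrypoint's node cleanup or PV recreation to finish.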

What Was Failing

  • Task: scale-deployment
  • Symptom: kubectl get deployment timeline-service -n bleater showed 0/0 READY
  • Grader feedback: "Timeline-service has 0 replicas (expected: 3)"

Why It Failed

The Nebula snapshot contains a node ID from when the snapshot was created. When a new container starts:

  1. A new node joins the cluster with the container's hostname
  2. The old node from the snapshot still exists in etcd
  3. Pods get scheduled on the old (non-existent) node
  4. Deployments show 0/0 replicas because pods can't run

The entrypoint has cleanup code to delete old nodes, but apex-arena runs setup.sh before this cleanup completes.
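
For context, here is a rough Python rendering of what that cleanup does (the real logic is shell in docker-entrypoint-fast.sh; this sketch only approximates its intent and assumes kubectl is available inside the container):

import socket
import subprocess

# Delete any node whose name differs from this container's hostname, so pods
# are not scheduled onto the stale node carried over from the snapshot.
current_node = socket.gethostname()

nodes = subprocess.run(
    ["kubectl", "get", "nodes", "-o", "name"],
    capture_output=True, text=True, check=True,
).stdout.split()

for node in nodes:                     # entries look like "node/<hostname>"
    if node.split("/", 1)[-1] != current_node:
        subprocess.run(["kubectl", "delete", node], check=True)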

Before/After State Diagrams

Before Fix: Race Condition

sequenceDiagram
    participant AA as apex-arena
    participant D as Docker
    participant EP as Entrypoint
    participant K as k3s
    participant SS as setup.sh

    AA->>D: docker run -d ... sleep infinity
    D-->>AA: Container ID (immediate return)

    Note over EP,K: Entrypoint starts in background
    EP->>K: Starting k3s...

    AA->>D: docker exec setup.sh
    D->>SS: Execute setup.sh

    Note over SS,K: setup.sh runs BEFORE<br/>entrypoint completes!

    SS->>K: kubectl get deployments
    K-->>SS: 0/0 replicas (pods on stale node)

    EP->>K: Cleaning up old nodes...
    Note over EP: Too late! setup.sh already ran

After Fix: Proper Sequencing

sequenceDiagram
    participant AA as apex-arena
    participant D as Docker
    participant EP as Entrypoint
    participant K as k3s
    participant SS as setup.sh

    AA->>D: docker run -d ... sleep infinity
    D-->>AA: Container ID (immediate return)

    EP->>K: Starting k3s...
    EP->>K: Cleaning up old nodes...
    EP->>K: Recreating PVs...

    loop Wait for "Fast-boot complete!"
        AA->>D: docker logs container
        D-->>AA: Log output
        Note over AA: Check for completion marker
    end

    EP->>K: Fast-boot complete!
    AA->>D: docker exec setup.sh
    D->>SS: Execute setup.sh

    SS->>K: kubectl get deployments
    K-->>SS: 3/3 replicas (correct node)

The Fix

Add a wait loop in cli.py that monitors container logs for the "Fast-boot complete!" marker before proceeding with setup.sh:

# Wait for container entrypoint to complete (node cleanup, PV recreation, etc.)
console.print("⏳ Waiting for container entrypoint to complete...")
boot_timeout = 300  # 5 minutes for boot
boot_start = time.time()
entrypoint_complete = False

while time.time() - boot_start < boot_timeout:
    logs_result = subprocess.run(
        ["docker", "logs", test_container_name],
        capture_output=True,
        text=True,
    )
    if "Fast-boot complete!" in logs_result.stdout or "Fast-boot complete!" in logs_result.stderr:
        entrypoint_complete = True
        console.print("✅ Container entrypoint completed")
        break

    # Check if container exited unexpectedly
    inspect_result = subprocess.run(
        ["docker", "inspect", test_container_name, "--format", "{{.State.Running}}"],
        capture_output=True,
        text=True,
    )
    if inspect_result.stdout.strip() != "true":
        console.print("[red]❌ Container exited unexpectedly[/red]")
        sys.exit(1)

    elapsed = int(time.time() - boot_start)
    if elapsed > 0 and elapsed % 30 == 0:
        console.print(f"    Still waiting... ({elapsed}s elapsed)")

    time.sleep(5)

if not entrypoint_complete:
    console.print("[yellow]⚠️  Entrypoint did not complete within {boot_timeout}s, proceeding anyway[/yellow]")

Test Results

| Metric                     | Before Fix          | After Fix    |
| -------------------------- | ------------------- | ------------ |
| timeline-service replicas  | 0/0                 | 3/3          |
| Scaling test               | FAIL                | PASS         |
| Test suite                 | N/A (couldn't run)  | 94/96 passed |

Patch Location

Insert the wait loop at line 3236 in apex_arena/cli.py, immediately after the container health check and before the iptables rules are applied.

Notes

  • The 5-minute timeout is generous; typical boot time is 60-90 seconds
  • The fix is backward-compatible with non-Nebula tasks (if no marker is ever printed, they wait out the timeout and then see the "proceeding anyway" message)
  • The completion marker "Fast-boot complete!" is already emitted by docker-entrypoint-fast.sh