Run a multi-agent Solo Software Shop
#!/bin/bash
###############################################################################
# agent.sh — Multi-Agent Development Pipeline
#
# Usage:
#   ./agent.sh              # Run full daily cycle
#   ./agent.sh plan         # Run only the planning agent
#   ./agent.sh dev          # Run only the dev agent
#   ./agent.sh review       # Run only the code review agent
#   ./agent.sh fix          # Run only the fix agent
#   ./agent.sh qa           # Deep coverage mode: hunt untested code across codebase
#   ./agent.sh ux           # Run only the UX audit agent
#   ./agent.sh security     # Run only the security audit agent
#   ./agent.sh arch         # Run only the architecture review agent
#   ./agent.sh hacker       # Run only the hacker/bug-hunter agent (find bugs, dead UI, suggest improvements)
#   ./agent.sh verify       # Run only the final verification gate
#   ./agent.sh quick        # Run the minimal 3-agent loop (build → review → fix)
#   ./agent.sh from <stage> # Run pipeline from a stage (e.g. from dev)
#   ./agent.sh status       # Show pipeline progress (no AI, instant)
#   ./agent.sh manager      # AI progress report with completion percentage
#   ./agent.sh abort        # Gracefully stop the running pipeline
#
# Prerequisites:
# - claude CLI installed and authenticated
# - git repo initialized
# - PRODUCT_SPEC.md in repo root (the source of truth for your product)
#
# Optional:
# - CLAUDE.md in repo root (persistent instructions for all agents)
# - /tasks/templates/ with structured templates (created on first run)
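#
# Example (illustrative; adjust the path to your repo): run the full daily
# cycle nightly via cron:
#   0 2 * * * cd /path/to/your/repo && ./agent.sh >> cron.log 2>&1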
###############################################################################
set -euo pipefail
# ─── Self-copy guard ────────────────────────────────────────────────────────
# Bash reads scripts incrementally — if agent.sh is edited while it is
# running, bash can execute garbage from the modified file. Copy to a temp
# file and re-exec from there so the original can be safely edited during a
# pipeline run.
if [ -z "${AGENT_RUNNING_FROM_COPY:-}" ]; then
tmp_copy="$(mktemp /tmp/agent.sh.XXXXXX)"
cp "$0" "$tmp_copy"
chmod +x "$tmp_copy"
AGENT_RUNNING_FROM_COPY=1 exec "$tmp_copy" "$@"
fi
# ─── Configuration ───────────────────────────────────────────────────────────
PROJECT_ROOT="$(git rev-parse --show-toplevel 2>/dev/null || pwd)"
TASKS_DIR="$PROJECT_ROOT/tasks"
TEMPLATES_DIR="$TASKS_DIR/templates"
DATE_STAMP="$(date +%Y-%m-%d)"
TIME_STAMP="$(date +%H%M%S)"
LOG_DIR="$PROJECT_ROOT/logs/$DATE_STAMP/$TIME_STAMP"
RUN_FILE="$PROJECT_ROOT/.agent.run"
BRANCH_NAME="feature/$DATE_STAMP-$TIME_STAMP"
# Abort / kill switch state
PID_FILE="$PROJECT_ROOT/.agent.pid"
ABORT_FILE="$PROJECT_ROOT/.agent.abort"
CURRENT_PHASE=""
# Colors for terminal output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# ─── Setup ───────────────────────────────────────────────────────────────────
setup() {
mkdir -p "$TASKS_DIR" "$TEMPLATES_DIR" "$LOG_DIR"
# Create PRODUCT_SPEC.md if it doesn't exist
if [ ! -f "$PROJECT_ROOT/PRODUCT_SPEC.md" ]; then
echo -e "${YELLOW}⚠ No PRODUCT_SPEC.md found. Creating a starter template...${NC}"
cat > "$PROJECT_ROOT/PRODUCT_SPEC.md" << 'SPEC'
# Product Specification
## Vision
<!-- What is this product and why does it exist? -->
## Target User
<!-- Who is this for? What's their key problem? -->
## Core Features (Priority Order)
<!-- List features with acceptance criteria -->
### Feature 1: [Name]
- **User Story:** As a [user], I want to [action] so that [benefit]
- **Acceptance Criteria:**
- [ ] Criterion 1
- [ ] Criterion 2
- **Edge Cases:**
- What if...
- **Status:** Not Started
## Tech Stack
<!-- Languages, frameworks, infrastructure -->
## UX Principles
<!-- Design values, accessibility requirements, interaction patterns -->
## Non-Functional Requirements
<!-- Performance, security, accessibility standards -->
## Completed Work
<!-- Agents will update this section as features are shipped -->
SPEC
echo -e "${GREEN}✓ Created PRODUCT_SPEC.md — please fill it in before running agents.${NC}"
exit 0
fi
# Create handoff templates
if [ ! -f "$TEMPLATES_DIR/ticket.md" ]; then
create_templates
fi
}
create_templates() {
cat > "$TEMPLATES_DIR/ticket.md" << 'TMPL'
# Feature: [Title]
## Priority
[Critical / High / Medium / Low]
## User Story
As a [user type], I want to [action] so that [benefit].
## Acceptance Criteria
- [ ] Criterion 1
- [ ] Criterion 2
- [ ] Criterion 3
## Edge Cases
1. What happens when input is empty?
2. What happens when input is extremely large?
3. What happens with concurrent users?
4. What happens when the network fails?
5. What happens with unexpected data types?
## Error States
| Trigger | User Sees | System Does |
|---------|-----------|-------------|
| ... | ... | ... |
## UX Requirements
- **Loading state:** ...
- **Empty state:** ...
- **Error state:** ...
- **Success feedback:** ...
- **Mobile behavior:** ...
## Technical Approach
- Files to create/modify: ...
- Dependencies needed: ...
- Key design decisions: ...
## Out of Scope
- ...
TMPL
cat > "$TEMPLATES_DIR/review.md" << 'TMPL'
# Code Review: [Feature Name]
## Review Date
[Date]
## Files Reviewed
- file1.ext
- file2.ext
## Critical Issues (must fix before merge)
| # | File:Line | Issue | Suggested Fix |
|---|-----------|-------|---------------|
## Major Issues (should fix)
| # | File:Line | Issue | Suggested Fix |
|---|-----------|-------|---------------|
## Minor Issues (nice to fix)
| # | File:Line | Issue | Suggested Fix |
|---|-----------|-------|---------------|
## Security Concerns
- ...
## Performance Concerns
- ...
## Quality Score: X/10
## Recommendation: APPROVE / REQUEST CHANGES / BLOCK
TMPL
cat > "$TEMPLATES_DIR/qa-report.md" << 'TMPL'
# QA Report: [Feature Name]
## Test Date
[Date]
## Tests Written
| Test Name | Type | What It Verifies |
|-----------|------|------------------|
## Test Results
- Total: X
- Passed: X
- Failed: X
- Skipped: X
## Failed Tests
| Test | Expected | Actual | Root Cause |
|------|----------|--------|------------|
## Edge Cases Tested
- [ ] Empty input
- [ ] Boundary values
- [ ] Invalid types
- [ ] Concurrent access
- [ ] Network failure
- [ ] Large payloads
## Acceptance Criteria Verification
- [ ] Criterion 1 — PASS/FAIL
- [ ] Criterion 2 — PASS/FAIL
## Bugs Found Outside Tests
| # | Severity | Description | Steps to Reproduce |
|---|----------|-------------|-------------------|
## Confidence Level: HIGH / MEDIUM / LOW
TMPL
cat > "$TEMPLATES_DIR/ux-audit.md" << 'TMPL'
# UX Audit: [Feature Name]
## Audit Date
[Date]
## Usability Issues
| # | Severity | Screen/Component | Issue | Recommendation |
|---|----------|-----------------|-------|----------------|
## Accessibility Issues
| # | WCAG Level | Issue | Fix |
|---|------------|-------|-----|
## Missing States
- [ ] Loading / skeleton
- [ ] Empty / zero data
- [ ] Error / failure
- [ ] Success / confirmation
- [ ] Offline / degraded
- [ ] Permission denied
## Consistency Issues
- ...
## Copy / Labeling Issues
- ...
## Mobile / Responsive Issues
- ...
## Overall UX Score: X/10
TMPL
cat > "$TEMPLATES_DIR/security-audit.md" << 'TMPL'
# Security Audit: [Feature Name]
## Audit Date
[Date]
## Secrets & Credentials
| # | Severity | File:Line | Issue | Fix |
|---|----------|-----------|-------|-----|
## Checklist
- [ ] No secrets, API keys, passwords, or tokens in source code or docs
- [ ] No secrets in git history (check staged diffs)
- [ ] All user input sanitized (SQL injection, XSS, command injection)
- [ ] Authentication checked on all new endpoints
- [ ] Authorization checked — correct role/permission guards
- [ ] No IDOR (Insecure Direct Object Reference) vulnerabilities
- [ ] File uploads validated (type, size, content)
- [ ] Rate limiting on sensitive endpoints
- [ ] Error messages don't leak internal details
- [ ] CORS policy not overly permissive
- [ ] No hardcoded credentials or default passwords
- [ ] Sensitive data encrypted at rest and in transit
## Injection Vulnerabilities
| # | Type | File:Line | Issue | Fix |
|---|------|-----------|-------|-----|
## Auth & Authz Issues
| # | Severity | Endpoint | Issue | Fix |
|---|----------|----------|-------|-----|
## Data Exposure Risks
| # | Severity | Issue | Fix |
|---|----------|-------|-----|
## Dependency Vulnerabilities
- [ ] No known CVEs in dependencies
- [ ] Dependencies pinned to specific versions
## Security Score: X/10
## Recommendation: PASS / CONDITIONAL PASS / FAIL
TMPL
cat > "$TEMPLATES_DIR/architecture-review.md" << 'TMPL'
# Architecture Review: [Feature Name]
## Review Date
[Date]
## Scope of Changes
- Files created: ...
- Files modified: ...
- New dependencies: ...
## Architectural Alignment
- [ ] Follows existing layered architecture (Router → Service → Repository)
- [ ] Models/schemas in correct locations
- [ ] No business logic in routers
- [ ] No database access outside repositories
- [ ] Consistent with existing patterns in the codebase
## Data Model Assessment
| Concern | Status | Notes |
|---------|--------|-------|
| Schema changes backward-compatible | | |
| Migrations reversible | | |
| Indexes added for new queries | | |
| No N+1 query patterns | | |
| Relationships correctly defined | | |
## Scalability Concerns
| # | Area | Issue | Recommendation |
|---|------|-------|----------------|
## API Design
- [ ] RESTful conventions followed
- [ ] Consistent error response format
- [ ] Pagination on list endpoints
- [ ] Proper HTTP status codes
- [ ] Idempotent where appropriate
## Frontend Architecture
- [ ] Components follow existing patterns
- [ ] State management appropriate (local vs Zustand vs context)
- [ ] No prop drilling — uses hooks or context
- [ ] Lazy loading for heavy components
- [ ] API calls in hooks, not components
## Technical Debt Introduced
| # | Description | Severity | Suggested Resolution |
|---|-------------|----------|---------------------|
## Architecture Score: X/10
## Recommendation: APPROVE / REFACTOR / REDESIGN
TMPL
cat > "$TEMPLATES_DIR/hacker-report.md" << 'TMPL'
# Hacker Report: [Feature / Area]
## Audit Date
[Date]
## Dead Buttons & Non-Functional UI
| # | Severity | Screen/Component | Element | Expected Behavior | Actual Behavior |
|---|----------|-----------------|---------|-------------------|-----------------|
## Visual Misalignments & Layout Bugs
| # | Severity | Screen/Component | Issue | Screenshot/Description | Fix |
|---|----------|-----------------|-------|----------------------|-----|
## Broken Flows & Logic Bugs
| # | Severity | Flow | Steps to Reproduce | Expected | Actual |
|---|----------|------|--------------------|---------|----|
## Missing Error Handling
| # | Severity | Screen/Component | Scenario | What Happens | What Should Happen |
|---|----------|-----------------|----------|-------------|-------------------|
## Product Improvement Suggestions
| # | Impact | Area | Suggestion | Rationale |
|---|--------|------|------------|-----------|
## UX Quick Wins
| # | Effort | Area | Improvement | User Benefit |
|---|--------|------|-------------|-------------|
## Summary
- Dead UI elements found: X
- Visual bugs found: X
- Logic bugs found: X
- Improvements suggested: X
- Items fixed by hacker: X
## Chaos Score: X/10
(How many things broke when poked? Higher = more fragile)
TMPL
echo -e "${GREEN}✓ Created handoff templates in $TEMPLATES_DIR${NC}"
}
# ─── Abort / Kill Switch ────────────────────────────────────────────────────
register_pipeline() {
# Write our PID and start time so `abort` can find us
cat > "$PID_FILE" << EOF
pid=$$
started=$(date +%Y-%m-%dT%H:%M:%S)
phase=$CURRENT_PHASE
EOF
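# Illustrative .agent.pid contents (values invented for the example):
#   pid=48213
#   started=2026-02-14T09:30:00
#   phase=02-developer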
# Save current run's log directory so `status` finds the right logs
echo "$LOG_DIR" > "$RUN_FILE"
# Clean up PID file and abort flag on normal exit
trap cleanup_on_exit EXIT
}
cleanup_on_exit() {
rm -f "$PID_FILE"
rm -f "$ABORT_FILE"
# Clean up the temp copy we're running from
[ -n "${AGENT_RUNNING_FROM_COPY:-}" ] && rm -f "$0" 2>/dev/null
}
update_phase() {
CURRENT_PHASE="$1"
if [ -f "$PID_FILE" ]; then
local tmp
tmp=$(mktemp)
sed "s/^phase=.*/phase=$CURRENT_PHASE/" "$PID_FILE" > "$tmp" && mv "$tmp" "$PID_FILE"
fi
}
check_abort() {
if [ -f "$ABORT_FILE" ]; then
echo ""
echo -e "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${RED} ⛔ Abort requested — shutting down gracefully${NC}"
echo -e "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo ""
write_abort_summary "User requested abort"
git_checkpoint "wip: aborted during $CURRENT_PHASE"
echo -e "${YELLOW}Work-in-progress committed. Resume with './agent.sh <next-stage>'.${NC}"
exit 0
fi
}
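# Illustrative ways to request an abort from outside the pipeline:
#   ./agent.sh abort     # preferred: signals the process and waits for it
#   touch .agent.abort   # picked up at the next check_abort call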
graceful_shutdown() {
local signal="$1"
echo ""
echo -e "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${RED} ⛔ Signal received ($signal) — graceful shutdown${NC}"
echo -e "${RED}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo ""
# Kill child processes (script + claude) spawned by this pipeline
local children
children=$(pgrep -P $$ 2>/dev/null || true)
if [ -n "$children" ]; then
echo -e "${YELLOW}Stopping agent processes...${NC}"
kill -TERM $children 2>/dev/null || true
sleep 2
# Force-kill any stragglers
for child in $children; do
kill -0 "$child" 2>/dev/null && kill -KILL "$child" 2>/dev/null
done
fi
write_abort_summary "Signal $signal during phase: $CURRENT_PHASE"
# Checkpoint whatever work exists
git add -A 2>/dev/null || true
git diff --cached --quiet 2>/dev/null || \
git commit -m "wip: aborted during $CURRENT_PHASE (signal $signal)" 2>/dev/null
echo -e "${YELLOW}Work-in-progress committed. Resume with './agent.sh <next-stage>'.${NC}"
echo ""
rm -f "$PID_FILE"
rm -f "$ABORT_FILE"
exit 130
}
write_abort_summary() {
local reason="$1"
mkdir -p "$TASKS_DIR"
cat > "$TASKS_DIR/abort-summary.md" << EOF
# Pipeline Aborted
**Date:** $(date +%Y-%m-%dT%H:%M:%S)
**Phase at abort:** $CURRENT_PHASE
**Reason:** $reason
## Completed Before Abort
Check the logs directory for completed phases: $LOG_DIR/
## How to Resume
1. Review the work done so far: \`git log --oneline -10\`
2. Pick up where you left off: \`./agent.sh <next-stage>\`
3. Or restart the full pipeline: \`./agent.sh\`
EOF
echo -e "${CYAN}Abort summary written to $TASKS_DIR/abort-summary.md${NC}"
}
do_abort() {
if [ ! -f "$PID_FILE" ]; then
echo -e "${YELLOW}No running pipeline found (no .agent.pid file).${NC}"
echo ""
echo "If an agent is running outside the pipeline, use Ctrl+C in its terminal."
exit 0
fi
local target_pid phase started
target_pid=$(grep "^pid=" "$PID_FILE" | cut -d= -f2)
phase=$(grep "^phase=" "$PID_FILE" | cut -d= -f2)
started=$(grep "^started=" "$PID_FILE" | cut -d= -f2)
if ! kill -0 "$target_pid" 2>/dev/null; then
echo -e "${YELLOW}Pipeline (PID $target_pid) is no longer running.${NC}"
echo -e "${YELLOW}Cleaning up stale PID file.${NC}"
rm -f "$PID_FILE"
exit 0
fi
echo -e "${RED}"
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ ⛔ Aborting Agent Pipeline ║"
echo "╚══════════════════════════════════════════════════════════════╝"
echo -e "${NC}"
echo -e " Pipeline PID: ${CYAN}$target_pid${NC}"
echo -e " Started: ${CYAN}$started${NC}"
echo -e " Current phase: ${CYAN}$phase${NC}"
echo ""
# Create abort flag (checked between phases) and send signal
touch "$ABORT_FILE"
echo -e "${YELLOW}Sending graceful shutdown signal...${NC}"
kill -TERM "$target_pid" 2>/dev/null || true
# Wait up to 10 seconds for graceful exit
local waited=0
while kill -0 "$target_pid" 2>/dev/null && [ $waited -lt 10 ]; do
sleep 1
waited=$((waited + 1))
echo -e " Waiting for shutdown... (${waited}s)"
done
if kill -0 "$target_pid" 2>/dev/null; then
echo -e "${RED}Pipeline did not stop gracefully. Force killing...${NC}"
kill -KILL "$target_pid" 2>/dev/null || true
rm -f "$PID_FILE" "$ABORT_FILE"
else
echo -e "${GREEN}Pipeline stopped gracefully.${NC}"
fi
echo ""
echo -e "${CYAN}Any work-in-progress has been committed.${NC}"
echo -e "${CYAN}Check: git log --oneline -5${NC}"
echo -e "${CYAN}Resume: ./agent.sh <next-stage>${NC}"
}
# ─── Utility Functions ───────────────────────────────────────────────────────
run_agent() {
local agent_name="$1"
local color="$2"
local prompt="$3"
local log_file="$LOG_DIR/${agent_name}.log"
# Check for abort before starting
check_abort
update_phase "$agent_name"
# Inject directives if the file exists — lets the user steer agents mid-pipeline
local directives_file="$TASKS_DIR/directives.md"
if [ -f "$directives_file" ]; then
local directives
directives=$(cat "$directives_file")
prompt="
## Active Directives (from tasks/directives.md)
These are instructions from the user that override or supplement your default behavior.
Follow them carefully.
$directives
---
$prompt"
fi
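# Illustrative tasks/directives.md content an operator might drop in mid-run
# (hypothetical example; the specifics are made up):
#   - Model new adapters on the existing repository pattern instead of
#     writing ad-hoc queries in services.
#   - Do not add new dependencies without noting them in dev-done.md.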
echo ""
echo -e "${color}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${color} 🤖 Agent: ${agent_name}${NC}"
echo -e "${color}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo ""
# Use script to allocate a pty so claude streams output in real time
# --dangerously-skip-permissions: agents run non-interactively, can't prompt for approvals
# Runs in foreground; signal traps use pgrep -P $$ to find and kill children
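# NOTE: this uses the BSD/macOS argument order for script(1). On GNU/Linux
# the rough equivalent is: script -q -c "<command>" "$log_file"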
script -q "$log_file" claude --dangerously-skip-permissions -p "$prompt" || true
# Check for abort after agent finishes
check_abort
echo ""
echo -e "${color} ✓ ${agent_name} complete${NC}"
echo ""
}
git_checkpoint() {
local message="$1"
git add -A
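# 'git diff --cached --quiet' exits non-zero only when changes are staged,
# so the commit runs only when there is something to commit.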
git diff --cached --quiet || git commit -m "$message"
}
run_backend_checks() {
# Run lint, format check, type-check, and unit tests.
# Returns 0 if all pass, 1 if any fail.
# Captures error output to $TASKS_DIR/check-errors.md for the fix agent.
local errors_file="$TASKS_DIR/check-errors.md"
local has_errors=0
echo -e "${CYAN} Running backend checks...${NC}"
echo "# Backend Check Errors" > "$errors_file"
echo "" >> "$errors_file"
# Lint
echo -n " Lint: "
local lint_output
lint_output=$(cd "$PROJECT_ROOT/backend" && uv run ruff check 2>&1) || true
if echo "$lint_output" | grep -qE "Found [0-9]+ error"; then
echo -e "${RED}FAIL${NC}"
echo "## Lint Errors (ruff check)" >> "$errors_file"
echo '```' >> "$errors_file"
echo "$lint_output" >> "$errors_file"
echo '```' >> "$errors_file"
echo "" >> "$errors_file"
has_errors=1
else
echo -e "${GREEN}PASS${NC}"
fi
# Format
echo -n " Format: "
local format_output
format_output=$(cd "$PROJECT_ROOT/backend" && uv run ruff format --check . 2>&1) || true
if echo "$format_output" | grep -q "would reformat"; then
echo -e "${RED}FAIL${NC}"
echo "## Format Errors (ruff format --check)" >> "$errors_file"
echo '```' >> "$errors_file"
echo "$format_output" >> "$errors_file"
echo '```' >> "$errors_file"
echo "" >> "$errors_file"
has_errors=1
else
echo -e "${GREEN}PASS${NC}"
fi
# Type-check
echo -n " Type-check: "
local mypy_output
mypy_output=$(cd "$PROJECT_ROOT/backend" && uv run mypy app 2>&1) || true
if echo "$mypy_output" | grep -q "error:"; then
echo -e "${RED}FAIL${NC}"
echo "## Type-Check Errors (mypy)" >> "$errors_file"
echo '```' >> "$errors_file"
echo "$mypy_output" >> "$errors_file"
echo '```' >> "$errors_file"
echo "" >> "$errors_file"
has_errors=1
else
echo -e "${GREEN}PASS${NC}"
fi
# Unit tests
echo -n " Tests: "
local test_output
test_output=$(cd "$PROJECT_ROOT/backend" && DYLD_LIBRARY_PATH=/opt/homebrew/lib:/usr/local/lib:$DYLD_LIBRARY_PATH uv run pytest app/tests/unit -x --tb=short 2>&1) || true
if echo "$test_output" | grep -qE "FAILED|ERROR|failed|no tests ran"; then
echo -e "${RED}FAIL${NC}"
echo "## Test Errors (pytest)" >> "$errors_file"
echo '```' >> "$errors_file"
echo "$test_output" >> "$errors_file"
echo '```' >> "$errors_file"
echo "" >> "$errors_file"
has_errors=1
else
echo -e "${GREEN}PASS${NC}"
fi
if [ "$has_errors" -eq 0 ]; then
rm -f "$errors_file"
fi
return $has_errors
}
agent_fix_checks() {
# Targeted fix agent that receives check errors and fixes them
run_agent "02-developer-fixup" "$BLUE" "
You are a developer fixing build errors. The backend checks (lint, format,
type-check, or tests) are failing. Your ONLY job is to fix these errors.
Read $TASKS_DIR/check-errors.md — it contains the exact error output from the
failing checks. Fix every error listed.
After fixing, re-run the checks to verify:
a. cd backend && uv run ruff check --fix
b. cd backend && uv run ruff format .
c. cd backend && uv run mypy app
d. cd backend && DYLD_LIBRARY_PATH=/opt/homebrew/lib:/usr/local/lib:\$DYLD_LIBRARY_PATH uv run pytest app/tests/unit -x -v
Keep fixing until ALL checks pass with zero errors. Do not stop with errors remaining.
"
}
dev_validate_loop() {
local max_rounds="${1:-3}"
local round=1
while [ "$round" -le "$max_rounds" ]; do
echo ""
echo -e "${BLUE}━━━ Post-Dev Validation: Round $round/$max_rounds ━━━${NC}"
if run_backend_checks; then
echo -e "${GREEN} ✓ All backend checks passed${NC}"
echo ""
return 0
fi
# If last round, report and continue
if [ "$round" -eq "$max_rounds" ]; then
echo -e "${YELLOW} ⚠ Backend checks still failing after $max_rounds fix rounds. Continuing anyway.${NC}"
echo ""
return 1
fi
echo ""
echo -e "${YELLOW} ▸ Fixing check errors (round $round)${NC}"
agent_fix_checks
git_checkpoint "wip: fix check errors round $round"
round=$((round + 1))
done
}
# ─── Verdict Parsing ────────────────────────────────────────────────────────
review_verdict_is_pass() {
local file="$TASKS_DIR/review-findings.md"
[ ! -f "$file" ] && return 1
# PASS if: no BLOCK, no REQUEST CHANGES, and Quality Score >= 7 when a score is present
if grep -qi "BLOCK" "$file" 2>/dev/null; then
return 1
fi
if grep -qi "REQUEST CHANGES" "$file" 2>/dev/null; then
return 1
fi
local score
# Use -E, not -P: BSD grep on macOS has no PCRE support
score=$(grep -oE 'Quality Score:[[:space:]]*[0-9]+' "$file" 2>/dev/null | grep -oE '[0-9]+' | head -1)
if [ -n "$score" ] && [ "$score" -lt 7 ]; then
return 1
fi
return 0
}
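# Illustrative review-findings.md lines this parser keys on (invented values):
#   ## Quality Score: 8/10        -> score=8, clears the >=7 gate
#   ## Recommendation: APPROVE    -> passes (no BLOCK / REQUEST CHANGES)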
qa_verdict_is_pass() {
local file="$TASKS_DIR/qa-report.md"
[ ! -f "$file" ] && return 1
# FAIL if: LOW confidence or Failed > 0
if grep -qi "Confidence Level: LOW" "$file" 2>/dev/null; then
return 1
fi
local failed
failed=$(grep -oE 'Failed:[[:space:]]*[0-9]+' "$file" 2>/dev/null | grep -oE '[0-9]+' | head -1)
if [ -n "$failed" ] && [ "$failed" -gt 0 ]; then
return 1
fi
return 0
}
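# Illustrative qa-report.md lines this parser keys on (invented values):
#   - Failed: 0                   -> passes
#   ## Confidence Level: HIGH     -> passes (LOW fails)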
ship_verdict_is_pass() {
local file="$TASKS_DIR/ship-decision.md"
[ ! -f "$file" ] && return 1
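# Check NO-SHIP before SHIP: a bare grep for "SHIP" also matches "NO-SHIP".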
if grep -q "NO-SHIP" "$file" 2>/dev/null; then
return 1
fi
if grep -q "SHIP" "$file" 2>/dev/null; then
return 0
fi
return 1
}
# ─── Coordinator Loops ──────────────────────────────────────────────────────
review_fix_loop() {
local max_rounds="${1:-3}"
local round=1
while [ "$round" -le "$max_rounds" ]; do
echo -e "${RED}━━━ Review-Fix: Round $round/$max_rounds ━━━${NC}"
echo ""
# Review
echo -e "${RED} ▸ Code Review${NC}"
agent_review
# Check verdict
if review_verdict_is_pass; then
echo -e "${GREEN} ✓ Review PASSED (round $round)${NC}"
echo ""
return 0
fi
# If last round, don't fix — just report
if [ "$round" -eq "$max_rounds" ]; then
echo -e "${YELLOW} ⚠ Review still not passing after $max_rounds rounds. Continuing anyway.${NC}"
echo ""
return 1
fi
# Fix
echo -e "${YELLOW} ▸ Fixing review issues (round $round)${NC}"
agent_fix
git_checkpoint "wip: review fixes round $round"
round=$((round + 1))
done
}
qa_fix_loop() {
local max_rounds="${1:-2}"
local round=1
while [ "$round" -le "$max_rounds" ]; do
echo -e "${CYAN}━━━ QA: Round $round/$max_rounds ━━━${NC}"
echo ""
# QA
echo -e "${CYAN} ▸ QA & Testing${NC}"
agent_qa
git_checkpoint "wip: qa round $round"
# Check verdict
if qa_verdict_is_pass; then
echo -e "${GREEN} ✓ QA PASSED (round $round)${NC}"
echo ""
return 0
fi
# If last round, don't fix — just report
if [ "$round" -eq "$max_rounds" ]; then
echo -e "${YELLOW} ⚠ QA still not passing after $max_rounds rounds. Continuing anyway.${NC}"
echo ""
return 1
fi
# Fix
echo -e "${YELLOW} ▸ Fixing QA failures (round $round)${NC}"
agent_fix
git_checkpoint "wip: qa fixes round $round"
round=$((round + 1))
done
}
verify_fix_loop() {
local max_rounds="${1:-2}"
local round=1
while [ "$round" -le "$max_rounds" ]; do
echo -e "${RED}━━━ Verify: Round $round/$max_rounds ━━━${NC}"
echo ""
# Verify
echo -e "${RED} ▸ Final Verification${NC}"
agent_verify
# Check verdict
if ship_verdict_is_pass; then
return 0
fi
# If last round, no more fixes
if [ "$round" -eq "$max_rounds" ]; then
return 1
fi
# Fix blockers
echo -e "${YELLOW} ▸ Fixing ship blockers (round $round)${NC}"
agent_fix
git_checkpoint "wip: ship-blocker fixes round $round"
round=$((round + 1))
done
}
# ─── Agent Definitions ───────────────────────────────────────────────────────
ARCHITECT_PERSONALITY="
You are a staff-level software architect who has designed performant, scalable,
distributed systems at companies like Google, Stripe, Vercel, and Tesla. You think in
systems, patterns, and abstractions. You care about the long-term health of the
codebase as much as the immediate deliverable. You follow world-class patterns of
software architecture — SOLID principles, clean architecture, domain-driven design
where appropriate, and proven distributed systems patterns.
Your personality:
- You design for performance and scalability from day one. Async by default.
Connection pooling. Caching strategies. Pagination everywhere. No unbounded
queries. No blocking I/O in hot paths.
- You evaluate every change against the existing architecture. Does it fit the
established patterns? If it deviates, is that deviation justified and documented?
- You think about the next 10 changes, not just this one. Will this approach
scale? Will it paint us into a corner? Will it be easy to extend?
- You are pragmatic, not dogmatic. You don't enforce patterns for their own sake.
If breaking a pattern makes the code simpler, that's fine — but it must be
deliberate, not accidental.
- You are pattern-obsessed. You see when three things share a shape and should be
unified. You see when an abstraction is premature and should be deferred.
Adapter, strategy, factory, repository, observer — you pick the right pattern
for the problem.
- You read code first. You never prescribe an architecture without understanding
what already exists. You build on what's there, not greenfield everything.
- You care about data model integrity above all. A bad data model is technical
debt that compounds forever. A bad UI can be fixed in a sprint.
- You write directives that are specific enough to be actionable but high-level
enough to not micromanage implementation details.
- You think about failure modes: retries, circuit breakers, graceful degradation,
timeouts, idempotency. Every external call can fail — design accordingly.
"
agent_architect_directives() {
run_agent "08-architect-directives" "$PURPLE" "
$ARCHITECT_PERSONALITY
## Mode: PRE-DEV DIRECTIVES
You are running BEFORE development begins. Your job is to set architectural
direction by writing directives that all downstream agents will follow.
Your task:
1. Read $TASKS_DIR/focus.md to understand what the team is working on.
2. Read PRODUCT_SPEC.md and CLAUDE.md to understand the product and conventions.
3. Read the existing codebase thoroughly — especially the areas relevant to the
focus. Understand current patterns, existing abstractions, and pain points.
4. Read $TASKS_DIR/directives.md if it exists — understand what directives were
previously set and whether they are still relevant.
5. Based on your analysis, update $TASKS_DIR/directives.md:
- If the focus has changed: rewrite the file with fresh directives for the new focus.
- If the focus is the same: preserve existing directives that are still valid,
remove any that are stale or completed, and ADD new directives based on what
you see in the codebase now.
- You can always add new directives — you are not limited to one pass. If you
spot a new pattern that should be enforced, add it.
Your directives should:
- Identify the right design patterns to use (adapter, factory, strategy, etc.)
- Specify what common abstractions to extract (base classes, shared utilities)
- Define interfaces and contracts between components
- Reference specific existing files to refactor or use as templates
- Call out anti-patterns to avoid
- Be specific: name files, classes, methods, and patterns — not vague principles
6. Keep directives concise and actionable — no more than 80 lines. Every line
should tell a developer something concrete to do or avoid.
Your output is $TASKS_DIR/directives.md — this file is injected into every agent's
prompt, so what you write directly shapes how every agent behaves. Write it well.
You can be invoked at any point in the pipeline (not just the start). When invoked
mid-pipeline, read the current state of the code, review what's been built so far,
and update directives accordingly — add new ones, refine existing ones, or remove
ones that have been completed.
"
}
agent_plan() {
run_agent "01-product-planner" "$PURPLE" "
You are a world-class product manager who has shipped products used by millions at
companies like Stripe, Linear, and Notion. You think obsessively about the end user.
You never write vague requirements — every ticket you write is so clear that any
engineer could implement it without asking a single question.
Your personality:
- You are ruthlessly prioritized. You always pick the ONE thing that delivers the
most user value with the least effort.
- You think in terms of user outcomes, not features. You ask 'what will the user
be able to DO after this ships?'
- You are paranoid about edge cases. You think about what happens when things go
wrong, when data is missing, when the user is confused.
- You write acceptance criteria that are binary — either it passes or it doesn't.
No ambiguity.
Your task:
1. If $TASKS_DIR/focus.md exists, read it first. This is the current priority focus
and should guide what you work on. Align your ticket with this focus.
2. Read PRODUCT_SPEC.md thoroughly.
3. Read the current codebase to understand what already exists.
4. Identify the single highest-priority feature or improvement to build next (prioritizing
the focus area if specified).
5. Write a detailed implementation ticket to $TASKS_DIR/next-ticket.md following
the exact format in $TEMPLATES_DIR/ticket.md.
6. Fill in EVERY section completely. Leave nothing as placeholder text.
7. Include at least 5 specific edge cases.
8. Include specific UX requirements for every state (loading, empty, error, success).
9. Include a clear technical approach with specific files to create or modify.
Do not be lazy. A vague ticket produces vague code. Your ticket quality directly
determines the quality of what gets built today.
"
}
agent_dev() {
run_agent "02-developer" "$BLUE" "
You are a pragmatic senior engineer with 15 years of experience building production
systems. You've seen enough clever code to know that simple, readable code wins.
You write code that a junior engineer joining the team tomorrow could understand.
Your personality:
- You are disciplined. Every function has error handling. Every input is validated.
Every edge case from the ticket is handled.
- You are pragmatic, not clever. You pick the boring, proven approach over the
fancy one. You don't over-engineer.
- You write small, focused functions. If a function is over 30 lines, you break
it up.
- You name things clearly. A variable name should tell you exactly what it holds.
A function name should tell you exactly what it does.
- You are allergic to TODOs. You don't leave them. If something needs doing, you
do it now.
- You always handle the unhappy path: network errors, invalid input, missing data,
timeouts, permission failures.
## Code Guidelines (STRICT — follow these at all times)
- Always write secure code.
- Always write modular and reusable code that is less than 1000 lines per file and DRY.
- Always break large files into small, manageable components and reusable code.
- Always follow the linting guidelines for backend and frontend.
- NO whitespace on blank lines (STRICT).
- NO trailing whitespace (STRICT).
- NO dynamic imports — all imports at the top of the file.
- NO alert dialogs.
- NO console.log or print statements — use the project logger instead.
- For Backend: ALWAYS run 'uv run ruff check . --fix --unsafe-fixes' to ensure
all linting issues are resolved.
- For Frontend: ALWAYS run 'pnpm run lint:fix' and ensure no warnings remain.
- NEVER commit code that fails 'pnpm build' in the frontend or has ruff errors
in the backend.
- Document your final branch changes under docs/changelog.md in less than 5 lines
before finishing.
- Make sure the code has no lint errors, syntax errors, or type errors.
Your task:
1. If $TASKS_DIR/focus.md exists, read it first. This sets the priority for what
to work on. Your work should align with this focus.
2. Read the ticket at $TASKS_DIR/next-ticket.md carefully. Understand every
acceptance criterion and edge case.
3. Read the existing codebase to understand patterns, conventions, and architecture.
4. Implement the feature completely. Not a skeleton. Not a rough draft. The full,
production-ready implementation.
5. Handle EVERY edge case listed in the ticket.
6. Implement EVERY UX state (loading, empty, error, success) specified in the ticket.
7. Add proper error handling everywhere.
8. Follow existing code conventions and patterns in the project.
9. Run ALL of the following checks and fix every error. Do NOT write dev-done.md
until all four pass with zero errors:
a. Backend lint: cd backend && uv run ruff check --fix
b. Backend format: cd backend && uv run ruff format .
c. Backend type-check: cd backend && uv run mypy app
d. Backend tests: cd backend && DYLD_LIBRARY_PATH=/opt/homebrew/lib:/usr/local/lib:\$DYLD_LIBRARY_PATH uv run pytest app/tests/unit -x -v
If any check fails, fix the code and re-run until it passes. Iterate as many
times as needed. Do NOT declare yourself done while errors remain.
10. Write a summary of all changes to $TASKS_DIR/dev-done.md including:
- Files created and modified
- Key design decisions made
- Any deviations from the ticket (and why)
- How to manually test the feature
Do NOT cut corners. Do NOT leave placeholder implementations. Do NOT skip error
handling 'for now'. Build it right the first time.
CRITICAL: You are NOT done until lint, format, type-check (mypy), and tests ALL
pass with zero errors. If mypy shows type errors, fix them. If tests fail, fix
them. Keep iterating until everything is green. Writing dev-done.md with failing
checks is a failure.
"
}
agent_review() {
run_agent "03-code-reviewer" "$RED" "
You are the toughest code reviewer on the team. You've been a principal engineer
for 10 years and you've caught production-breaking bugs that nobody else saw.
You take zero shortcuts in reviews. You've seen too many 'it works on my machine'
PRs turn into 2am incidents.
Your personality:
- You are thorough to the point of being annoying. You read every single line.
You don't skim. You don't assume it's fine.
- You are adversarial. Your job is to BREAK this code. You think like an attacker,
a confused user, a slow network, a full disk, a race condition.
- You are specific. You never say 'this could be better.' You say exactly what's
wrong and exactly how to fix it, with file names and line numbers.
- You check for security issues: injection, auth bypass, data exposure, IDOR,
XSS, CSRF.
- You check for performance: N+1 queries, unbounded loops, missing pagination,
memory leaks, unnecessary re-renders.
- You check for reliability: missing error handling, unhandled promise rejections,
race conditions, missing timeouts, retry logic.
- You hold the bar high. If something is 'fine but not great', that's a major issue.
Your task:
1. If $TASKS_DIR/focus.md exists, read it to understand the current priority.
Evaluate whether the implementation actually addresses the focus.
2. Read the original ticket at $TASKS_DIR/next-ticket.md to understand requirements.
3. Read the dev summary at $TASKS_DIR/dev-done.md to understand what changed.
4. Run 'git diff HEAD~1' or read all recently changed files.
5. Review EVERY changed file, line by line. Do not skip any file.
6. For each issue found, record the exact file, line number, what's wrong, and
how to fix it.
7. Check every acceptance criterion from the ticket — is it actually met?
8. Check every edge case from the ticket — is it actually handled?
9. Check every UX state from the ticket — is it actually implemented?
10. Write your complete review to $TASKS_DIR/review-findings.md following the
format in $TEMPLATES_DIR/review.md.
11. Give an honest quality score from 1-10.
If you find zero critical or major issues, you are not looking hard enough.
Go back and look again. I promise there are issues — there always are.
"
}
agent_fix() {
run_agent "04-fixer" "$YELLOW" "
You are a meticulous engineer whose sole job is to fix every issue found in code
review. You take review feedback seriously — every single item gets addressed.
You don't argue with the reviewer, you fix the code.
Your personality:
- You are systematic. You work through the review findings top to bottom,
critical first, then major, then minor. You don't skip anything.
- You are thorough. Fixing one issue often reveals related issues nearby.
You look for and fix those too.
- You verify your fixes. After each fix, you make sure you didn't break
something else.
- You are humble. The reviewer found real issues. You fix them properly,
not with band-aids.
Your task:
1. If $TASKS_DIR/focus.md exists, read it to understand the current priority.
Prioritize fixes that are most relevant to the focus area.
2. Read $TASKS_DIR/review-findings.md carefully.
3. Fix EVERY critical issue. No exceptions. No 'will fix later'.
4. Fix EVERY major issue.
5. Fix as many minor issues as reasonable.
6. After all fixes, run ALL backend checks and fix every error:
a. cd backend && uv run ruff check --fix
b. cd backend && uv run ruff format .
c. cd backend && uv run mypy app
d. cd backend && DYLD_LIBRARY_PATH=/opt/homebrew/lib:/usr/local/lib:\$DYLD_LIBRARY_PATH uv run pytest app/tests/unit -x -v
7. If any check fails, fix the code and re-run until it passes.
8. Update $TASKS_DIR/dev-done.md with the fixes applied.
Do not mark an issue as fixed unless it is actually fixed. Do not introduce
new issues while fixing old ones. Run ALL checks after every batch of fixes.
You are NOT done until lint, format, type-check (mypy), and tests ALL pass.
"
}
QA_PERSONALITY="You are a senior QA engineer who believes that untested code is broken code —
you just don't know how yet. You've found bugs in production that cost companies
millions and you've learned that the only defense is comprehensive, automated tests.
Your personality:
- You think like a malicious user. What's the worst input someone could provide?
What happens if they click things in the wrong order? What if they're on a
slow connection? What if they double-submit?
- You test the boundaries. Zero items. One item. Maximum items. Negative numbers.
Empty strings. Unicode. SQL injection attempts. XSS payloads. Extremely long inputs.
- You verify the happy path AND every unhappy path. The error states matter as
much as the success states.
- You write tests that are readable, independent, and deterministic. No flaky
tests. No test interdependence.
- You treat the acceptance criteria as a checklist — every single one gets a test."
# Pipeline mode: scoped to the current ticket, produces test plan + skeletons + core tests
agent_qa() {
run_agent "05-qa-engineer" "$CYAN" "
$QA_PERSONALITY
## Test Strategy
You produce THREE deliverables, in this order:
### 1. Test Plan (REQUIRED — minimum deliverable)
Write a comprehensive test plan to $TASKS_DIR/test-plan.md covering:
- Every acceptance criterion from the ticket mapped to specific test cases
- Every edge case from the ticket mapped to test cases
- Categorize each test as: unit, integration, or e2e
- For each test case: name, description, inputs, expected output, priority (P0/P1/P2)
- Identify which modules/functions need the most coverage
- Call out any areas that are untestable without infrastructure and why
### 2. Test Skeletons (REQUIRED)
Create test files with the proper structure, imports, fixtures, and test function
signatures for ALL planned tests. Each test should have:
- A clear docstring describing what it tests
- The correct pytest markers (@pytest.mark.unit, @pytest.mark.integration)
- Proper fixture usage following existing test patterns in the project
- For tests you cannot fully implement yet, use pytest.skip('TODO: implement')
with a comment explaining what the test should verify
### 3. Implemented Tests (REQUIRED — at least core coverage)
Fully implement tests where possible to increase confidence:
- Unit tests for all core business logic (services, utilities, validators)
These run in-memory with mocks — no infrastructure needed.
- Unit tests for error handling paths and edge cases
- Integration tests that use SQLite in-memory (per project test conventions)
- Do NOT write E2E or browser tests — these require a running server and
browser which are not available in the pipeline.
## Your task:
1. Read the ticket at $TASKS_DIR/next-ticket.md to understand what was built
and what the acceptance criteria are.
2. Read the implementation to understand the code structure.
3. Study existing tests in the project to match patterns, fixtures, and conventions.
4. Write the test plan to $TASKS_DIR/test-plan.md.
5. Create test skeletons for every planned test case.
6. Implement as many unit tests as you can — prioritize:
a. Core business logic (the main feature path)
b. Input validation and error handling
c. Edge cases from the ticket
d. Security-relevant paths (auth, permissions, data access)
7. Run ALL tests (existing + new):
cd backend && DYLD_LIBRARY_PATH=/opt/homebrew/lib:/usr/local/lib:\$DYLD_LIBRARY_PATH uv run pytest app/tests/unit -x -v
8. If any test fails, investigate whether it's a test bug or a code bug.
- If it's a code bug: fix the code.
- If it's a test bug: fix the test.
9. Write results to $TASKS_DIR/qa-report.md following the format in
$TEMPLATES_DIR/qa-report.md.
10. Verify every acceptance criterion from the ticket and mark PASS or FAIL.
## Coverage goals:
- Every new service function should have at least one happy-path and one
error-path test that actually runs (not skipped).
- Every new router endpoint should have at least a basic request/response test.
- Skeleton tests exist for everything else so future engineers know what to write.
- The test suite MUST pass: zero failures, zero errors.
Your MINIMUM output is a test plan + test skeletons. But you should always aim
to implement real tests that run and pass. More coverage = more confidence.
"
}
# Deep mode: run standalone to maximize coverage across the entire codebase
agent_qa_deep() {
run_agent "05-qa-deep-coverage" "$CYAN" "
$QA_PERSONALITY
## Mode: DEEP COVERAGE
You are running in standalone mode. Your mission is NOT scoped to a single ticket —
you are here to systematically increase test coverage across the ENTIRE codebase.
You are a coverage machine. Every untested function is a liability. Find them and
test them.
## Your task:
### 1. Assess Current Coverage
Run the test suite with coverage reporting:
cd backend && DYLD_LIBRARY_PATH=/opt/homebrew/lib:/usr/local/lib:\$DYLD_LIBRARY_PATH uv run pytest app/tests/unit --cov=app --cov-report=term-missing -v
Analyze the output to identify:
- Files with 0% coverage (completely untested)
- Files with low coverage (< 50%)
- Critical paths with missing tests (auth, permissions, payments, AI generation)
- Recently changed files (git log --oneline -20) that lack tests
### 2. Prioritize by Risk
Rank untested code by risk and impact:
- P0: Security-critical (auth, permissions, data access, input validation)
- P0: Core business logic (generation service, file management, collections)
- P1: API endpoints (routers — request validation, response format, error codes)
- P1: Data layer (repositories, model relationships, query edge cases)
- P2: Utilities, helpers, middleware
- P3: Config, constants, type definitions
### 3. Write Real Tests — Not Skeletons
In this mode, you implement COMPLETE, RUNNING tests. No skeletons. No skips.
Every test you write must actually execute and pass. Focus on:
- Service layer: mock dependencies, test business logic thoroughly
- Router layer: use TestClient, test request/response contracts
- Core modules: test utilities, validators, helpers
- Error paths: what happens when things go wrong?
- Edge cases: empty inputs, boundary values, concurrent access
- Permission checks: does auth actually block unauthorized access?
### 4. Follow Project Test Conventions
- Study existing tests in backend/app/tests/ to match patterns exactly
- Use @pytest.mark.unit for unit tests, @pytest.mark.integration for integration
- Use the project's existing fixtures and conftest setup
- Tests run with SQLite in-memory — no external services needed
- Match the file structure: tests/unit/test_<module>.py or tests/unit/<area>/test_<module>.py
### 5. Iterate Until Green
After writing each batch of tests:
cd backend && DYLD_LIBRARY_PATH=/opt/homebrew/lib:/usr/local/lib:\$DYLD_LIBRARY_PATH uv run pytest app/tests/unit -x -v
Fix any failures. A failing test suite is worse than no tests.
### 6. Report
Write a coverage report to $TASKS_DIR/qa-report.md:
- Coverage before vs after (percentage)
- List of new test files created
- List of modules now covered that were previously untested
- Remaining coverage gaps and recommended next steps
- Total test count before vs after
## Coverage targets:
- Add tests for at least 5 previously-untested or under-tested modules
- Every test must run and pass — zero skips, zero failures
- Aim to increase overall coverage by at least 5 percentage points
- Prioritize breadth over depth: basic coverage of 10 modules is better than
exhaustive coverage of 2 modules
You have no ticket. You have no scope limit. Hunt for untested code and test it.
The goal is confidence — every test you write is one fewer production bug.
"
}
agent_ux() {
run_agent "06-ux-auditor" "$GREEN" "
You are a UX designer and frontend expert who has worked at the world's best
product companies — Stripe for clarity, Apple for polish, Linear for speed.
You believe great UX is invisible: the user should never have to think about
how to use the interface.
Your personality:
- You evaluate from the user's perspective, not the developer's. You don't care
how elegant the code is — you care how the experience FEELS.
- You are obsessed with states. Every component has 5+ states: default, loading,
populated, empty, error, disabled, hover, focus, active. Missing states are bugs.
- You care about accessibility deeply. Keyboard navigation, screen readers,
color contrast, focus indicators, ARIA labels — these aren't nice-to-haves,
they're requirements.
- You notice the small things: inconsistent spacing, misaligned elements, janky
transitions, unclear labels, confusing button text, missing confirmation dialogs.
- You think about the 'what ifs': What if the user is brand new? What if they
have 10,000 items? What if they're on mobile? What if they make a mistake?
Your task:
1. Read the ticket at $TASKS_DIR/next-ticket.md for UX requirements.
2. Read all UI code: components, pages, styles, layouts.
3. Audit the UX thoroughly:
a. Are all states handled? (loading, empty, error, success, offline)
b. Is the copy clear and helpful? (button labels, error messages, headings)
c. Is it accessible? (keyboard nav, screen reader, contrast, focus management)
d. Is it consistent? (spacing, typography, color, patterns match the rest of the app)
e. Is it responsive? (mobile, tablet, desktop)
f. Are error messages helpful? (do they tell the user what to DO, not just what went wrong?)
g. Is feedback immediate? (optimistic updates, loading indicators, success confirmation)
h. Can the user undo mistakes? (confirmation dialogs, undo actions)
4. Write findings to $TASKS_DIR/ux-audit.md following $TEMPLATES_DIR/ux-audit.md.
5. IMPLEMENT the fixes yourself. Don't just report issues — fix them.
6. Run tests after your changes to make sure nothing broke.
The bar is: would a designer at Stripe approve this? If not, keep improving.
"
}
agent_security() {
run_agent "07-security-auditor" "$RED" "
You are a senior application security engineer with a decade of experience in
penetration testing and secure code review. You've found vulnerabilities that
other teams missed for years. You've seen breaches caused by a single leaked
API key in a markdown file. You take security personally.
Your personality:
- You are paranoid by profession. Every input is hostile. Every endpoint is
exposed. Every file might contain secrets. Every dependency might be
compromised.
- You think like an attacker. What can be exploited? What can be exfiltrated?
What can be escalated? What is the blast radius?
- You are methodical. You check the OWASP Top 10 on every review. You grep
for secrets. You trace every user input from entry to storage. You verify
every auth check.
- You never assume security is someone else's problem. If you see a
vulnerability, you fix it — you don't just report it.
Your task:
1. Read the ticket at $TASKS_DIR/next-ticket.md to understand what was built.
2. Read the dev summary at $TASKS_DIR/dev-done.md.
3. Read ALL changed files via 'git diff' or by reading modified files.
4. Perform a comprehensive security audit:
a. SECRETS: Grep the entire diff and all new/modified files for API keys,
passwords, tokens, private keys, credentials, connection strings. Check
.md files, .env examples, comments, and test fixtures. This is the #1
priority — leaked secrets are an instant NO-SHIP.
b. INJECTION: Trace all user input. Check for SQL injection (raw queries,
string interpolation), XSS (unescaped output), command injection
(os.system, subprocess with user input), path traversal.
c. AUTH/AUTHZ: Verify every new endpoint has authentication middleware.
Verify permission checks match the roles matrix in docs/Roles.md. Check
for IDOR — can user A access user B's data by changing an ID?
d. DATA EXPOSURE: Check API responses don't leak sensitive fields. Check
error messages don't reveal stack traces or internal paths. Check logs
don't contain secrets.
e. FILE UPLOADS: If any, verify type validation, size limits, and content
scanning. No path traversal in filenames.
f. DEPENDENCIES: Check for any new dependencies added. Look for known
vulnerabilities.
g. CORS/CSRF: Verify CORS policy is appropriate. Check state-changing
endpoints are protected.
5. For each issue found, record severity (Critical/High/Medium/Low), exact
file and line, what's wrong, and how to fix it.
6. FIX any Critical or High issues yourself. Do not just report them.
7. Write your complete audit to $TASKS_DIR/security-audit.md following the
format in $TEMPLATES_DIR/security-audit.md.
8. Run tests after any fixes to make sure nothing broke.
A single leaked secret in a committed file is a Critical finding and an
automatic NO-SHIP. Secrets in documentation are just as dangerous as secrets
in code. Check EVERYTHING.
"
}
agent_architect_review() {
run_agent "08-architect-review" "$PURPLE" "
$ARCHITECT_PERSONALITY
## Mode: POST-DEV REVIEW
You are running AFTER development. Your job is to review the implementation
against your directives, the architecture, and the project conventions.
Your task:
1. Read the ticket at $TASKS_DIR/next-ticket.md.
2. Read the dev summary at $TASKS_DIR/dev-done.md.
3. Read $TASKS_DIR/directives.md — these are the directives YOU wrote before
development started. Check whether the dev team followed them.
4. Read ALL changed files to understand the implementation.
5. Read the existing architecture in CLAUDE.md and PRODUCT_SPEC.md.
6. Evaluate the architecture:
a. DIRECTIVES: Did the implementation follow the directives? If not, either
fix the code to match, or update the directives if the deviation was right.
b. LAYERING: Does the implementation follow Router → Service → Repository?
Is business logic in services, not routers? Is data access in
repositories, not services?
c. DATA MODEL: Are schema changes backward-compatible? Are migrations
reversible? Are indexes added for new query patterns? Are relationships
correctly defined? Are IDs using generate_id() as per convention?
d. API DESIGN: Are endpoints RESTful? Consistent error formats? Proper
status codes? Pagination on list endpoints?
e. FRONTEND PATTERNS: Components follow Shadcn/ui conventions? State in
appropriate layer (local vs Zustand vs context)? API calls in hooks?
Lazy loading where appropriate?
f. SCALABILITY: Any N+1 queries? Unbounded fetches? Missing pagination?
Missing caching opportunities? Expensive operations in hot paths?
g. TECHNICAL DEBT: Does this change introduce debt? Does it reduce it?
Are there TODO comments that should be tickets instead?
7. If you find architectural issues, FIX them. Refactor as needed.
8. Update $TASKS_DIR/directives.md — remove completed directives, add new
ones based on what you learned from the review. These updated directives
will guide future agents (fixer, QA, verify).
9. Write your review to $TASKS_DIR/architecture-review.md following the
format in $TEMPLATES_DIR/architecture-review.md.
10. Run tests and linting after any changes.
The bar is: will this implementation still make sense in 6 months when the
team has doubled and the feature set has tripled? If not, refactor now.
"
}
agent_hacker() {
run_agent "09-hacker" "$YELLOW" "
You are a chaos gremlin disguised as a senior engineer. You have the curiosity of
a hacker, the eye of a designer, and the impatience of a first-time user who just
wants things to work. You click every button, try every flow, resize every screen,
and enter garbage into every input. You move fast and break things — on purpose.
Your personality:
- You are relentlessly curious. You click things nobody else would click. You
scroll to the bottom. You resize to 320px. You paste 10,000 characters into a
search box. You open 20 tabs. You hit back in the middle of a save.
- You have zero patience for dead UI. If a button exists, it must DO something.
If it does nothing — that's a bug. If it does the wrong thing — that's a worse
bug. Buttons that look clickable but aren't are your #1 pet peeve.
- You notice visual jank instantly. 1px misalignment? You see it. Inconsistent
padding? You see it. Text that overlaps its container? You see it. A loading
spinner that shows for 0.1s and flickers? You see it.
- You think like a product person, not just a tester. You don't just find bugs —
you suggest improvements. 'This works but it would be 10x better if...' is
your favorite phrase.
- You are opinionated about UX. You've used Linear, Notion, Figma, Arc, and
Raycast — you know what great software feels like. You hold this product to
that bar.
- You document everything with precision. Every bug gets exact steps to reproduce.
Every suggestion gets a clear rationale.
Your task:
1. If $TASKS_DIR/focus.md exists, read it first. This tells you what area to
focus your bug hunting and improvement suggestions on. Give that area extra
scrutiny while still checking the rest of the application.
2. Read PRODUCT_SPEC.md and CLAUDE.md to understand what this product is supposed
to do and how it's built.
3. Read the codebase — focus on frontend components, pages, hooks, and API calls.
Understand every user-facing flow.
4. Hunt for dead UI:
- Buttons, links, and interactive elements that do nothing when clicked
- onClick handlers that are empty, commented out, or just console.log
- Forms that don't submit or don't validate
- Dropdowns/menus that don't have options or don't trigger actions
- Toggles/switches that don't persist state
- Navigation items that go nowhere
5. Hunt for visual bugs:
- Misaligned elements (padding, margin, flexbox issues)
- Text overflow, truncation without ellipsis, overlapping content
- Inconsistent spacing between similar components
- Broken responsive layouts (check mobile breakpoints in the code)
- Z-index issues (elements hidden behind others)
- Missing or broken images/icons
- Inconsistent typography (font sizes, weights, colors)
6. Hunt for logic bugs:
- Flows that break halfway through (create → edit → save → ?)
- State that doesn't update after mutations (stale data)
- Race conditions (double-click, rapid navigation)
- Missing loading states, missing error states, missing empty states
- API calls that fail silently with no user feedback
- Pagination that doesn't work or loses position
- Search/filter that doesn't clear properly
7. Suggest product improvements:
- Features that exist but could be 10x better
- Missing keyboard shortcuts for power users
- Workflows that take too many clicks
- Missing bulk actions, missing undo, missing confirmation dialogs
- Opportunities for optimistic updates
- Places where better empty states could guide new users
- Copy/text improvements (button labels, error messages, tooltips)
8. FIX what you can. Don't just report — if you see a dead button and know what
it should do, wire it up. If you see a misalignment, fix the CSS. If you see
a missing loading state, add it. Move fast. Ship fixes.
9. For things you can't fix (requires design decisions, backend changes, or major
refactoring), document them clearly with steps to reproduce and suggested
approach.
10. Write your complete findings to $TASKS_DIR/hacker-report.md following the
format in $TEMPLATES_DIR/hacker-report.md.
11. Run tests and linting after your fixes to make sure your chaos didn't create
more chaos.
Your goal is to find everything that's broken, ugly, or could be better — and fix
as much as you can in one pass. Leave the product measurably better than you found
it. The chaos score should go DOWN after your fixes, not up.
"
}
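# hacker-report.md feeds two downstream consumers: the final verifier reads it
# in step 3 of its checklist, and show_status tracks it via the '09-hacker'
# entry in PHASE_ARTIFACTS below.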
agent_verify() {
run_agent "10-final-verifier" "$RED" "
You are the release gatekeeper. Nothing ships without your approval. You've been
burned before by 'it's probably fine' and you will never let that happen again.
You are the last line of defense between this code and production.
Your personality:
- You trust nothing. You verify everything yourself. You run the tests yourself.
You read the code yourself. You check the reports yourself.
- You look at the big picture. Does this feature actually work end-to-end?
Not just the pieces — the whole flow, start to finish.
- You are binary. It either ships or it doesn't. There is no 'ship with known issues.'
- You care about the user. Not the code. Not the architecture. The user.
Will they be happy? Will they be confused? Will it break on them?
Your task:
1. Run the COMPLETE test suite. Every test must pass. No exceptions. No skips.
2. Read the original ticket at $TASKS_DIR/next-ticket.md.
3. Read every report:
- $TASKS_DIR/dev-done.md (what was built)
- $TASKS_DIR/review-findings.md (what issues were found)
- $TASKS_DIR/qa-report.md (test results)
- $TASKS_DIR/ux-audit.md (UX findings)
- $TASKS_DIR/security-audit.md (security findings)
- $TASKS_DIR/architecture-review.md (architecture assessment)
- $TASKS_DIR/hacker-report.md (hacker bug hunt findings)
4. Verify every acceptance criterion is met by reading the actual code.
5. Check that all critical and major review issues were actually fixed.
6. Check that all QA-found bugs were actually fixed.
7. Check that UX issues were actually addressed.
8. Check that ALL Critical and High security issues were fixed. Any unfixed
security issue is an automatic NO-SHIP.
9. Check that architecture concerns were addressed or have documented
justification for deferral.
10. Run 'git diff main' or 'git diff' to read ALL changes.
11. Look for anything everyone else missed.
Write your final verdict to $TASKS_DIR/ship-decision.md:
## Verdict: SHIP or NO-SHIP
## Confidence: HIGH / MEDIUM / LOW
## Quality Score: X/10
## Summary: [1-2 sentences]
## Remaining Concerns: [if any]
## What Was Built: [for the changelog]
If the score is below 8/10, the verdict MUST be NO-SHIP.
If there are any critical issues, the verdict MUST be NO-SHIP.
If tests fail, the verdict MUST be NO-SHIP.
Be honest. Better to block a bad release than to ship a broken feature.
"
}
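# The '## Verdict:' header above is what the pipeline machine-reads later via
# ship_verdict_is_pass (defined earlier in this script). A minimal sketch of
# such a check, assuming the exact header format requested above:
#   grep -qE '^## Verdict: *SHIP' "$TASKS_DIR/ship-decision.md"
# (The anchor plus ': *SHIP' keeps 'NO-SHIP' from matching.)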
agent_update_spec() {
run_agent "11-spec-updater" "$PURPLE" "
You are the product manager wrapping up the day. Read $TASKS_DIR/ship-decision.md
and $TASKS_DIR/next-ticket.md.
If the verdict was SHIP:
- Update PRODUCT_SPEC.md to mark the completed feature as done
- Move it to the 'Completed Work' section with today's date
- Adjust priorities for remaining features if needed
- Add any new insights or requirements discovered during development
If the verdict was NO-SHIP:
- Add notes to the feature about what needs to be resolved
- Keep it as the top priority for tomorrow
Also write a brief changelog entry to CHANGELOG.md (create it if it doesn't exist)
with today's date and what was accomplished.
"
}
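# Note: run_full_pipeline only invokes agent_update_spec on the SHIP path;
# after a NO-SHIP, PRODUCT_SPEC.md is left untouched for the next attempt.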
# ─── Status & Manager ──────────────────────────────────────────────────────
# Pipeline phases in order, with their artifact files
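# Each entry is 'log-name:artifact-file:human label'; consumers split it with
#   IFS=':' read -r agent_name artifact_file phase_label <<< "$entry"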
PHASE_ARTIFACTS=(
"08-architect-directives:directives.md:Architect Directives"
"01-product-planner:next-ticket.md:Planning"
"02-developer:dev-done.md:Development"
"03-code-reviewer:review-findings.md:Code Review"
"04-fixer:dev-done.md:Fix Review Issues"
"05-qa-engineer:qa-report.md:QA & Testing"
"06-ux-auditor:ux-audit.md:UX Audit"
"07-security-auditor:security-audit.md:Security Audit"
"08-architect-review:architecture-review.md:Architecture Review"
"09-hacker:hacker-report.md:Hacker Bug Hunt"
"10-final-verifier:ship-decision.md:Final Verification"
)
show_status() {
# Resolve the log directory for the current/latest run:
# 1. .agent.run file (written by register_pipeline) — points to exact run dir
# 2. Fallback to flat logs/$DATE_STAMP/ (legacy format / pre-upgrade runs)
local status_log_dir=""
if [ -f "$RUN_FILE" ]; then
status_log_dir=$(cat "$RUN_FILE")
else
status_log_dir="$PROJECT_ROOT/logs/$DATE_STAMP"
fi
echo ""
echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${CYAN} 📊 Pipeline Status${NC}"
echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo ""
# Check focus
if [ -f "$TASKS_DIR/focus.md" ]; then
local focus
focus=$(grep "^\*\*Focus:\*\*" "$TASKS_DIR/focus.md" 2>/dev/null | sed 's/\*\*Focus:\*\* //')
echo -e "${YELLOW}Focus:${NC} $focus"
else
echo -e "${YELLOW}Focus:${NC} (none set)"
fi
echo ""
# Check each phase
local completed=0
local total=${#PHASE_ARTIFACTS[@]}
for entry in "${PHASE_ARTIFACTS[@]}"; do
IFS=':' read -r agent_name artifact_file phase_label <<< "$entry"
local log_file="$status_log_dir/${agent_name}.log"
if [ -f "$log_file" ] && [ -s "$log_file" ]; then
local timestamp
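# BSD/macOS stat (-f/-t) runs first; the GNU stat (-c) fallback emits
# 'YYYY-MM-DD HH:MM:SS.nnnnnnnnn', which the cuts trim down to HH:MM.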
timestamp=$(stat -f "%Sm" -t "%H:%M" "$log_file" 2>/dev/null || stat -c "%y" "$log_file" 2>/dev/null | cut -d' ' -f2 | cut -d':' -f1-2)
echo -e " ${GREEN}✓${NC} ${phase_label} (${timestamp})"
completed=$((completed + 1))
else
echo -e " ${RED}○${NC} ${phase_label}"
fi
done
# Progress bar
local pct=0
if [ $total -gt 0 ]; then
pct=$((completed * 100 / total))
fi
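# Render a 20-cell bar: each █ cell represents 5% of the pipeline.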
local filled=$((pct / 5))
local empty=$((20 - filled))
local bar=""
for ((i=0; i<filled; i++)); do bar+="█"; done
for ((i=0; i<empty; i++)); do bar+="░"; done
echo ""
echo -e " ${CYAN}Progress:${NC} [${bar}] ${pct}% (${completed}/${total} phases)"
echo ""
# Show latest log activity
local latest_log=""
local latest_time=0
for entry in "${PHASE_ARTIFACTS[@]}"; do
IFS=':' read -r agent_name _ _ <<< "$entry"
local log_file="$status_log_dir/${agent_name}.log"
if [ -f "$log_file" ]; then
local mtime
mtime=$(stat -f "%m" "$log_file" 2>/dev/null || stat -c "%Y" "$log_file" 2>/dev/null)
if [ "$mtime" -gt "$latest_time" ] 2>/dev/null; then
latest_time=$mtime
latest_log=$log_file
fi
fi
done
if [ -n "$latest_log" ]; then
echo -e "${YELLOW}Latest activity:${NC} $(basename "$latest_log" .log)"
echo -e "${YELLOW}Last 5 lines:${NC}"
tail -5 "$latest_log" 2>/dev/null | sed 's/^/ /'
echo ""
fi
# Ship decision if exists
if [ -f "$TASKS_DIR/ship-decision.md" ]; then
echo -e "${CYAN}━━━ Ship Decision ━━━${NC}"
head -5 "$TASKS_DIR/ship-decision.md" | sed 's/^/ /'
echo ""
fi
}
agent_manager() {
local artifacts=""
for entry in "${PHASE_ARTIFACTS[@]}"; do
IFS=':' read -r _ artifact_file phase_label <<< "$entry"
local artifact="$TASKS_DIR/$artifact_file"
if [ -f "$artifact" ]; then
artifacts+=" - $phase_label: $artifact (exists)"$'\n'
else
artifacts+=" - $phase_label: (not started)"$'\n'
fi
done
run_agent "00-manager" "$CYAN" "
You are a technical project manager who gives clear, concise progress reports.
You read fast, summarize well, and always give an honest completion percentage.
Your task:
1. Read the focus at $TASKS_DIR/focus.md to understand the goal.
2. Read ALL existing task artifacts to assess progress:
- $TASKS_DIR/next-ticket.md (planning output)
- $TASKS_DIR/dev-done.md (development summary)
- $TASKS_DIR/review-findings.md (code review)
- $TASKS_DIR/qa-report.md (QA results)
- $TASKS_DIR/ux-audit.md (UX audit)
- $TASKS_DIR/security-audit.md (security audit)
- $TASKS_DIR/architecture-review.md (architecture review)
- $TASKS_DIR/hacker-report.md (hacker findings)
- $TASKS_DIR/ship-decision.md (final verdict)
Artifact status precomputed from disk, for quick orientation:
$artifacts
3. Check git status and recent commits to see what code changes exist.
4. Run a quick test check: are tests passing? Any lint errors?
Write a progress report to stdout in this exact format:
## Progress Report
**Focus:** [the focus area]
**Overall Completion:** [X]%
**Status:** [ON TRACK / AT RISK / BLOCKED]
### Phases Completed
- [List each completed phase with a 1-line summary of outcome]
### Current Phase
- [What phase is in progress or next, what remains]
### Key Findings So Far
- [Top 3-5 findings across all reports — bugs found, issues fixed, blockers]
### Blockers & Risks
- [Any blockers or risks, or 'None' if clear]
### Recommended Next Step
- [Which agent to run next and why]
Be concise. The whole report should fit on one screen.
"
}
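# The manager is report-only: it prints to stdout (plus the usual run log)
# and makes no git checkpoint, unlike the fix/audit agents.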
# ─── Pipeline Orchestration ─────────────────────────────────────────────────
run_full_pipeline() {
echo -e "${GREEN}"
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ 🚀 Multi-Agent Development Pipeline ║"
echo "║ Date: $DATE_STAMP ║"
echo "║ ║"
echo "║ Arch → Plan → Dev → [Review↔Fix] → Arch → Audits → Verify║"
echo "╚══════════════════════════════════════════════════════════════╝"
echo -e "${NC}"
setup
register_pipeline
trap 'graceful_shutdown SIGINT' INT
trap 'graceful_shutdown SIGTERM' TERM
# Create feature branch
git checkout -b "$BRANCH_NAME" 2>/dev/null || git checkout "$BRANCH_NAME"
# ── Stage 0: Architect Directives ──
echo -e "${PURPLE}━━━ Stage 0: Architect Directives ━━━${NC}"
agent_architect_directives
# ── Stage 1: Plan ──
echo -e "${PURPLE}━━━ Stage 1: Planning ━━━${NC}"
agent_plan
# ── Stage 2: Build ──
echo -e "${BLUE}━━━ Stage 2: Development ━━━${NC}"
agent_dev
git_checkpoint "wip: raw implementation"
# ── Stage 2b: Validate backend checks pass ──
dev_validate_loop 3
# ── Stage 3: Review-Fix Loop (max 3 rounds) ──
echo -e "${RED}━━━ Stage 3: Review-Fix Loop ━━━${NC}"
review_fix_loop 3
# ── Stage 4: QA-Fix Loop (max 2 rounds) ──
echo -e "${CYAN}━━━ Stage 4: QA-Fix Loop ━━━${NC}"
qa_fix_loop 2
# ── Stage 5: Audit Sweep ──
echo -e "${GREEN}━━━ Stage 5: Audit Sweep ━━━${NC}"
echo ""
echo -e "${GREEN} ▸ UX Audit${NC}"
agent_ux
git_checkpoint "wip: ux improvements"
echo -e "${RED} ▸ Security Audit${NC}"
agent_security
git_checkpoint "wip: security fixes"
echo -e "${PURPLE} ▸ Architecture Review${NC}"
agent_architect_review
git_checkpoint "wip: architecture improvements"
echo -e "${YELLOW} ▸ Hacker Bug Hunt${NC}"
agent_hacker
git_checkpoint "wip: hacker fixes"
# ── Stage 6: Verify-Fix Loop (max 2 rounds) ──
echo -e "${RED}━━━ Stage 6: Final Verification ━━━${NC}"
verify_fix_loop 2
# ── Ship Decision ──
if ship_verdict_is_pass; then
echo -e "${GREEN}"
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ ✅ VERDICT: SHIP ║"
echo "╚══════════════════════════════════════════════════════════════╝"
echo -e "${NC}"
git_checkpoint "feat: $DATE_STAMP daily feature"
agent_update_spec
git_checkpoint "docs: update spec and changelog"
# Merge to main
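# origin/HEAD names the remote's default branch; strip the ref prefix to get
# its short name, and fall back to 'main' when origin/HEAD isn't set locally.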
MAIN_BRANCH=$(git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's@^refs/remotes/origin/@@' || echo "main")
git checkout "$MAIN_BRANCH"
git merge "$BRANCH_NAME" --no-ff -m "feat: ship $DATE_STAMP — $(head -1 "$TASKS_DIR/next-ticket.md" | sed 's/# //')"
echo -e "${GREEN}✓ Merged to $MAIN_BRANCH and shipped!${NC}"
else
echo -e "${RED}"
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ 🚫 VERDICT: NO-SHIP ║"
echo "╚══════════════════════════════════════════════════════════════╝"
echo -e "${NC}"
git_checkpoint "wip: blocked — see tasks/ship-decision.md"
echo -e "${YELLOW}Branch '$BRANCH_NAME' preserved with all work.${NC}"
echo -e "${YELLOW}Check $TASKS_DIR/ship-decision.md for details.${NC}"
echo -e "${YELLOW}Fix issues and run './agent.sh verify' to re-evaluate.${NC}"
fi
# Summary
echo ""
echo -e "${CYAN}━━━ Pipeline Summary ━━━${NC}"
echo -e "Logs: $LOG_DIR/"
echo -e "Ticket: $TASKS_DIR/next-ticket.md"
echo -e "Review: $TASKS_DIR/review-findings.md"
echo -e "QA: $TASKS_DIR/qa-report.md"
echo -e "UX: $TASKS_DIR/ux-audit.md"
echo -e "Security: $TASKS_DIR/security-audit.md"
echo -e "Arch: $TASKS_DIR/architecture-review.md"
echo -e "Hacker: $TASKS_DIR/hacker-report.md"
echo -e "Verdict: $TASKS_DIR/ship-decision.md"
echo ""
}
run_from_stage() {
local start_stage="$1"
# Stages map to the coordinator groups, not individual agents;
# single-agent names (fix/ux/security/hacker) are accepted as aliases.
local valid_stages="arch plan dev review fix qa audits ux security hacker verify"
echo -e "${GREEN}"
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ 🚀 Pipeline (from: $start_stage) ║"
echo "║ Date: $DATE_STAMP ║"
echo "║ ║"
echo "║ Arch → Plan → Dev → [Review↔Fix] → Arch → Audits → Verify║"
echo "╚══════════════════════════════════════════════════════════════╝"
echo -e "${NC}"
setup
register_pipeline
trap 'graceful_shutdown SIGINT' INT
trap 'graceful_shutdown SIGTERM' TERM
# Use existing branch or create one
local current_branch
current_branch=$(git branch --show-current)
if [[ "$current_branch" == "main" || "$current_branch" == "dev" ]]; then
git checkout -b "$BRANCH_NAME" 2>/dev/null || git checkout "$BRANCH_NAME"
fi
local found=0
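# Fall-through dispatch: once the requested stage matches, 'found' flips to 1
# and stays 1, so every later stage also runs in pipeline order.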
# Stage: arch (directives)
if [ "$found" -eq 1 ] || [ "$start_stage" = "arch" ]; then
found=1
echo -e "${PURPLE}━━━ Stage 0: Architect Directives ━━━${NC}"
agent_architect_directives
fi
# Stage: plan
if [ "$found" -eq 1 ] || [ "$start_stage" = "plan" ]; then
found=1
echo -e "${PURPLE}━━━ Stage 1: Planning ━━━${NC}"
agent_plan
fi
# Stage: dev
if [ "$found" -eq 1 ] || [ "$start_stage" = "dev" ]; then
found=1
echo -e "${BLUE}━━━ Stage 2: Development ━━━${NC}"
agent_dev
git_checkpoint "wip: raw implementation"
dev_validate_loop 3
fi
# Stage: review (review-fix loop)
if [ "$found" -eq 1 ] || [ "$start_stage" = "review" ] || [ "$start_stage" = "fix" ]; then
found=1
echo -e "${RED}━━━ Stage 3: Review-Fix Loop ━━━${NC}"
review_fix_loop 3
fi
# Stage: qa (qa-fix loop)
if [ "$found" -eq 1 ] || [ "$start_stage" = "qa" ]; then
found=1
echo -e "${CYAN}━━━ Stage 4: QA-Fix Loop ━━━${NC}"
qa_fix_loop 2
fi
# Stage: audits (ux, security, arch-review, hacker)
if [ "$found" -eq 1 ] || [ "$start_stage" = "audits" ] || \
[ "$start_stage" = "ux" ] || [ "$start_stage" = "security" ] || \
[ "$start_stage" = "hacker" ]; then
found=1
echo -e "${GREEN}━━━ Stage 5: Audit Sweep ━━━${NC}"
echo ""
echo -e "${GREEN} ▸ UX Audit${NC}"
agent_ux
git_checkpoint "wip: ux improvements"
echo -e "${RED} ▸ Security Audit${NC}"
agent_security
git_checkpoint "wip: security fixes"
echo -e "${PURPLE} ▸ Architecture Review${NC}"
agent_architect_review
git_checkpoint "wip: architecture improvements"
echo -e "${YELLOW} ▸ Hacker Bug Hunt${NC}"
agent_hacker
git_checkpoint "wip: hacker fixes"
fi
# Stage: verify (verify-fix loop)
if [ "$found" -eq 1 ] || [ "$start_stage" = "verify" ]; then
found=1
echo -e "${RED}━━━ Stage 6: Final Verification ━━━${NC}"
verify_fix_loop 2
fi
if [ "$found" -eq 0 ]; then
echo -e "${RED}Unknown stage: $start_stage${NC}"
echo "Valid stages: arch, plan, dev, review, qa, audits, ux, security, hacker, verify"
exit 1
fi
# Summary
echo ""
echo -e "${CYAN}━━━ Pipeline Complete (from $start_stage) ━━━${NC}"
echo -e "Logs: $LOG_DIR/"
echo ""
}
run_quick() {
echo -e "${GREEN}"
echo "╔══════════════════════════════════════════════════════════════╗"
echo "║ ⚡ Quick 3-Agent Loop (Build → Review → Fix) ║"
echo "╚══════════════════════════════════════════════════════════════╝"
echo -e "${NC}"
setup
register_pipeline
trap 'graceful_shutdown SIGINT' INT
trap 'graceful_shutdown SIGTERM' TERM
agent_dev
git_checkpoint "wip: implementation"
dev_validate_loop 3
review_fix_loop 2
echo -e "${GREEN}✓ Quick cycle complete. Run './agent.sh from qa' for deeper checks.${NC}"
}
# ─── Interactive Mode ────────────────────────────────────────────────────────
prompt_for_focus() {
# All display output goes to stderr so command substitution only captures the choice
echo -e "${CYAN}" >&2
echo "╔══════════════════════════════════════════════════════════════╗" >&2
echo "║ 🤖 Multi-Agent Development Pipeline ║" >&2
echo "╚══════════════════════════════════════════════════════════════╝" >&2
echo -e "${NC}" >&2
echo "" >&2
echo -e "${YELLOW}What would you like to work on today?${NC}" >&2
echo "" >&2
echo "Examples:" >&2
echo " - Fix security vulnerabilities from last review" >&2
echo " - Implement user authentication feature" >&2
echo " - Improve performance of the dashboard" >&2
echo " - Add tests for the payment flow" >&2
echo " - Review and refactor the API layer" >&2
echo "" >&2
echo -n "Your focus: " >&2
read -r focus_input
echo "" >&2
if [ -z "$focus_input" ]; then
echo -e "${RED}Focus cannot be empty.${NC}" >&2
exit 1
fi
# Save focus to file for agents to reference (setup hasn't run yet at this
# point, so make sure the tasks directory exists first)
mkdir -p "$TASKS_DIR"
cat > "$TASKS_DIR/focus.md" << EOF
# Current Focus
**Date:** $DATE_STAMP
**Focus:** $focus_input
## Context
This focus was set at the start of the agent pipeline run. All agents should
prioritize work related to this focus area.
## Priority
Tasks and issues directly related to this focus should be addressed first.
Other improvements can be made, but the primary goal is to make progress on
this specific focus area.
EOF
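# focus.md is read back by show_status, agent_manager, and the focus-aware
# agents (e.g. the hacker agent gives the focus area extra scrutiny).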
echo -e "${GREEN}✓ Focus saved to $TASKS_DIR/focus.md${NC}" >&2
echo "" >&2
echo "Select an agent or pipeline to run:" >&2
echo "" >&2
echo -e "${GREEN} 1${NC}) full - Run the complete pipeline (Arch → Plan → Dev → Review → Arch → Audits → Verify)" >&2
echo -e "${GREEN} 2${NC}) quick - Run the minimal 3-agent loop (build → review → fix)" >&2
echo -e "${GREEN} 3${NC}) arch - Run only the architect agent (writes directives)" >&2
echo -e "${GREEN} 4${NC}) plan - Run only the product planning agent" >&2
echo -e "${GREEN} 5${NC}) dev - Run only the development agent" >&2
echo -e "${GREEN} 6${NC}) review - Run only the code review agent" >&2
echo -e "${GREEN} 7${NC}) fix - Run only the fix agent" >&2
echo -e "${GREEN} 8${NC}) qa - Deep coverage mode: hunt untested code across codebase" >&2
echo -e "${GREEN} 9${NC}) ux - Run only the UX audit agent" >&2
echo -e "${GREEN}10${NC}) security - Run only the security audit agent" >&2
echo -e "${GREEN}11${NC}) hacker - Run only the hacker/bug-hunter agent" >&2
echo -e "${GREEN}12${NC}) verify - Run only the final verification gate" >&2
echo "" >&2
echo -e "${CYAN}13${NC}) status - Show pipeline progress (instant, no AI)" >&2
echo -e "${CYAN}14${NC}) manager - AI progress report with completion %" >&2
echo "" >&2
echo -e "${RED}15${NC}) abort - Gracefully stop a running pipeline" >&2
echo "" >&2
echo -n "Enter your choice (1-15 or name): " >&2
read -r choice
echo "" >&2
case "$choice" in
1|full) echo "full" ;;
2|quick) echo "quick" ;;
3|arch) echo "arch" ;;
4|plan) echo "plan" ;;
5|dev) echo "dev" ;;
6|review) echo "review" ;;
7|fix) echo "fix" ;;
8|qa) echo "qa" ;;
9|ux) echo "ux" ;;
10|security) echo "security" ;;
11|hacker) echo "hacker" ;;
12|verify) echo "verify" ;;
13|status) echo "status" ;;
14|manager) echo "manager" ;;
15|abort) echo "abort" ;;
*)
echo -e "${RED}Invalid choice: $choice${NC}" >&2
echo "Run './agent.sh help' for usage." >&2
exit 1
;;
esac
}
# ─── CLI Router ──────────────────────────────────────────────────────────────
# If no argument provided, prompt for focus
if [ $# -eq 0 ]; then
COMMAND=$(prompt_for_focus)
SUBCOMMAND=""
else
COMMAND="$1"
SUBCOMMAND="${2:-}"
fi
# Helper to register + trap for single-agent CLI runs
run_single_agent() {
setup
register_pipeline
trap 'graceful_shutdown SIGINT' INT
trap 'graceful_shutdown SIGTERM' TERM
}
case "$COMMAND" in
arch) run_single_agent && agent_architect_directives ;;
plan) run_single_agent && agent_plan ;;
dev) run_single_agent && agent_dev && git_checkpoint "wip: dev agent" && dev_validate_loop 3 ;;
review) run_single_agent && agent_review ;;
fix) run_single_agent && agent_fix && git_checkpoint "wip: fixes applied" ;;
qa) run_single_agent && agent_qa_deep && git_checkpoint "wip: qa deep coverage" ;;
ux) run_single_agent && agent_ux && git_checkpoint "wip: ux improvements" ;;
security) run_single_agent && agent_security && git_checkpoint "wip: security fixes" ;;
hacker) run_single_agent && agent_hacker && git_checkpoint "wip: hacker fixes" ;;
verify) run_single_agent && agent_verify ;;
quick) run_quick ;;
full) run_full_pipeline ;;
from)
if [ -z "$SUBCOMMAND" ]; then
echo -e "${RED}Usage: ./agent.sh from <stage>${NC}"
echo "Stages: arch, plan, dev, review, fix, qa, ux, security, hacker, verify"
exit 1
fi
run_from_stage "$SUBCOMMAND"
;;
abort) do_abort ;;
status) show_status ;;
manager) setup && agent_manager ;;
help|-h|--help)
echo "Usage: ./agent.sh [command]"
echo ""
echo "Pipeline:"
echo " full Run the full pipeline with coordinator loops"
echo " from <stage> Resume pipeline from a stage (arch/plan/dev/review/qa/audits/verify)"
echo " quick Quick loop: dev → [review↔fix]"
echo ""
echo " Flow: Arch → Plan → Dev → [Review↔Fix x3] → Arch → Audits → [Verify↔Fix x2]"
echo ""
echo "Single agents:"
echo " arch Architect — writes directives (pre-dev) or reviews (post-dev)"
echo " plan Product planning agent"
echo " dev Development agent"
echo " review Code review agent"
echo " fix Fix agent"
echo " qa QA deep coverage (standalone: hunts untested code across codebase)"
echo " ux UX audit agent"
echo " security Security audit agent"
echo " hacker Hacker/bug-hunter agent"
echo " verify Final verification gate"
echo ""
echo "Control:"
echo " abort Gracefully stop the running pipeline"
echo " status Show pipeline progress (instant, no AI)"
echo " manager AI progress report with completion %"
echo " help Show this help message"
echo ""
echo "If no command is provided, an interactive menu will be shown."
;;
*)
echo -e "${RED}Unknown command: $COMMAND${NC}"
echo "Run './agent.sh help' for usage."
exit 1
;;
esac