Viaduct skill evaluation harness
evaluations.json

[
  {
    "id": "eval-01-field-resolver",
    "name": "Simple Field Resolver",
    "skills": ["viaduct-field-resolver"],
    "schema": "type Group implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n description: String\n}",
    "query": "I want to show a member count on each Group. Add a memberCount field that returns 42 for now.",
    "expected_behavior": [
      "Adds 'memberCount: Int! @resolver' to the Group type in schema",
      "Creates resolver class extending GroupResolvers.MemberCount()",
      "Uses @Resolver annotation"
    ],
    "verify_patterns": [
      "memberCount: Int!? @resolver",
      "GroupResolvers.MemberCount"
    ]
  },
  {
    "id": "eval-02-node-resolver",
    "name": "Node Type and Query Resolver",
    "skills": ["viaduct-node-type", "viaduct-query-resolver"],
    "schema": "type Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n color: String!\n}\n\nextend type Query {\n tag(id: ID! @idOf(type: \"Tag\")): Tag @resolver\n}",
    "query": "I need to be able to fetch a Tag by its ID. Implement the resolvers so I can query for a single tag. Return hardcoded data for now.",
    "expected_behavior": [
      "Creates TagNodeResolver extending NodeResolvers.Tag()",
      "Creates TagQueryResolver extending QueryResolvers.Tag()",
      "Uses ctx.arguments.id.internalID to get the UUID"
    ],
    "verify_patterns": [
      "NodeResolvers.Tag",
      "QueryResolvers.Tag",
      "internalID"
    ]
  },
  {
    "id": "eval-03-required-selection-set",
    "name": "Field Resolver with Required Selection Set",
    "skills": ["viaduct-field-resolver"],
    "schema": "type GroupMember implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n userId: String!\n role: String!\n}",
    "query": "Add a displayName field to GroupMember that shows 'User: ' followed by the userId. It needs to read the userId from the parent object.",
    "expected_behavior": [
      "Adds 'displayName: String @resolver' to GroupMember type",
      "Creates resolver with @Resolver annotation specifying objectValueFragment",
      "Extends GroupMemberResolvers.DisplayName()",
      "Accesses userId via ctx.objectValue.getUserId()"
    ],
    "verify_patterns": [
      "displayName: String!? @resolver",
      "GroupMemberResolvers.DisplayName",
      "objectValueFragment|fragment .* on GroupMember",
      "getUserId"
    ]
  },
  {
    "id": "eval-04-batch-resolution",
    "name": "Batch Resolution for N+1 Prevention",
    "skills": ["viaduct-batch"],
    "schema": "type Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n}\n\ntype Group implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n tags: [Tag!]! @resolver\n}",
    "query": "Implement the tags field on Group. Each group should return a list of tags. Make sure it handles multiple groups efficiently without N+1 queries. Return hardcoded tags for now.",
    "expected_behavior": [
      "Creates GroupTagsResolver extending GroupResolvers.Tags()",
      "Implements batchResolve with List<Context> parameter",
      "Returns List<FieldValue<List<Tag>>>",
      "Uses FieldValue.ofValue() for results"
    ],
    "verify_patterns": [
      "GroupResolvers.Tags",
      "batchResolve",
      "FieldValue"
    ]
  },
  {
    "id": "eval-05-mutations-globalid",
    "name": "Mutations with GlobalID Handling",
    "skills": ["viaduct-mutations"],
    "schema": "type Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n color: String!\n}\n\ninput CreateTagInput {\n name: String!\n color: String!\n}\n\ninput UpdateTagInput {\n id: ID! @idOf(type: \"Tag\")\n name: String\n color: String\n}\n\nextend type Mutation {\n createTag(input: CreateTagInput!): Tag! @resolver\n updateTag(input: UpdateTagInput!): Tag @resolver\n deleteTag(id: ID! @idOf(type: \"Tag\")): Boolean! @resolver\n}",
    "query": "Implement the tag mutations: createTag, updateTag, and deleteTag. Return mock data for now.",
    "expected_behavior": [
      "Creates CreateTagMutationResolver extending MutationResolvers.CreateTag()",
      "Creates UpdateTagMutationResolver extending MutationResolvers.UpdateTag()",
      "Creates DeleteTagMutationResolver extending MutationResolvers.DeleteTag()",
      "Uses input.id.internalID for update/delete",
      "Uses ctx.globalIDFor when building Tag response"
    ],
    "verify_patterns": [
      "MutationResolvers.CreateTag",
      "MutationResolvers.UpdateTag",
      "MutationResolvers.DeleteTag",
      "internalID",
      "globalIDFor"
    ]
  },
  {
    "id": "eval-06-scopes",
    "name": "Scope-based API Visibility",
    "skills": ["viaduct-scopes"],
    "schema": "type Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n}",
    "query": "Add admin-only operations for Tag: a way to get/set internalNotes, get usageCount, and delete all tags. These should only be visible to admin API consumers, not the default scope.",
    "expected_behavior": [
      "Creates 'extend type Mutation @scope(to: [\"admin\"])' for admin mutations",
      "Uses mutations instead of fields (extend type Tag doesn't work)",
      "Includes deleteAllTags mutation"
    ],
    "verify_patterns": [
      "@scope\\(to: \\[\"admin\"\\]\\)",
      "extend type Mutation",
      "deleteAllTags",
      "MutationResolvers"
    ]
  },
  {
    "id": "eval-07-entity-relationships",
    "name": "Entity Relationships with Node References",
    "skills": ["viaduct-relationships"],
    "schema": "type User implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n email: String!\n}\n\ntype Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n createdById: String!\n createdBy: User @resolver\n}",
    "query": "Implement the createdBy field on Tag so it returns the User who created it. The createdById field has the user's ID. Also implement basic node resolvers for User and Tag.",
    "expected_behavior": [
      "Creates TagCreatedByResolver with objectValueFragment including createdById",
      "Returns ctx.nodeFor(ctx.globalIDFor(User.Reflection, createdById))",
      "Does NOT fetch user data directly"
    ],
    "verify_patterns": [
      "TagResolvers.CreatedBy",
      "nodeFor",
      "globalIDFor.*User"
    ]
  },
  {
    "id": "gotcha-01-missing-idof-input",
    "name": "Gotcha: Input Type Missing @idOf",
    "skills": ["viaduct-mutations"],
    "schema": "type Task implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n title: String!\n completed: Boolean!\n}\n\ninput UpdateTaskInput {\n id: ID!\n title: String\n completed: Boolean\n}\n\nextend type Mutation {\n updateTask(input: UpdateTaskInput!): Task! @resolver\n}",
    "query": "Implement the updateTask mutation. The resolver MUST extract the task ID from input.id and log it. Return mock data using that ID.",
    "expected_behavior": [
      "Recognizes @idOf is missing from UpdateTaskInput.id",
      "Adds @idOf(type: \"Task\") to the id field in schema",
      "Uses input.id.internalID in the resolver",
      "Does NOT manually decode Base64"
    ],
    "verify_patterns": [
      "@idOf\\(type: \"Task\"\\)",
      "MutationResolvers.UpdateTask",
      "internalID"
    ],
    "negative_patterns": [
      "Base64\\.getDecoder",
      "Base64\\.decode",
      "substringAfter\\(\":\"\\)"
    ]
  },
  {
    "id": "gotcha-02-missing-idof-query-arg",
    "name": "Gotcha: Query Argument Missing @idOf",
    "skills": ["viaduct-query-resolver"],
    "schema": "type Project implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n description: String\n}\n\nextend type Query {\n project(id: ID!): Project @resolver\n}",
    "query": "I need to fetch a single project by ID. Implement the project query resolver. Return hardcoded data.",
    "expected_behavior": [
      "Recognizes @idOf is missing from query argument",
      "Adds @idOf(type: \"Project\") to the id argument",
      "Uses ctx.arguments.id.internalID in the resolver",
      "Does NOT manually decode Base64"
    ],
    "verify_patterns": [
      "project\\(id: ID! @idOf\\(type: \"Project\"\\)\\)",
      "QueryResolvers.Project",
      "internalID"
    ],
    "negative_patterns": [
      "Base64\\.getDecoder",
      "Base64\\.decode",
      "substringAfter\\(\":\"\\)"
    ]
  },
  {
    "id": "gotcha-03-globalid-response",
    "name": "Gotcha: Building GlobalID for Response",
    "skills": ["viaduct-mutations"],
    "schema": "type Item implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n quantity: Int!\n}\n\ninput CreateItemInput {\n name: String!\n quantity: Int!\n}\n\nextend type Mutation {\n createItem(input: CreateItemInput!): Item! @resolver\n}",
    "query": "Implement the createItem mutation. Generate a UUID for the new item and return it with the input values.",
    "expected_behavior": [
      "Uses ctx.globalIDFor(Item.Reflection, uuid) to create GlobalID",
      "Does NOT pass raw UUID string to .id() method",
      "Uses Item.Builder pattern"
    ],
    "verify_patterns": [
      "MutationResolvers.CreateItem",
      "globalIDFor\\(Item\\.Reflection",
      "Item\\.Builder"
    ],
    "negative_patterns": [
      "\\.id\\([a-z]+Id\\)",
      "\\.id\\(uuid\\)",
      "\\.id\\(UUID\\."
    ]
  }
]
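
For orientation, here is roughly what a passing submission looks like for two of the patterns these evals check, sketched in Kotlin. Both sketches assume Viaduct's generated scaffolding (the GroupResolvers/MutationResolvers base classes, the Context type, and the GRT builders); the exact names and signatures are inferred from the verify patterns above, not confirmed API.

eval-01, a simple field resolver:

    // Sketch of a passing eval-01 solution (assumed signatures).
    // GroupResolvers.MemberCount is generated by viaductCodegen.
    @Resolver
    class GroupMemberCountResolver : GroupResolvers.MemberCount() {
        // Hardcoded per the eval prompt; a real resolver would compute the count.
        override suspend fun resolve(ctx: Context): Int = 42
    }

gotcha-03, building a GlobalID for the response (the negative patterns reject any raw UUID string passed to .id()):

    // Sketch of the GlobalID pattern checked by eval-05 and gotcha-03
    // (assumed signatures for Item.Builder and ctx.globalIDFor).
    @Resolver
    class CreateItemMutationResolver : MutationResolvers.CreateItem() {
        override suspend fun resolve(ctx: Context): Item {
            val input = ctx.arguments.input
            val uuid = java.util.UUID.randomUUID().toString()
            return Item.Builder(ctx)
                .id(ctx.globalIDFor(Item.Reflection, uuid)) // not .id(uuid)
                .name(input.name)
                .quantity(input.quantity)
                .build()
        }
    }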
run-evaluations.sh

#!/bin/bash
#
# Viaduct Skill Evaluation Harness (Parallel Edition)
#
# Runs evaluations against the viaduct skill.
# Each evaluation:
#   1. Copies base-template to a unique temp directory
#   2. Appends eval-specific schema types
#   3. Runs Gradle to generate scaffolding
#   4. Runs an AI agent (Claude CLI or Crush) to implement the feature
#   5. Builds and verifies patterns
#
# Usage:
#   ./run-evaluations.sh [options] [eval-id]
#
# Options:
#   --no-skill     Run without the viaduct skill (baseline test)
#   --skill        Run with the viaduct skill (default)
#   --compare      Run with and without the skill, then show a side-by-side comparison
#   --clean        Remove all previous eval outputs before starting
#   --parallel=N   Run N evaluations in parallel (default: 10 for Crush, 4 for Claude)
#   --sequential   Run evaluations one at a time (--parallel=1)
#   --backend=X    Use 'crush' (default) or 'claude' as the AI backend
#
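# Examples (eval IDs come from evaluations.json; these are illustrative):
#   ./run-evaluations.sh --clean --compare            # skill vs. baseline, fresh outputs
#   ./run-evaluations.sh --backend=claude eval-01     # single eval on the Claude backend
#   MAX_RETRIES=1 ./run-evaluations.sh --sequential   # serial run, no build-fix retries
#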
# Environment:
#   MAX_RETRIES=3     Set max retry attempts
#   MAX_PARALLEL=10   Set max parallel evaluations (default: 10 for Crush, 4 for Claude)
#
# Output:
#   .eval-outputs/<eval-id>-agent.txt    Agent's final response
#   .eval-outputs/<eval-id>-build.txt    Gradle build output
#   .eval-outputs/<eval-id>-errors.txt   Error summary
#   .eval-outputs/<eval-id>-workspace/   Full workspace (preserved on failure or retry)
#
# Backends:
#   crush  - Charmbracelet Crush (~165 MB/process, default, requires crush CLI)
#            Requires CATWALK_URL=http://localhost:1 to use cached providers
#   claude - Claude CLI (~800 MB/process, requires claude CLI)
#
set -o pipefail

# Clean up all child processes on exit
cleanup() {
  local pids=$(jobs -p 2>/dev/null)
  if [[ -n "$pids" ]]; then
    echo -e "\nCleaning up child processes..."
    kill $pids 2>/dev/null
    sleep 2
    kill -9 $pids 2>/dev/null
  fi
}
trap cleanup EXIT INT TERM

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
EVAL_FILE="$SCRIPT_DIR/evaluations.json"
OUTPUT_DIR="$SCRIPT_DIR/.eval-outputs"
BASE_TEMPLATE="$SCRIPT_DIR/base-template"
WORK_BASE="/tmp/viaduct-skill-eval"

# Default settings
USE_SKILL=1
CLEAN=0
COMPARE=0
FILTER=""
MAX_RETRIES="${MAX_RETRIES:-3}"
EVAL_TIMEOUT="${EVAL_TIMEOUT:-600}"  # 10 minutes per evaluation
BACKEND="${BACKEND:-crush}"
# MAX_PARALLEL default depends on backend (set after parsing args)

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Parse arguments
EXPLICIT_PARALLEL=""
while [[ $# -gt 0 ]]; do
  case $1 in
    --no-skill)
      USE_SKILL=0
      shift
      ;;
    --skill)
      USE_SKILL=1
      shift
      ;;
    --parallel=*)
      EXPLICIT_PARALLEL="${1#*=}"
      shift
      ;;
    --sequential)
      EXPLICIT_PARALLEL=1
      shift
      ;;
    --backend=*)
      BACKEND="${1#*=}"
      shift
      ;;
    --crush)
      BACKEND="crush"
      shift
      ;;
    --claude)
      BACKEND="claude"
      shift
      ;;
    --clean)
      CLEAN=1
      shift
      ;;
    --compare)
      COMPARE=1
      shift
      ;;
    *)
      FILTER="$1"
      shift
      ;;
  esac
done

# Set MAX_PARALLEL based on backend (Crush uses less memory, so it can run more in parallel)
if [[ -n "$EXPLICIT_PARALLEL" ]]; then
  MAX_PARALLEL="$EXPLICIT_PARALLEL"
elif [[ -n "$MAX_PARALLEL" ]]; then
  :  # Use environment variable
elif [[ "$BACKEND" == "crush" ]]; then
  MAX_PARALLEL=10
else
  MAX_PARALLEL=4
fi

mkdir -p "$OUTPUT_DIR"

check_deps() {
  local missing=0
  command -v jq &>/dev/null || { echo -e "${RED}Error: jq required${NC}"; missing=1; }
  command -v java &>/dev/null || { echo -e "${RED}Error: java 17+ required${NC}"; missing=1; }
  [[ ! -d "$BASE_TEMPLATE" ]] && { echo -e "${RED}Error: base-template not found at $BASE_TEMPLATE${NC}"; missing=1; }
  if [[ "$BACKEND" == "crush" ]]; then
    command -v crush &>/dev/null || { echo -e "${RED}Error: crush CLI required (brew install charmbracelet/tap/crush)${NC}"; missing=1; }
  else
    command -v claude &>/dev/null || { echo -e "${RED}Error: claude CLI required${NC}"; missing=1; }
  fi
  [[ $missing -eq 1 ]] && exit 1
}

# Timer helper (currently unused)
time_cmd() {
  local start=$(date +%s)
  "$@"
  local end=$(date +%s)
  echo $((end - start))
}

# Kill a process and all its children
kill_tree() {
  local pid="$1"
  local signal="${2:-TERM}"
  # Kill children first (recursing via pgrep -P), then the process itself
  local children=$(pgrep -P "$pid" 2>/dev/null)
  for child in $children; do
    kill_tree "$child" "$signal"
  done
  kill -"$signal" "$pid" 2>/dev/null || true
}

# Run a function with a timeout. Kills the process tree if it exceeds the limit.
# Usage: run_with_timeout <timeout_secs> <function> [args...]
run_with_timeout() {
  local timeout="$1"
  shift
  # Run the function in the background so we get a single PID to track
  "$@" &
  local cmd_pid=$!
  # Watchdog: sleep then kill if still running
  (
    sleep "$timeout"
    if kill -0 "$cmd_pid" 2>/dev/null; then
      echo -e "${RED}[$(date +%H:%M:%S)] TIMEOUT: killing evaluation (exceeded ${timeout}s)${NC}" >&2
      kill_tree "$cmd_pid" TERM
      sleep 5
      # Force kill if still alive
      if kill -0 "$cmd_pid" 2>/dev/null; then
        kill_tree "$cmd_pid" KILL
      fi
    fi
  ) &
  local watchdog_pid=$!
  # Wait for the command to finish (either naturally or killed)
  wait "$cmd_pid" 2>/dev/null
  local exit_code=$?
  # Clean up the watchdog
  kill "$watchdog_pid" 2>/dev/null
  wait "$watchdog_pid" 2>/dev/null
  return $exit_code
}

# Pre-warm Gradle daemon and download dependencies
prewarm_gradle() {
  echo "Pre-warming Gradle daemon and cache..."
  local prewarm_dir="$WORK_BASE-prewarm"
  local start=$(date +%s)
  rm -rf "$prewarm_dir"
  cp -r "$BASE_TEMPLATE" "$prewarm_dir"
  # Run a build to warm up the daemon and cache dependencies
  if (cd "$prewarm_dir" && ./gradlew viaductCodegen classes --daemon -q 2>&1); then
    local end=$(date +%s)
    echo -e "${GREEN}Gradle daemon warmed up${NC} ($(( end - start ))s)"
  else
    echo -e "${YELLOW}Warning: Gradle prewarm had issues, continuing anyway${NC}"
  fi
  rm -rf "$prewarm_dir"
}

setup_project() {
  local work_dir="$1"
  local schema_addition="$2"
  local eval_id="$3"
  # Clean and copy base template
  rm -rf "$work_dir"
  cp -r "$BASE_TEMPLATE" "$work_dir"
  # Append schema types for this evaluation
  if [[ -n "$schema_addition" ]]; then
    echo "" >> "$work_dir/src/main/viaduct/schema/Schema.graphqls"
    echo "$schema_addition" >> "$work_dir/src/main/viaduct/schema/Schema.graphqls"
  fi
  # Generate scaffolding with Gradle (using daemon for speed)
  if ! (cd "$work_dir" && ./gradlew viaductCodegen --daemon -q 2>&1); then
    return 1
  fi
  # Install AGENTS.md with doc references if in skill mode
  if [[ $USE_SKILL -eq 1 ]]; then
    local install_output
    install_output=$(cd "$work_dir" && node "$SCRIPT_DIR/../bin/install.js" 2>&1)
    local install_exit=$?
    if [[ $install_exit -ne 0 ]]; then
      echo -e "${RED}Warning: skill install failed for $eval_id (exit $install_exit)${NC}" >&2
      echo "$install_output" >&2
    fi
    # Verify docs were actually installed
    if [[ ! -f "$work_dir/AGENTS.md" && ! -f "$work_dir/CLAUDE.md" ]]; then
      echo -e "${RED}Warning: no AGENTS.md or CLAUDE.md found in $work_dir after install${NC}" >&2
    fi
    if [[ ! -d "$work_dir/.viaduct/agents" ]]; then
      echo -e "${RED}Warning: .viaduct/agents/ directory not created in $work_dir${NC}" >&2
    fi
  fi
  return 0
}

# Detect if we're using the internal gateway or the direct Anthropic API.
# Sets USE_GATEWAY=1 if using the internal gateway, 0 if using the direct API.
detect_auth_mode() {
  if [[ -n "$ANTHROPIC_API_KEY" ]]; then
    # Direct Anthropic API key provided
    USE_GATEWAY=0
  elif command -v iap-auth &>/dev/null; then
    # Internal gateway via iap-auth
    USE_GATEWAY=1
  else
    echo -e "${RED}Error: No authentication configured.${NC}"
    echo "Set ANTHROPIC_API_KEY for direct Anthropic API access,"
    echo "or ensure iap-auth is available for internal gateway access."
    exit 1
  fi
  export USE_GATEWAY
}

# Setup Crush environment for the internal gateway (modifies model IDs)
setup_crush_for_gateway() {
  # Crush auto-updates providers from remote, which overwrites local changes.
  # We need to modify the cached providers.json to use gateway model IDs.
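  # For example (per the mapping in the Python below): 'claude-sonnet-4-5-20250929'
  # becomes 'global.anthropic.claude-sonnet-4-5-20250929-v1:0'.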
  local providers_file="$HOME/.local/share/crush/providers.json"
  if [[ ! -f "$providers_file" ]]; then
    echo -e "${YELLOW}Warning: Crush providers.json not found, running crush once to initialize...${NC}"
    CATWALK_URL="http://localhost:1" crush models > /dev/null 2>&1 || true
  fi
  if [[ -f "$providers_file" ]]; then
    # Update Anthropic provider model IDs to match gateway format
    python3 << 'PYEOF' 2>/dev/null || true
import json, os
filepath = os.path.expanduser('~/.local/share/crush/providers.json')
if not os.path.exists(filepath):
    exit(0)
with open(filepath, 'r') as f:
    data = json.load(f)
modified = False
for provider in data:
    if provider.get('id') == 'anthropic':
        for model in provider.get('models', []):
            old_id = model['id']
            if not old_id.startswith('global.'):
                model['id'] = f"global.anthropic.{old_id}-v1:0"
                modified = True
        if not provider.get('default_large_model_id', '').startswith('global.'):
            provider['default_large_model_id'] = 'global.anthropic.claude-sonnet-4-5-20250929-v1:0'
            provider['default_small_model_id'] = 'global.anthropic.claude-haiku-4-5-20251001-v1:0'
            modified = True
        break
if modified:
    with open(filepath, 'w') as f:
        json.dump(data, f, separators=(',', ':'))
PYEOF
  fi
}

# Run prompt with Claude CLI
run_with_claude() {
  local work_dir="$1"
  local prompt="$2"
  local output_file="$3"
  if [[ -n "$ANTHROPIC_API_KEY" ]]; then
    claude -p "$prompt" \
      --dangerously-skip-permissions \
      --no-session-persistence \
      "$work_dir" >> "$output_file" 2>&1 || true
  elif command -v iap-auth &>/dev/null; then
    local auth_token
    auth_token=$(iap-auth https://devaigateway.a.musta.ch 2>/dev/null)
    if [[ -z "$auth_token" ]]; then
      return 1
    fi
    CLAUDE_CODE_USE_BEDROCK=1 \
    ANTHROPIC_BEDROCK_BASE_URL="https://devaigateway.a.musta.ch/bedrock" \
    CLAUDE_CODE_SKIP_BEDROCK_AUTH=1 \
    ANTHROPIC_AUTH_TOKEN="$auth_token" \
    claude -p "$prompt" \
      --dangerously-skip-permissions \
      --no-session-persistence \
      "$work_dir" >> "$output_file" 2>&1 || true
  else
    return 1
  fi
  return 0
}

# Run prompt with Crush
run_with_crush() {
  local work_dir="$1"
  local prompt="$2"
  local output_file="$3"
  (
    cd "$work_dir"
    if [[ "$USE_GATEWAY" -eq 1 ]]; then
      # Internal gateway mode: use iap-auth token and gateway endpoint
      local auth_token
      auth_token=$(iap-auth https://devaigateway.a.musta.ch 2>/dev/null)
      if [[ -z "$auth_token" ]]; then
        echo "Failed to get iap-auth token" >> "$output_file"
        return 1
      fi
      ANTHROPIC_API_KEY="$auth_token" \
      ANTHROPIC_API_ENDPOINT="https://devaigateway.a.musta.ch" \
      CATWALK_URL="http://localhost:1" \
      crush run "$prompt" >> "$output_file" 2>&1
    else
      # Direct Anthropic API mode: use ANTHROPIC_API_KEY directly.
      # CATWALK_URL blocks remote provider updates to keep config stable.
      CATWALK_URL="http://localhost:1" \
      crush run "$prompt" >> "$output_file" 2>&1
    fi
  ) || true
  return 0
}

# Run prompt with selected backend
run_agent() {
  local work_dir="$1"
  local prompt="$2"
  local output_file="$3"
  if [[ "$BACKEND" == "crush" ]]; then
    run_with_crush "$work_dir" "$prompt" "$output_file"
  else
    run_with_claude "$work_dir" "$prompt" "$output_file"
  fi
}

# Extract the key error from build output
extract_error_summary() {
  local build_output="$1"
  if grep -q "Unresolved reference" "$build_output" 2>/dev/null; then
    grep "Unresolved reference" "$build_output" | head -1 | sed 's/.*: //'
  elif grep -q "cannot find symbol" "$build_output" 2>/dev/null; then
    grep "cannot find symbol" "$build_output" | head -1
  elif grep -q "not found" "$build_output" 2>/dev/null; then
    grep -E "not found|Not found" "$build_output" | head -1 | sed 's/.*: //'
  elif grep -q "expected" "$build_output" 2>/dev/null; then
    grep "expected" "$build_output" | head -1
  elif grep -q "error:" "$build_output" 2>/dev/null; then
    grep "error:" "$build_output" | head -1 | sed 's/.*error: //'
  else
    tail -3 "$build_output" | head -1
  fi
}

# Run a single evaluation (can be called in parallel).
# Writes the result to $OUTPUT_DIR/<eval_id>.result
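# Result line formats (pipe-delimited; retry errors are ';'-separated):
#   PASS|<eval-id>|<attempts>|<retry-errors>|<timing>
#   FAIL|<eval-id>|<attempts>|<reason>|<retry-errors>|<timing>
#   FAIL|<eval-id>|0|SETUP_FAILED    (early setup/auth failures omit trailing fields)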
run_evaluation() {
  local eval_id="$1"
  local eval_name="$2"
  local eval_query="$3"
  local verify_patterns="$4"
  local schema_addition="$5"
  local negative_patterns="$6"
  local suffix=$([[ $USE_SKILL -eq 0 ]] && echo "-noskill" || echo "")
  local backend_suffix=$([[ "$BACKEND" == "crush" ]] && echo "-crush" || echo "")
  # Unique workspace for this evaluation (includes suffixes to avoid conflicts)
  local work_dir="$WORK_BASE-$eval_id$suffix$backend_suffix"
  local agent_output="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-agent.txt"
  local build_output="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-build.txt"
  local errors_file="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-errors.txt"
  local result_file="$OUTPUT_DIR/$eval_id$suffix$backend_suffix.result"
  # Log start
  local eval_start=$(date +%s)
  echo "[$(date +%H:%M:%S)] Starting: $eval_id"
  # Set up a fresh project with the schema for this eval
  local setup_start=$(date +%s)
  if ! setup_project "$work_dir" "$schema_addition" "$eval_id"; then
    echo "FAIL|$eval_id|0|SETUP_FAILED" > "$result_file"
    echo "[$(date +%H:%M:%S)] $eval_id: SETUP FAILED"
    return 1
  fi
  local setup_end=$(date +%s)
  local setup_time=$((setup_end - setup_start))
  # Clear errors file
  > "$errors_file"
  # Build the query - remove the skill reference in no-skill mode
  local full_query
  if [[ $USE_SKILL -eq 1 ]]; then
    full_query="Work ONLY in $work_dir. Implement:
$eval_query"
  else
    local clean_query="${eval_query//Use the viaduct skill for guidance./}"
    clean_query="${clean_query//Use the viaduct skill for guidance/}"
    full_query="Work ONLY in $work_dir. Implement:
$clean_query"
  fi
  # Run AI agent
  local agent_start=$(date +%s)
  > "$agent_output"  # Clear output file
  if ! run_agent "$work_dir" "$full_query" "$agent_output"; then
    echo "FAIL|$eval_id|0|AUTH_FAILED" > "$result_file"
    echo "[$(date +%H:%M:%S)] $eval_id: AUTH FAILED"
    return 1
  fi
  local agent_end=$(date +%s)
  local agent_time=$((agent_end - agent_start))
  # Build-and-fix loop
  local build_start=$(date +%s)
  local build_success=0
  local attempt=1
  local retry_errors=""
  while [[ $attempt -le $MAX_RETRIES ]]; do
    # Run viaductCodegen first in case the agent modified the schema
    if (cd "$work_dir" && ./gradlew viaductCodegen classes --daemon -q > "$build_output" 2>&1); then
      build_success=1
      break
    else
      local error_summary=$(extract_error_summary "$build_output")
      echo "Attempt $attempt: $error_summary" >> "$errors_file"
      # Join retry errors with ';' since '|' is the field delimiter in the result file
      [[ -n "$retry_errors" ]] && retry_errors="$retry_errors; "
      retry_errors="$retry_errors$error_summary"
      if [[ $attempt -lt $MAX_RETRIES ]]; then
        local build_error=$(tail -50 "$build_output")
        local fix_query="Build failed. Fix it:
\`\`\`
$build_error
\`\`\`
Work ONLY in $work_dir."
        run_agent "$work_dir" "$fix_query" "$agent_output"
      fi
    fi
    ((attempt++))
  done
  # Check patterns
  local patterns_found=0
  local patterns_total=0
  local missing_patterns=""
  if [[ -n "$verify_patterns" ]]; then
    while IFS= read -r pattern; do
      if [[ -n "$pattern" ]]; then
        ((patterns_total++))
        if grep -rqE "$pattern" "$work_dir/src" 2>/dev/null; then
          ((patterns_found++))
        else
          [[ -n "$missing_patterns" ]] && missing_patterns="$missing_patterns, "
          missing_patterns="$missing_patterns$pattern"
        fi
      fi
    done <<< "$verify_patterns"
  fi
  # Check negative patterns
  local negative_failed=0
  local found_negative=""
  if [[ -n "$negative_patterns" ]]; then
    while IFS= read -r pattern; do
      if [[ -n "$pattern" ]]; then
        if grep -rqE "$pattern" "$work_dir/src" 2>/dev/null; then
          [[ -n "$found_negative" ]] && found_negative="$found_negative, "
          found_negative="$found_negative$pattern"
          ((negative_failed++))
        fi
      fi
    done <<< "$negative_patterns"
  fi
  # Record results
  if [[ -n "$missing_patterns" ]]; then
    echo "Missing patterns: $missing_patterns" >> "$errors_file"
  fi
  if [[ -n "$found_negative" ]]; then
    echo "Forbidden patterns found: $found_negative" >> "$errors_file"
  fi
  local build_end=$(date +%s)
  local build_time=$((build_end - build_start))
  local total_time=$((build_end - eval_start))
  # Determine pass/fail
  local eval_passed=0
  local timing_info="setup:${setup_time}s agent:${agent_time}s build:${build_time}s total:${total_time}s"
  if [[ $build_success -eq 1 ]] && [[ $patterns_found -eq $patterns_total ]] && [[ $negative_failed -eq 0 ]]; then
    echo "PASS|$eval_id|$attempt|$retry_errors|$timing_info" > "$result_file"
    echo -e "[$(date +%H:%M:%S)] $eval_id: ${GREEN}PASSED${NC} (attempt $attempt) [$timing_info]"
    eval_passed=1
  else
    local fail_reason=""
    [[ $build_success -eq 0 ]] && fail_reason="build_failed"
    [[ $patterns_found -ne $patterns_total ]] && fail_reason="${fail_reason:+$fail_reason,}missing_patterns"
    [[ $negative_failed -gt 0 ]] && fail_reason="${fail_reason:+$fail_reason,}forbidden_patterns"
    echo "FAIL|$eval_id|$attempt|$fail_reason|$retry_errors|$timing_info" > "$result_file"
    echo -e "[$(date +%H:%M:%S)] $eval_id: ${RED}FAILED${NC} ($fail_reason) [$timing_info]"
  fi
  # Preserve the workspace if the eval failed OR was not a one-shot
  if [[ $eval_passed -eq 0 ]] || [[ $attempt -gt 1 ]]; then
    local workspace_dir="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-workspace"
    rm -rf "$workspace_dir"
    cp -r "$work_dir" "$workspace_dir"
  fi
  # Clean up temp workspace
  rm -rf "$work_dir"
  [[ $eval_passed -eq 1 ]] && return 0 || return 1
}

# Export functions and variables for parallel execution
export -f run_evaluation setup_project extract_error_summary run_agent run_with_claude run_with_crush kill_tree run_with_timeout
export SCRIPT_DIR OUTPUT_DIR BASE_TEMPLATE WORK_BASE USE_SKILL MAX_RETRIES EVAL_TIMEOUT BACKEND USE_GATEWAY
export RED GREEN YELLOW BLUE CYAN NC

main() {
  echo "Viaduct Skill Evaluation Harness (Parallel Edition)"
  echo "===================================================="
  echo "Base template: $BASE_TEMPLATE"
  echo "Work directory: $WORK_BASE-<eval-id>"
  [[ $USE_SKILL -eq 1 ]] && echo -e "Mode: ${GREEN}WITH SKILL${NC}" || echo -e "Mode: ${BLUE}NO SKILL${NC}"
  if [[ "$BACKEND" == "crush" ]]; then
    echo -e "Backend: ${CYAN}Crush${NC} (~165 MB/process)"
  else
    echo -e "Backend: ${CYAN}Claude CLI${NC} (~800 MB/process)"
  fi
  echo "Max retries: $MAX_RETRIES"
  echo "Eval timeout: ${EVAL_TIMEOUT}s"
  echo -e "Parallelism: ${CYAN}$MAX_PARALLEL${NC} concurrent evaluations"
  echo ""
  check_deps
  # Clean old outputs if requested
  if [[ $CLEAN -eq 1 ]]; then
    echo -e "${YELLOW}Cleaning eval outputs...${NC}"
    rm -rf "$OUTPUT_DIR"/*-workspace 2>/dev/null
    rm -f "$OUTPUT_DIR"/*.result "$OUTPUT_DIR"/*-agent.txt "$OUTPUT_DIR"/*-build.txt "$OUTPUT_DIR"/*-errors.txt "$OUTPUT_DIR"/*-claude.txt 2>/dev/null
    echo "Done."
    echo ""
  fi
  # Detect authentication mode (direct API vs. internal gateway)
  detect_auth_mode
  if [[ "$USE_GATEWAY" -eq 1 ]]; then
    echo -e "Auth: ${CYAN}Internal gateway${NC} (iap-auth)"
  else
    echo -e "Auth: ${CYAN}Direct Anthropic API${NC}"
  fi
  # Set up Crush providers if using the Crush backend with the internal gateway
  if [[ "$BACKEND" == "crush" ]] && [[ "$USE_GATEWAY" -eq 1 ]]; then
    echo "Configuring Crush providers for internal gateway..."
    setup_crush_for_gateway
  fi
  # Pre-warm Gradle daemon
  prewarm_gradle
  # Get the list of evaluations to run
  local eval_count=$(jq length "$EVAL_FILE")
  local evals_to_run=()
  for i in $(seq 0 $((eval_count - 1))); do
    local eval_id=$(jq -r ".[$i].id" "$EVAL_FILE")
    local eval_name=$(jq -r ".[$i].name" "$EVAL_FILE")
    # Filter check
    if [[ -n "$FILTER" && "$eval_id" != *"$FILTER"* && "$eval_name" != *"$FILTER"* ]]; then
      continue
    fi
    evals_to_run+=("$i")
  done
  local total_evals=${#evals_to_run[@]}
  echo "Running $total_evals evaluations..."
  echo ""
  # Clear old result files
  local suffix=$([[ $USE_SKILL -eq 0 ]] && echo "-noskill" || echo "")
  local backend_suffix=$([[ "$BACKEND" == "crush" ]] && echo "-crush" || echo "")
  rm -f "$OUTPUT_DIR"/*$suffix$backend_suffix.result 2>/dev/null
  # Track running jobs
  local running_pids=()
  local running_evals=()
  local completed=0
  for idx in "${evals_to_run[@]}"; do
    local eval_id=$(jq -r ".[$idx].id" "$EVAL_FILE")
    local eval_name=$(jq -r ".[$idx].name" "$EVAL_FILE")
    local eval_query=$(jq -r ".[$idx].query" "$EVAL_FILE")
    local verify_patterns=$(jq -r ".[$idx].verify_patterns | .[]?" "$EVAL_FILE" 2>/dev/null || echo "")
    local schema_addition=$(jq -r ".[$idx].schema // empty" "$EVAL_FILE" 2>/dev/null || echo "")
    local negative_patterns=$(jq -r ".[$idx].negative_patterns | .[]?" "$EVAL_FILE" 2>/dev/null || echo "")
    # Wait if we've hit max parallelism
    while [[ ${#running_pids[@]} -ge $MAX_PARALLEL ]]; do
      # Reap any finished jobs
      local new_pids=()
      local new_evals=()
      for i in "${!running_pids[@]}"; do
        if kill -0 "${running_pids[$i]}" 2>/dev/null; then
          new_pids+=("${running_pids[$i]}")
          new_evals+=("${running_evals[$i]}")
        else
          ((completed++))
          echo -e "${CYAN}[$completed/$total_evals completed]${NC}"
        fi
      done
      running_pids=("${new_pids[@]}")
      running_evals=("${new_evals[@]}")
      if [[ ${#running_pids[@]} -ge $MAX_PARALLEL ]]; then
        sleep 1
      fi
    done
    # Start the evaluation in the background with a timeout
    run_with_timeout "$EVAL_TIMEOUT" run_evaluation "$eval_id" "$eval_name" "$eval_query" "$verify_patterns" "$schema_addition" "$negative_patterns" &
    running_pids+=($!)
    running_evals+=("$eval_id")
  done
  # Wait for all remaining jobs
  echo "Waiting for remaining evaluations to complete..."
  for pid in "${running_pids[@]}"; do
    wait "$pid" 2>/dev/null || true
  done
  # Collect results into arrays for grouped reporting
  local passed=0 failed=0 one_shot=0 total_run=0
  local -a success_oneshot=()
  local -a success_retry=()   # "eval_id (name)|attempts|timing|retry_errors"
  local -a failure_list=()    # "eval_id (name)|reason|details|timing"
  for idx in "${evals_to_run[@]}"; do
    local eval_id=$(jq -r ".[$idx].id" "$EVAL_FILE")
    local eval_name=$(jq -r ".[$idx].name" "$EVAL_FILE")
    local result_file="$OUTPUT_DIR/$eval_id$suffix$backend_suffix.result"
    ((total_run++))
    if [[ -f "$result_file" ]]; then
      local result=$(cat "$result_file")
      local status=$(echo "$result" | cut -d'|' -f1)
      local attempt=$(echo "$result" | cut -d'|' -f3)
      local timing=$(echo "$result" | rev | cut -d'|' -f1 | rev)
      if [[ "$status" == "PASS" ]]; then
        ((passed++))
        if [[ "$attempt" == "1" ]]; then
          ((one_shot++))
          success_oneshot+=("$eval_id ($eval_name)")
        else
          # Collect retry error details
          local retry_errors=$(echo "$result" | cut -d'|' -f4)
          success_retry+=("$eval_id ($eval_name)|$attempt|$timing|$retry_errors")
        fi
      else
        ((failed++))
        local fail_reason=$(echo "$result" | cut -d'|' -f4)
        local error_details=""
        local errors_file="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-errors.txt"
        if [[ -f "$errors_file" ]] && [[ -s "$errors_file" ]]; then
          error_details=$(cat "$errors_file")
        fi
        failure_list+=("$eval_id ($eval_name)|$fail_reason|$error_details|$timing")
      fi
    else
      ((failed++))
      failure_list+=("$eval_id ($eval_name)|timeout|Exceeded ${EVAL_TIMEOUT}s limit|")
    fi
  done
  # Print report
  echo ""
  echo "============================================================"
  echo "REPORT"
  echo "============================================================"
  [[ $USE_SKILL -eq 1 ]] && echo -e "Mode: ${GREEN}WITH SKILL${NC}" || echo -e "Mode: ${BLUE}NO SKILL${NC}"
  echo -e "Backend: ${CYAN}$BACKEND${NC}"
  echo ""
  # --- Successes ---
  echo -e "${GREEN}PASSED: $passed / $total_run${NC}"
  echo ""
  if [[ ${#success_oneshot[@]} -gt 0 ]]; then
    echo -e "  ${GREEN}One-shot ($one_shot):${NC}"
    for entry in "${success_oneshot[@]}"; do
      echo -e "    ${GREEN}✓${NC} $entry"
    done
    echo ""
  fi
  if [[ ${#success_retry[@]} -gt 0 ]]; then
    echo -e "  ${YELLOW}Passed with retries ($(( passed - one_shot ))):${NC}"
    for entry in "${success_retry[@]}"; do
      local name=$(echo "$entry" | cut -d'|' -f1)
      local attempts=$(echo "$entry" | cut -d'|' -f2)
      local timing=$(echo "$entry" | cut -d'|' -f3)
      local retry_errors=$(echo "$entry" | cut -d'|' -f4)
      echo -e "    ${YELLOW}✓${NC} $name — ${YELLOW}$attempts attempts${NC} [$timing]"
      if [[ -n "$retry_errors" ]]; then
        # Retry errors are ';'-separated within the field
        echo -e "      ${CYAN}retry errors: $retry_errors${NC}"
      fi
    done
    echo ""
  fi
  # --- Failures ---
  if [[ ${#failure_list[@]} -gt 0 ]]; then
    echo -e "${RED}FAILED: $failed / $total_run${NC}"
    echo ""
    for entry in "${failure_list[@]}"; do
      local name=$(echo "$entry" | cut -d'|' -f1)
      local reason=$(echo "$entry" | cut -d'|' -f2)
      local details=$(echo "$entry" | cut -d'|' -f3)
      local timing=$(echo "$entry" | cut -d'|' -f4)
      echo -e "  ${RED}✗${NC} $name"
      echo -e "    Reason: ${RED}$reason${NC}"
      if [[ -n "$timing" ]]; then
        echo -e "    Timing: $timing"
      fi
      if [[ -n "$details" ]]; then
        while IFS= read -r line; do
          echo -e "    ${CYAN}$line${NC}"
        done <<< "$details"
      fi
      echo ""
    done
  fi
  echo "============================================================"
  echo -e "Total: $total_run | ${GREEN}Passed: $passed${NC} | ${GREEN}One-shot: $one_shot${NC} | ${RED}Failed: $failed${NC}"
  if [[ $passed -gt 0 ]]; then
    echo -e "One-shot rate: ${GREEN}$(( one_shot * 100 / passed ))%${NC} of passes | ${GREEN}$(( one_shot * 100 / total_run ))%${NC} of total"
  fi
  echo ""
  echo "Outputs: $OUTPUT_DIR"
  # List preserved workspaces
  local workspaces=$(ls -d "$OUTPUT_DIR"/*-workspace 2>/dev/null | wc -l | tr -d ' ')
  if [[ "$workspaces" -gt 0 ]]; then
    echo -e "${CYAN}Preserved workspaces (failed or retried):${NC}"
    for ws in "$OUTPUT_DIR"/*-workspace; do
      [[ -d "$ws" ]] && echo "  $(basename "$ws")"
    done
  fi
  [[ $failed -gt 0 ]] && return 1 || return 0
}

run_compare() {
  echo "============================================================"
  echo "COMPARISON MODE: skill vs no-skill"
  echo "============================================================"
  echo ""
  local backend_suffix=$([[ "$BACKEND" == "crush" ]] && echo "-crush" || echo "")
  # Run with the skill
  echo -e "${GREEN}>>> Running WITH skill...${NC}"
  echo ""
  USE_SKILL=1
  export USE_SKILL
  main
  local skill_exit=$?
  echo ""
  echo ""
  # Run without the skill
  echo -e "${BLUE}>>> Running WITHOUT skill...${NC}"
  echo ""
  USE_SKILL=0
  export USE_SKILL
  main
  local noskill_exit=$?
  # Build the comparison from result files
  echo ""
  echo ""
  echo "============================================================"
  echo "COMPARISON REPORT"
  echo "============================================================"
  echo -e "Backend: ${CYAN}$BACKEND${NC}"
  echo ""
  local eval_count=$(jq length "$EVAL_FILE")
  # Header
  printf "  %-40s %-18s %-18s\n" "Evaluation" "With Skill" "Without Skill"
  printf "  %-40s %-18s %-18s\n" "$(printf '%0.s─' {1..40})" "$(printf '%0.s─' {1..18})" "$(printf '%0.s─' {1..18})"
  local skill_passed=0 skill_oneshot=0 skill_total=0
  local noskill_passed=0 noskill_oneshot=0 noskill_total=0
  for i in $(seq 0 $((eval_count - 1))); do
    local eval_id=$(jq -r ".[$i].id" "$EVAL_FILE")
    local eval_name=$(jq -r ".[$i].name" "$EVAL_FILE")
    # Filter check
    if [[ -n "$FILTER" && "$eval_id" != *"$FILTER"* && "$eval_name" != *"$FILTER"* ]]; then
      continue
    fi
    local skill_result_file="$OUTPUT_DIR/$eval_id$backend_suffix.result"
    local noskill_result_file="$OUTPUT_DIR/$eval_id-noskill$backend_suffix.result"
    local skill_label noskill_label
    # Parse skill result
    ((skill_total++))
    if [[ -f "$skill_result_file" ]]; then
      local s_result=$(cat "$skill_result_file")
      local s_status=$(echo "$s_result" | cut -d'|' -f1)
      local s_attempt=$(echo "$s_result" | cut -d'|' -f3)
      if [[ "$s_status" == "PASS" ]]; then
        ((skill_passed++))
        if [[ "$s_attempt" == "1" ]]; then
          ((skill_oneshot++))
          skill_label="${GREEN}one-shot${NC}"
        else
          skill_label="${YELLOW}attempt $s_attempt${NC}"
        fi
      else
        local s_reason=$(echo "$s_result" | cut -d'|' -f4)
        skill_label="${RED}FAIL ($s_reason)${NC}"
      fi
    else
      skill_label="${RED}TIMEOUT${NC}"
    fi
    # Parse no-skill result
    ((noskill_total++))
    if [[ -f "$noskill_result_file" ]]; then
      local n_result=$(cat "$noskill_result_file")
      local n_status=$(echo "$n_result" | cut -d'|' -f1)
      local n_attempt=$(echo "$n_result" | cut -d'|' -f3)
      if [[ "$n_status" == "PASS" ]]; then
        ((noskill_passed++))
        if [[ "$n_attempt" == "1" ]]; then
          ((noskill_oneshot++))
          noskill_label="${GREEN}one-shot${NC}"
        else
          noskill_label="${YELLOW}attempt $n_attempt${NC}"
        fi
      else
        local n_reason=$(echo "$n_result" | cut -d'|' -f4)
        noskill_label="${RED}FAIL ($n_reason)${NC}"
      fi
    else
      noskill_label="${RED}TIMEOUT${NC}"
    fi
    # Pad only the plain eval ID column; ANSI color codes throw off printf's
    # width counting, so the colored labels are tab-separated instead
    local display_id="$eval_id"
    [[ ${#display_id} -gt 40 ]] && display_id="${display_id:0:37}..."
    printf "  %-40s " "$display_id"
    echo -ne "$skill_label"
    printf "\t"
    echo -e "$noskill_label"
  done
  echo ""
  printf "  %-40s %-18s %-18s\n" "$(printf '%0.s─' {1..40})" "$(printf '%0.s─' {1..18})" "$(printf '%0.s─' {1..18})"
  echo -e "  Passed:   ${GREEN}$skill_passed / $skill_total${NC}    ${BLUE}$noskill_passed / $noskill_total${NC}"
  echo -e "  One-shot: ${GREEN}$skill_oneshot / $skill_total${NC}    ${BLUE}$noskill_oneshot / $noskill_total${NC}"
  if [[ $skill_total -gt 0 ]]; then
    local skill_pct=$(( skill_oneshot * 100 / skill_total ))
    local noskill_pct=$(( noskill_oneshot * 100 / noskill_total ))
    local delta=$(( skill_pct - noskill_pct ))
    echo ""
    echo -e "  One-shot rate: ${GREEN}${skill_pct}%${NC}    ${BLUE}${noskill_pct}%${NC}"
    if [[ $delta -gt 0 ]]; then
      echo -e "  Skill improvement: ${GREEN}+${delta}pp${NC}"
    elif [[ $delta -lt 0 ]]; then
      echo -e "  Skill improvement: ${RED}${delta}pp${NC}"
    else
      echo -e "  Skill improvement: 0pp (no difference)"
    fi
  fi
  echo ""
  echo "Outputs: $OUTPUT_DIR"
  [[ $skill_exit -ne 0 || $noskill_exit -ne 0 ]] && exit 1
  exit 0
}

if [[ $COMPARE -eq 1 ]]; then
  run_compare
else
  main
  exit $?
fi