Viaduct skill evaluation harness

evaluations.json:
[
{
"id": "eval-01-field-resolver",
"name": "Simple Field Resolver",
"skills": ["viaduct-field-resolver"],
"schema": "type Group implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n description: String\n}",
"query": "I want to show a member count on each Group. Add a memberCount field that returns 42 for now.",
"expected_behavior": [
"Adds 'memberCount: Int! @resolver' to the Group type in schema",
"Creates resolver class extending GroupResolvers.MemberCount()",
"Uses @Resolver annotation"
],
"verify_patterns": [
"memberCount: Int!? @resolver",
"GroupResolvers.MemberCount"
]
},
{
"id": "eval-02-node-resolver",
"name": "Node Type and Query Resolver",
"skills": ["viaduct-node-type", "viaduct-query-resolver"],
"schema": "type Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n color: String!\n}\n\nextend type Query {\n tag(id: ID! @idOf(type: \"Tag\")): Tag @resolver\n}",
"query": "I need to be able to fetch a Tag by its ID. Implement the resolvers so I can query for a single tag. Return hardcoded data for now.",
"expected_behavior": [
"Creates TagNodeResolver extending NodeResolvers.Tag()",
"Creates TagQueryResolver extending QueryResolvers.Tag()",
"Uses ctx.arguments.id.internalID to get the UUID"
],
"verify_patterns": [
"NodeResolvers.Tag",
"QueryResolvers.Tag",
"internalID"
]
},
{
"id": "eval-03-required-selection-set",
"name": "Field Resolver with Required Selection Set",
"skills": ["viaduct-field-resolver"],
"schema": "type GroupMember implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n userId: String!\n role: String!\n}",
"query": "Add a displayName field to GroupMember that shows 'User: ' followed by the userId. It needs to read the userId from the parent object.",
"expected_behavior": [
"Adds 'displayName: String @resolver' to GroupMember type",
"Creates resolver with @Resolver annotation specifying objectValueFragment",
"Extends GroupMemberResolvers.DisplayName()",
"Accesses userId via ctx.objectValue.getUserId()"
],
"verify_patterns": [
"displayName: String!? @resolver",
"GroupMemberResolvers.DisplayName",
"objectValueFragment|fragment .* on GroupMember",
"getUserId"
]
},
{
"id": "eval-04-batch-resolution",
"name": "Batch Resolution for N+1 Prevention",
"skills": ["viaduct-batch"],
"schema": "type Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n}\n\ntype Group implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n tags: [Tag!]! @resolver\n}",
"query": "Implement the tags field on Group. Each group should return a list of tags. Make sure it handles multiple groups efficiently without N+1 queries. Return hardcoded tags for now.",
"expected_behavior": [
"Creates GroupTagsResolver extending GroupResolvers.Tags()",
"Implements batchResolve with List<Context> parameter",
"Returns List<FieldValue<List<Tag>>>",
"Uses FieldValue.ofValue() for results"
],
"verify_patterns": [
"GroupResolvers.Tags",
"batchResolve",
"FieldValue"
]
},
{
"id": "eval-05-mutations-globalid",
"name": "Mutations with GlobalID Handling",
"skills": ["viaduct-mutations"],
"schema": "type Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n color: String!\n}\n\ninput CreateTagInput {\n name: String!\n color: String!\n}\n\ninput UpdateTagInput {\n id: ID! @idOf(type: \"Tag\")\n name: String\n color: String\n}\n\nextend type Mutation {\n createTag(input: CreateTagInput!): Tag! @resolver\n updateTag(input: UpdateTagInput!): Tag @resolver\n deleteTag(id: ID! @idOf(type: \"Tag\")): Boolean! @resolver\n}",
"query": "Implement the tag mutations: createTag, updateTag, and deleteTag. Return mock data for now.",
"expected_behavior": [
"Creates CreateTagMutationResolver extending MutationResolvers.CreateTag()",
"Creates UpdateTagMutationResolver extending MutationResolvers.UpdateTag()",
"Creates DeleteTagMutationResolver extending MutationResolvers.DeleteTag()",
"Uses input.id.internalID for update/delete",
"Uses ctx.globalIDFor when building Tag response"
],
"verify_patterns": [
"MutationResolvers.CreateTag",
"MutationResolvers.UpdateTag",
"MutationResolvers.DeleteTag",
"internalID",
"globalIDFor"
]
},
{
"id": "eval-06-scopes",
"name": "Scope-based API Visibility",
"skills": ["viaduct-scopes"],
"schema": "type Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n}",
"query": "Add admin-only operations for Tag: a way to get/set internalNotes, get usageCount, and delete all tags. These should only be visible to admin API consumers, not the default scope.",
"expected_behavior": [
"Creates 'extend type Mutation @scope(to: [\"admin\"])' for admin mutations",
"Uses mutations instead of fields (extend type Tag doesn't work)",
"Includes deleteAllTags mutation"
],
"verify_patterns": [
"@scope\\(to: \\[\"admin\"\\]\\)",
"extend type Mutation",
"deleteAllTags",
"MutationResolvers"
]
},
{
"id": "eval-07-entity-relationships",
"name": "Entity Relationships with Node References",
"skills": ["viaduct-relationships"],
"schema": "type User implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n email: String!\n}\n\ntype Tag implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n createdById: String!\n createdBy: User @resolver\n}",
"query": "Implement the createdBy field on Tag so it returns the User who created it. The createdById field has the user's ID. Also implement basic node resolvers for User and Tag.",
"expected_behavior": [
"Creates TagCreatedByResolver with objectValueFragment including createdById",
"Returns ctx.nodeFor(ctx.globalIDFor(User.Reflection, createdById))",
"Does NOT fetch user data directly"
],
"verify_patterns": [
"TagResolvers.CreatedBy",
"nodeFor",
"globalIDFor.*User"
]
},
{
"id": "gotcha-01-missing-idof-input",
"name": "Gotcha: Input Type Missing @idOf",
"skills": ["viaduct-mutations"],
"schema": "type Task implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n title: String!\n completed: Boolean!\n}\n\ninput UpdateTaskInput {\n id: ID!\n title: String\n completed: Boolean\n}\n\nextend type Mutation {\n updateTask(input: UpdateTaskInput!): Task! @resolver\n}",
"query": "Implement the updateTask mutation. The resolver MUST extract the task ID from input.id and log it. Return mock data using that ID.",
"expected_behavior": [
"Recognizes @idOf is missing from UpdateTaskInput.id",
"Adds @idOf(type: \"Task\") to the id field in schema",
"Uses input.id.internalID in the resolver",
"Does NOT manually decode Base64"
],
"verify_patterns": [
"@idOf\\(type: \"Task\"\\)",
"MutationResolvers.UpdateTask",
"internalID"
],
"negative_patterns": [
"Base64\\.getDecoder",
"Base64\\.decode",
"substringAfter\\(\":\"\\)"
]
},
{
"id": "gotcha-02-missing-idof-query-arg",
"name": "Gotcha: Query Argument Missing @idOf",
"skills": ["viaduct-query-resolver"],
"schema": "type Project implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n description: String\n}\n\nextend type Query {\n project(id: ID!): Project @resolver\n}",
"query": "I need to fetch a single project by ID. Implement the project query resolver. Return hardcoded data.",
"expected_behavior": [
"Recognizes @idOf is missing from query argument",
"Adds @idOf(type: \"Project\") to the id argument",
"Uses ctx.arguments.id.internalID in the resolver",
"Does NOT manually decode Base64"
],
"verify_patterns": [
"project\\(id: ID! @idOf\\(type: \"Project\"\\)\\)",
"QueryResolvers.Project",
"internalID"
],
"negative_patterns": [
"Base64\\.getDecoder",
"Base64\\.decode",
"substringAfter\\(\":\"\\)"
]
},
{
"id": "gotcha-03-globalid-response",
"name": "Gotcha: Building GlobalID for Response",
"skills": ["viaduct-mutations"],
"schema": "type Item implements Node @resolver @scope(to: [\"default\"]) {\n id: ID!\n name: String!\n quantity: Int!\n}\n\ninput CreateItemInput {\n name: String!\n quantity: Int!\n}\n\nextend type Mutation {\n createItem(input: CreateItemInput!): Item! @resolver\n}",
"query": "Implement the createItem mutation. Generate a UUID for the new item and return it with the input values.",
"expected_behavior": [
"Uses ctx.globalIDFor(Item.Reflection, uuid) to create GlobalID",
"Does NOT pass raw UUID string to .id() method",
"Uses Item.Builder pattern"
],
"verify_patterns": [
"MutationResolvers.CreateItem",
"globalIDFor\\(Item\\.Reflection",
"Item\\.Builder"
],
"negative_patterns": [
"\\.id\\([a-z]+Id\\)",
"\\.id\\(uuid\\)",
"\\.id\\(UUID\\."
]
}
]

run-evaluations.sh:
#!/bin/bash
#
# Viaduct Skill Evaluation Harness (Parallel Edition)
#
# Runs evaluations against the viaduct skill.
# Each evaluation:
# 1. Copies base-template to unique temp directory
# 2. Appends eval-specific schema types
# 3. Runs Gradle to generate scaffolding
# 4. Runs AI agent (Claude CLI or Crush) to implement the feature
# 5. Builds and verifies patterns
#
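# (Steps 1-3 happen in setup_project, step 4 in run_agent, and step 5 in the
# build/verify loop inside run_evaluation.)
#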
# Usage:
# ./run-evaluations.sh [options] [eval-id]
#
# Options:
# --no-skill Run without the viaduct skill (baseline test)
# --skill Run with the viaduct skill (default)
# --compare Run with and without skill, then show side-by-side comparison
# --clean Remove all previous eval outputs before starting
# --parallel=N Run N evaluations in parallel (default: 10 for Crush, 4 for Claude)
# --sequential Run evaluations one at a time (--parallel=1)
# --backend=X Use 'crush' (default) or 'claude' as the AI backend
# --crush / --claude Shorthand for --backend=crush / --backend=claude
#
# Environment:
# MAX_RETRIES=3 Set max retry attempts
# EVAL_TIMEOUT=600 Per-evaluation timeout in seconds
# MAX_PARALLEL=10 Set max parallel evaluations (default: 10 for Crush, 4 for Claude)
#
# Output:
# .eval-outputs/<eval-id>-agent.txt Agent's final response
# .eval-outputs/<eval-id>-build.txt Gradle build output
# .eval-outputs/<eval-id>-errors.txt Error summary
# .eval-outputs/<eval-id>-workspace/ Full workspace (preserved on failure or retry)
#
# Backends:
# crush - Charmbracelet Crush (~165 MB/process, default, requires crush CLI)
# Crush requires: CATWALK_URL=http://localhost:1 to use cached providers
# claude - Claude CLI (~800 MB/process, requires claude CLI)
#
set -o pipefail
# Clean up all child processes on exit
cleanup() {
local pids=$(jobs -p 2>/dev/null)
if [[ -n "$pids" ]]; then
echo -e "\nCleaning up child processes..."
kill $pids 2>/dev/null
sleep 2
kill -9 $pids 2>/dev/null
fi
}
trap cleanup EXIT INT TERM
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
EVAL_FILE="$SCRIPT_DIR/evaluations.json"
OUTPUT_DIR="$SCRIPT_DIR/.eval-outputs"
BASE_TEMPLATE="$SCRIPT_DIR/base-template"
WORK_BASE="/tmp/viaduct-skill-eval"
# Default settings
USE_SKILL=1
CLEAN=0
COMPARE=0
FILTER=""
MAX_RETRIES="${MAX_RETRIES:-3}"
EVAL_TIMEOUT="${EVAL_TIMEOUT:-600}" # 10 minutes per evaluation
BACKEND="${BACKEND:-crush}"
# MAX_PARALLEL default depends on backend (set after parsing args)
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'
# Parse arguments
EXPLICIT_PARALLEL=""
while [[ $# -gt 0 ]]; do
case $1 in
--no-skill)
USE_SKILL=0
shift
;;
--skill)
USE_SKILL=1
shift
;;
--parallel=*)
EXPLICIT_PARALLEL="${1#*=}"
shift
;;
--sequential)
EXPLICIT_PARALLEL=1
shift
;;
--backend=*)
BACKEND="${1#*=}"
shift
;;
--crush)
BACKEND="crush"
shift
;;
--claude)
BACKEND="claude"
shift
;;
--clean)
CLEAN=1
shift
;;
--compare)
COMPARE=1
shift
;;
*)
FILTER="$1"
shift
;;
esac
done
# Set MAX_PARALLEL based on backend (crush uses less memory, can run more parallel)
if [[ -n "$EXPLICIT_PARALLEL" ]]; then
MAX_PARALLEL="$EXPLICIT_PARALLEL"
elif [[ -n "$MAX_PARALLEL" ]]; then
: # Use environment variable
elif [[ "$BACKEND" == "crush" ]]; then
MAX_PARALLEL=10
else
MAX_PARALLEL=4
fi
mkdir -p "$OUTPUT_DIR"
check_deps() {
local missing=0
command -v jq &>/dev/null || { echo -e "${RED}Error: jq required${NC}"; missing=1; }
command -v java &>/dev/null || { echo -e "${RED}Error: java 17+ required${NC}"; missing=1; }
[[ ! -d "$BASE_TEMPLATE" ]] && { echo -e "${RED}Error: base-template not found at $BASE_TEMPLATE${NC}"; missing=1; }
if [[ "$BACKEND" == "crush" ]]; then
command -v crush &>/dev/null || { echo -e "${RED}Error: crush CLI required (brew install charmbracelet/tap/crush)${NC}"; missing=1; }
else
command -v claude &>/dev/null || { echo -e "${RED}Error: claude CLI required${NC}"; missing=1; }
fi
[[ $missing -eq 1 ]] && exit 1
}
# Timer helper
time_cmd() {
local start=$(date +%s)
"$@"
local end=$(date +%s)
echo $((end - start))
}
# Kill a process and all its children
kill_tree() {
local pid="$1"
local signal="${2:-TERM}"
# Kill children first (process group), then the process itself
local children=$(pgrep -P "$pid" 2>/dev/null)
for child in $children; do
kill_tree "$child" "$signal"
done
kill -"$signal" "$pid" 2>/dev/null || true
}
# Run a function with a timeout. Kills the process tree if it exceeds the limit.
# Usage: run_with_timeout <timeout_secs> <function> [args...]
run_with_timeout() {
local timeout="$1"
shift
# Run the function in a subshell so we get a single PID to track
"$@" &
local cmd_pid=$!
# Watchdog: sleep then kill if still running
(
sleep "$timeout"
if kill -0 "$cmd_pid" 2>/dev/null; then
echo -e "${RED}[$(date +%H:%M:%S)] TIMEOUT: killing evaluation (exceeded ${timeout}s)${NC}" >&2
kill_tree "$cmd_pid" TERM
sleep 5
# Force kill if still alive
if kill -0 "$cmd_pid" 2>/dev/null; then
kill_tree "$cmd_pid" KILL
fi
fi
) &
local watchdog_pid=$!
# Wait for the command to finish (either naturally or killed)
wait "$cmd_pid" 2>/dev/null
local exit_code=$?
# Clean up the watchdog
kill "$watchdog_pid" 2>/dev/null
wait "$watchdog_pid" 2>/dev/null
return $exit_code
}
# Pre-warm Gradle daemon and download dependencies
prewarm_gradle() {
echo "Pre-warming Gradle daemon and cache..."
local prewarm_dir="$WORK_BASE-prewarm"
local start=$(date +%s)
rm -rf "$prewarm_dir"
cp -r "$BASE_TEMPLATE" "$prewarm_dir"
# Run a build to warm up the daemon and cache dependencies
if (cd "$prewarm_dir" && ./gradlew viaductCodegen classes --daemon -q 2>&1); then
local end=$(date +%s)
echo -e "${GREEN}Gradle daemon warmed up${NC} ($(( end - start ))s)"
else
echo -e "${YELLOW}Warning: Gradle prewarm had issues, continuing anyway${NC}"
fi
rm -rf "$prewarm_dir"
}
setup_project() {
local work_dir="$1"
local schema_addition="$2"
local eval_id="$3"
# Clean and copy base template
rm -rf "$work_dir"
cp -r "$BASE_TEMPLATE" "$work_dir"
# Append schema types for this evaluation
if [[ -n "$schema_addition" ]]; then
echo "" >> "$work_dir/src/main/viaduct/schema/Schema.graphqls"
echo "$schema_addition" >> "$work_dir/src/main/viaduct/schema/Schema.graphqls"
fi
# Generate scaffolding with Gradle (using daemon for speed)
if ! (cd "$work_dir" && ./gradlew viaductCodegen --daemon -q 2>&1); then
return 1
fi
# Install AGENTS.md with doc references if in skill mode
if [[ $USE_SKILL -eq 1 ]]; then
local install_output
install_output=$(cd "$work_dir" && node "$SCRIPT_DIR/../bin/install.js" 2>&1)
local install_exit=$?
if [[ $install_exit -ne 0 ]]; then
echo -e "${RED}Warning: skill install failed for $eval_id (exit $install_exit)${NC}" >&2
echo "$install_output" >&2
fi
# Verify docs were actually installed
if [[ ! -f "$work_dir/AGENTS.md" && ! -f "$work_dir/CLAUDE.md" ]]; then
echo -e "${RED}Warning: no AGENTS.md or CLAUDE.md found in $work_dir after install${NC}" >&2
fi
if [[ ! -d "$work_dir/.viaduct/agents" ]]; then
echo -e "${RED}Warning: .viaduct/agents/ directory not created in $work_dir${NC}" >&2
fi
fi
return 0
}
# Detect if we're using internal gateway or direct Anthropic API
# Sets USE_GATEWAY=1 if using internal gateway, 0 if using direct API
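# e.g. ANTHROPIC_API_KEY set        -> direct API (USE_GATEWAY=0)
#      unset, but iap-auth on PATH  -> internal gateway (USE_GATEWAY=1)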
detect_auth_mode() {
if [[ -n "$ANTHROPIC_API_KEY" ]]; then
# Direct Anthropic API key provided
USE_GATEWAY=0
elif command -v iap-auth &>/dev/null; then
# Internal gateway via iap-auth
USE_GATEWAY=1
else
echo -e "${RED}Error: No authentication configured.${NC}"
echo "Set ANTHROPIC_API_KEY for direct Anthropic API access,"
echo "or ensure iap-auth is available for internal gateway access."
exit 1
fi
export USE_GATEWAY
}
# Setup Crush environment for internal gateway (modifies model IDs)
setup_crush_for_gateway() {
# Crush auto-updates providers from remote, which overwrites local changes.
# We need to modify the cached providers.json to use gateway model IDs.
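# Example rewrite performed by the Python below (gateway/Bedrock-style naming):
#   "claude-sonnet-4-5-20250929" -> "global.anthropic.claude-sonnet-4-5-20250929-v1:0"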
local providers_file="$HOME/.local/share/crush/providers.json"
if [[ ! -f "$providers_file" ]]; then
echo -e "${YELLOW}Warning: Crush providers.json not found, running crush once to initialize...${NC}"
CATWALK_URL="http://localhost:1" crush models > /dev/null 2>&1 || true
fi
if [[ -f "$providers_file" ]]; then
# Update Anthropic provider model IDs to match gateway format
python3 << 'PYEOF' 2>/dev/null || true
import json, os
filepath = os.path.expanduser('~/.local/share/crush/providers.json')
if not os.path.exists(filepath):
    exit(0)
with open(filepath, 'r') as f:
    data = json.load(f)
modified = False
for provider in data:
    if provider.get('id') == 'anthropic':
        for model in provider.get('models', []):
            old_id = model['id']
            if not old_id.startswith('global.'):
                model['id'] = f"global.anthropic.{old_id}-v1:0"
                modified = True
        if not provider.get('default_large_model_id', '').startswith('global.'):
            provider['default_large_model_id'] = 'global.anthropic.claude-sonnet-4-5-20250929-v1:0'
            provider['default_small_model_id'] = 'global.anthropic.claude-haiku-4-5-20251001-v1:0'
            modified = True
        break
if modified:
    with open(filepath, 'w') as f:
        json.dump(data, f, separators=(',', ':'))
PYEOF
fi
}
# Run prompt with Claude CLI
run_with_claude() {
local work_dir="$1"
local prompt="$2"
local output_file="$3"
if [[ -n "$ANTHROPIC_API_KEY" ]]; then
claude -p "$prompt" \
--dangerously-skip-permissions \
--no-session-persistence \
"$work_dir" >> "$output_file" 2>&1 || true
elif command -v iap-auth &>/dev/null; then
local auth_token
auth_token=$(iap-auth https://devaigateway.a.musta.ch 2>/dev/null)
if [[ -z "$auth_token" ]]; then
return 1
fi
CLAUDE_CODE_USE_BEDROCK=1 \
ANTHROPIC_BEDROCK_BASE_URL="https://devaigateway.a.musta.ch/bedrock" \
CLAUDE_CODE_SKIP_BEDROCK_AUTH=1 \
ANTHROPIC_AUTH_TOKEN="$auth_token" \
claude -p "$prompt" \
--dangerously-skip-permissions \
--no-session-persistence \
"$work_dir" >> "$output_file" 2>&1 || true
else
return 1
fi
return 0
}
# Run prompt with Crush
run_with_crush() {
local work_dir="$1"
local prompt="$2"
local output_file="$3"
(
cd "$work_dir"
if [[ "$USE_GATEWAY" -eq 1 ]]; then
# Internal gateway mode: use iap-auth token and gateway endpoint
local auth_token
auth_token=$(iap-auth https://devaigateway.a.musta.ch 2>/dev/null)
if [[ -z "$auth_token" ]]; then
echo "Failed to get iap-auth token" >> "$output_file"
return 1
fi
ANTHROPIC_API_KEY="$auth_token" \
ANTHROPIC_API_ENDPOINT="https://devaigateway.a.musta.ch" \
CATWALK_URL="http://localhost:1" \
crush run "$prompt" >> "$output_file" 2>&1
else
# Direct Anthropic API mode: use ANTHROPIC_API_KEY directly
# CATWALK_URL blocks remote provider updates to keep config stable
CATWALK_URL="http://localhost:1" \
crush run "$prompt" >> "$output_file" 2>&1
fi
) || true
return 0
}
# Run prompt with selected backend
run_agent() {
local work_dir="$1"
local prompt="$2"
local output_file="$3"
if [[ "$BACKEND" == "crush" ]]; then
run_with_crush "$work_dir" "$prompt" "$output_file"
else
run_with_claude "$work_dir" "$prompt" "$output_file"
fi
}
# Extract the key error from build output
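# e.g. an illustrative Kotlin compiler line such as
#   "e: Foo.kt: (12, 5): Unresolved reference: getUserId"
# would be reduced to "getUserId" by the first branch below.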
extract_error_summary() {
local build_output="$1"
if grep -q "Unresolved reference" "$build_output" 2>/dev/null; then
grep "Unresolved reference" "$build_output" | head -1 | sed 's/.*: //'
elif grep -q "cannot find symbol" "$build_output" 2>/dev/null; then
grep "cannot find symbol" "$build_output" | head -1
elif grep -q "not found" "$build_output" 2>/dev/null; then
grep -E "not found|Not found" "$build_output" | head -1 | sed 's/.*: //'
elif grep -q "expected" "$build_output" 2>/dev/null; then
grep "expected" "$build_output" | head -1
elif grep -q "error:" "$build_output" 2>/dev/null; then
grep "error:" "$build_output" | head -1 | sed 's/.*error: //'
else
tail -3 "$build_output" | head -1
fi
}
# Run a single evaluation (can be called in parallel)
# Writes result to $OUTPUT_DIR/<eval_id>.result
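# Result line format (parsed later by main and run_compare):
#   PASS|<eval_id>|<attempt>|<retry_errors>|<timing>
#   FAIL|<eval_id>|<attempt>|<fail_reason>|<retry_errors>|<timing>
# <retry_errors> is a semicolon-separated list of build-error summaries (may be empty).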
run_evaluation() {
local eval_id="$1"
local eval_name="$2"
local eval_query="$3"
local verify_patterns="$4"
local schema_addition="$5"
local negative_patterns="$6"
local suffix=$([[ $USE_SKILL -eq 0 ]] && echo "-noskill" || echo "")
local backend_suffix=$([[ "$BACKEND" == "crush" ]] && echo "-crush" || echo "")
# Unique workspace for this evaluation (includes suffix to avoid conflicts)
local work_dir="$WORK_BASE-$eval_id$suffix$backend_suffix"
local agent_output="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-agent.txt"
local build_output="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-build.txt"
local errors_file="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-errors.txt"
local result_file="$OUTPUT_DIR/$eval_id$suffix$backend_suffix.result"
# Log start
local eval_start=$(date +%s)
echo "[$(date +%H:%M:%S)] Starting: $eval_id"
# Setup fresh project with schema for this eval
local setup_start=$(date +%s)
if ! setup_project "$work_dir" "$schema_addition" "$eval_id"; then
echo "FAIL|$eval_id|0|SETUP_FAILED" > "$result_file"
echo "[$(date +%H:%M:%S)] $eval_id: SETUP FAILED"
return 1
fi
local setup_end=$(date +%s)
local setup_time=$((setup_end - setup_start))
# Clear errors file
> "$errors_file"
# Build query - remove skill reference if no-skill mode
local full_query
if [[ $USE_SKILL -eq 1 ]]; then
full_query="Work ONLY in $work_dir. Implement:
$eval_query"
else
local clean_query="${eval_query//Use the viaduct skill for guidance./}"
clean_query="${clean_query//Use the viaduct skill for guidance/}"
full_query="Work ONLY in $work_dir. Implement:
$clean_query"
fi
# Run AI agent
local agent_start=$(date +%s)
> "$agent_output" # Clear output file
if ! run_agent "$work_dir" "$full_query" "$agent_output"; then
echo "FAIL|$eval_id|0|AUTH_FAILED" > "$result_file"
echo "[$(date +%H:%M:%S)] $eval_id: AUTH FAILED"
return 1
fi
local agent_end=$(date +%s)
local agent_time=$((agent_end - agent_start))
# Build and fix loop
local build_start=$(date +%s)
local build_success=0
local attempt=1
local retry_errors=""
while [[ $attempt -le $MAX_RETRIES ]]; do
# Run viaductCodegen first in case the agent modified the schema
if (cd "$work_dir" && ./gradlew viaductCodegen classes --daemon -q > "$build_output" 2>&1); then
build_success=1
break
else
local error_summary=$(extract_error_summary "$build_output")
echo "Attempt $attempt: $error_summary" >> "$errors_file"
[[ -n "$retry_errors" ]] && retry_errors="$retry_errors|"
retry_errors="$retry_errors$error_summary"
if [[ $attempt -lt $MAX_RETRIES ]]; then
local build_error=$(tail -50 "$build_output")
local fix_query="Build failed. Fix it:
\`\`\`
$build_error
\`\`\`
Work ONLY in $work_dir."
run_agent "$work_dir" "$fix_query" "$agent_output"
fi
fi
((attempt++))
done
# Check patterns
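# verify_patterns is a newline-separated list of extended regexes (from evaluations.json);
# each must match somewhere under $work_dir/src, e.g. "memberCount: Int!? @resolver".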
local patterns_found=0
local patterns_total=0
local missing_patterns=""
if [[ -n "$verify_patterns" ]]; then
while IFS= read -r pattern; do
if [[ -n "$pattern" ]]; then
((patterns_total++))
if grep -rqE "$pattern" "$work_dir/src" 2>/dev/null; then
((patterns_found++))
else
[[ -n "$missing_patterns" ]] && missing_patterns="$missing_patterns, "
missing_patterns="$missing_patterns$pattern"
fi
fi
done <<< "$verify_patterns"
fi
# Check negative patterns
local negative_failed=0
local found_negative=""
if [[ -n "$negative_patterns" ]]; then
while IFS= read -r pattern; do
if [[ -n "$pattern" ]]; then
if grep -rqE "$pattern" "$work_dir/src" 2>/dev/null; then
[[ -n "$found_negative" ]] && found_negative="$found_negative, "
found_negative="$found_negative$pattern"
((negative_failed++))
fi
fi
done <<< "$negative_patterns"
fi
# Record results
if [[ -n "$missing_patterns" ]]; then
echo "Missing patterns: $missing_patterns" >> "$errors_file"
fi
if [[ -n "$found_negative" ]]; then
echo "Forbidden patterns found: $found_negative" >> "$errors_file"
fi
local build_end=$(date +%s)
local build_time=$((build_end - build_start))
local total_time=$((build_end - eval_start))
# Determine pass/fail
local eval_passed=0
local timing_info="setup:${setup_time}s agent:${agent_time}s build:${build_time}s total:${total_time}s"
if [[ $build_success -eq 1 ]] && [[ $patterns_found -eq $patterns_total ]] && [[ $negative_failed -eq 0 ]]; then
echo "PASS|$eval_id|$attempt|$retry_errors|$timing_info" > "$result_file"
echo -e "[$(date +%H:%M:%S)] $eval_id: ${GREEN}PASSED${NC} (attempt $attempt) [$timing_info]"
eval_passed=1
else
local fail_reason=""
[[ $build_success -eq 0 ]] && fail_reason="build_failed"
[[ $patterns_found -ne $patterns_total ]] && fail_reason="${fail_reason:+$fail_reason,}missing_patterns"
[[ $negative_failed -gt 0 ]] && fail_reason="${fail_reason:+$fail_reason,}forbidden_patterns"
echo "FAIL|$eval_id|$attempt|$fail_reason|$retry_errors|$timing_info" > "$result_file"
echo -e "[$(date +%H:%M:%S)] $eval_id: ${RED}FAILED${NC} ($fail_reason) [$timing_info]"
fi
# Preserve workspace if failed OR not a one-shot
if [[ $eval_passed -eq 0 ]] || [[ $attempt -gt 1 ]]; then
local workspace_dir="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-workspace"
rm -rf "$workspace_dir"
cp -r "$work_dir" "$workspace_dir"
fi
# Clean up temp workspace
rm -rf "$work_dir"
[[ $eval_passed -eq 1 ]] && return 0 || return 1
}
# Export functions and variables for parallel execution
export -f run_evaluation setup_project extract_error_summary run_agent run_with_claude run_with_crush kill_tree run_with_timeout
export SCRIPT_DIR OUTPUT_DIR BASE_TEMPLATE WORK_BASE USE_SKILL MAX_RETRIES EVAL_TIMEOUT BACKEND USE_GATEWAY
export RED GREEN YELLOW BLUE CYAN NC
main() {
echo "Viaduct Skill Evaluation Harness (Parallel Edition)"
echo "===================================================="
echo "Base template: $BASE_TEMPLATE"
echo "Work directory: $WORK_BASE-<eval-id>"
[[ $USE_SKILL -eq 1 ]] && echo -e "Mode: ${GREEN}WITH SKILL${NC}" || echo -e "Mode: ${BLUE}NO SKILL${NC}"
if [[ "$BACKEND" == "crush" ]]; then
echo -e "Backend: ${CYAN}Crush${NC} (~165 MB/process)"
else
echo -e "Backend: ${CYAN}Claude CLI${NC} (~800 MB/process)"
fi
echo "Max retries: $MAX_RETRIES"
echo "Eval timeout: ${EVAL_TIMEOUT}s"
echo -e "Parallelism: ${CYAN}$MAX_PARALLEL${NC} concurrent evaluations"
echo ""
check_deps
# Clean old outputs if requested
if [[ $CLEAN -eq 1 ]]; then
echo -e "${YELLOW}Cleaning eval outputs...${NC}"
rm -rf "$OUTPUT_DIR"/*-workspace 2>/dev/null
rm -f "$OUTPUT_DIR"/*.result "$OUTPUT_DIR"/*-agent.txt "$OUTPUT_DIR"/*-build.txt "$OUTPUT_DIR"/*-errors.txt "$OUTPUT_DIR"/*-claude.txt 2>/dev/null
echo "Done."
echo ""
fi
# Detect authentication mode (direct API vs internal gateway)
detect_auth_mode
if [[ "$USE_GATEWAY" -eq 1 ]]; then
echo -e "Auth: ${CYAN}Internal gateway${NC} (iap-auth)"
else
echo -e "Auth: ${CYAN}Direct Anthropic API${NC}"
fi
# Setup Crush providers if using Crush backend with internal gateway
if [[ "$BACKEND" == "crush" ]] && [[ "$USE_GATEWAY" -eq 1 ]]; then
echo "Configuring Crush providers for internal gateway..."
setup_crush_for_gateway
fi
# Pre-warm Gradle daemon
prewarm_gradle
# Get list of evaluations to run
local eval_count=$(jq length "$EVAL_FILE")
local evals_to_run=()
for i in $(seq 0 $((eval_count - 1))); do
local eval_id=$(jq -r ".[$i].id" "$EVAL_FILE")
local eval_name=$(jq -r ".[$i].name" "$EVAL_FILE")
# Filter check
if [[ -n "$FILTER" && "$eval_id" != *"$FILTER"* && "$eval_name" != *"$FILTER"* ]]; then
continue
fi
evals_to_run+=("$i")
done
local total_evals=${#evals_to_run[@]}
echo "Running $total_evals evaluations..."
echo ""
# Clear old result files
local suffix=$([[ $USE_SKILL -eq 0 ]] && echo "-noskill" || echo "")
local backend_suffix=$([[ "$BACKEND" == "crush" ]] && echo "-crush" || echo "")
rm -f "$OUTPUT_DIR"/*$suffix$backend_suffix.result 2>/dev/null
# Track running jobs
local running_pids=()
local running_evals=()
local completed=0
for idx in "${evals_to_run[@]}"; do
local eval_id=$(jq -r ".[$idx].id" "$EVAL_FILE")
local eval_name=$(jq -r ".[$idx].name" "$EVAL_FILE")
local eval_query=$(jq -r ".[$idx].query" "$EVAL_FILE")
local verify_patterns=$(jq -r ".[$idx].verify_patterns | .[]?" "$EVAL_FILE" 2>/dev/null || echo "")
local schema_addition=$(jq -r ".[$idx].schema // empty" "$EVAL_FILE" 2>/dev/null || echo "")
local negative_patterns=$(jq -r ".[$idx].negative_patterns | .[]?" "$EVAL_FILE" 2>/dev/null || echo "")
# Wait if we've hit max parallelism
while [[ ${#running_pids[@]} -ge $MAX_PARALLEL ]]; do
# Wait for any job to finish
local new_pids=()
local new_evals=()
for i in "${!running_pids[@]}"; do
if kill -0 "${running_pids[$i]}" 2>/dev/null; then
new_pids+=("${running_pids[$i]}")
new_evals+=("${running_evals[$i]}")
else
((completed++))
echo -e "${CYAN}[$completed/$total_evals completed]${NC}"
fi
done
running_pids=("${new_pids[@]}")
running_evals=("${new_evals[@]}")
if [[ ${#running_pids[@]} -ge $MAX_PARALLEL ]]; then
sleep 1
fi
done
# Start evaluation in background with timeout
run_with_timeout "$EVAL_TIMEOUT" run_evaluation "$eval_id" "$eval_name" "$eval_query" "$verify_patterns" "$schema_addition" "$negative_patterns" &
running_pids+=($!)
running_evals+=("$eval_id")
done
# Wait for all remaining jobs
echo "Waiting for remaining evaluations to complete..."
for pid in "${running_pids[@]}"; do
wait "$pid" 2>/dev/null || true
done
# Collect results into arrays for grouped reporting
local passed=0 failed=0 one_shot=0 total_run=0
local -a success_oneshot=()
local -a success_retry=() # "eval_id|attempts|timing"
local -a failure_list=() # "eval_id|reason|details"
for idx in "${evals_to_run[@]}"; do
local eval_id=$(jq -r ".[$idx].id" "$EVAL_FILE")
local eval_name=$(jq -r ".[$idx].name" "$EVAL_FILE")
local result_file="$OUTPUT_DIR/$eval_id$suffix$backend_suffix.result"
((total_run++))
if [[ -f "$result_file" ]]; then
local result=$(cat "$result_file")
local status=$(echo "$result" | cut -d'|' -f1)
local attempt=$(echo "$result" | cut -d'|' -f3)
local timing=$(echo "$result" | rev | cut -d'|' -f1 | rev)
if [[ "$status" == "PASS" ]]; then
((passed++))
if [[ "$attempt" == "1" ]]; then
((one_shot++))
success_oneshot+=("$eval_id ($eval_name)")
else
# Collect retry error details
local retry_errors=$(echo "$result" | cut -d'|' -f4)
success_retry+=("$eval_id ($eval_name)|$attempt|$timing|$retry_errors")
fi
else
((failed++))
local fail_reason=$(echo "$result" | cut -d'|' -f4)
local error_details=""
local errors_file="$OUTPUT_DIR/$eval_id$suffix$backend_suffix-errors.txt"
if [[ -f "$errors_file" ]] && [[ -s "$errors_file" ]]; then
error_details=$(cat "$errors_file")
fi
failure_list+=("$eval_id ($eval_name)|$fail_reason|$error_details|$timing")
fi
else
((failed++))
failure_list+=("$eval_id ($eval_name)|timeout|Exceeded ${EVAL_TIMEOUT}s limit|")
fi
done
# Print report
echo ""
echo "============================================================"
echo "REPORT"
echo "============================================================"
[[ $USE_SKILL -eq 1 ]] && echo -e "Mode: ${GREEN}WITH SKILL${NC}" || echo -e "Mode: ${BLUE}NO SKILL${NC}"
echo -e "Backend: ${CYAN}$BACKEND${NC}"
echo ""
# --- Successes ---
echo -e "${GREEN}PASSED: $passed / $total_run${NC}"
echo ""
if [[ ${#success_oneshot[@]} -gt 0 ]]; then
echo -e " ${GREEN}One-shot ($one_shot):${NC}"
for entry in "${success_oneshot[@]}"; do
echo -e " ${GREEN}✓${NC} $entry"
done
echo ""
fi
if [[ ${#success_retry[@]} -gt 0 ]]; then
echo -e " ${YELLOW}Passed with retries ($(( passed - one_shot ))):${NC}"
for entry in "${success_retry[@]}"; do
local name=$(echo "$entry" | cut -d'|' -f1)
local attempts=$(echo "$entry" | cut -d'|' -f2)
local timing=$(echo "$entry" | cut -d'|' -f3)
local retry_errors=$(echo "$entry" | cut -d'|' -f4)
echo -e " ${YELLOW}✓${NC} $name — ${YELLOW}$attempts attempts${NC} [$timing]"
if [[ -n "$retry_errors" ]]; then
# Retry errors are a semicolon-separated list (one entry per failed build attempt)
echo -e " ${CYAN}retry errors: $retry_errors${NC}"
fi
done
echo ""
fi
# --- Failures ---
if [[ ${#failure_list[@]} -gt 0 ]]; then
echo -e "${RED}FAILED: $failed / $total_run${NC}"
echo ""
for entry in "${failure_list[@]}"; do
local name=$(echo "$entry" | cut -d'|' -f1)
local reason=$(echo "$entry" | cut -d'|' -f2)
local details=$(echo "$entry" | cut -d'|' -f3)
local timing=$(echo "$entry" | cut -d'|' -f4)
echo -e " ${RED}✗${NC} $name"
echo -e " Reason: ${RED}$reason${NC}"
if [[ -n "$timing" ]]; then
echo -e " Timing: $timing"
fi
if [[ -n "$details" ]]; then
while IFS= read -r line; do
echo -e " ${CYAN}$line${NC}"
done <<< "$details"
fi
echo ""
done
fi
echo "============================================================"
echo -e "Total: $total_run | ${GREEN}Passed: $passed${NC} | ${GREEN}One-shot: $one_shot${NC} | ${RED}Failed: $failed${NC}"
if [[ $passed -gt 0 ]]; then
echo -e "One-shot rate: ${GREEN}$(( one_shot * 100 / passed ))%${NC} of passes | ${GREEN}$(( one_shot * 100 / total_run ))%${NC} of total"
fi
echo ""
echo "Outputs: $OUTPUT_DIR"
# List preserved workspaces
local workspaces=$(ls -d "$OUTPUT_DIR"/*-workspace 2>/dev/null | wc -l | tr -d ' ')
if [[ "$workspaces" -gt 0 ]]; then
echo -e "${CYAN}Preserved workspaces (failed or retried):${NC}"
for ws in "$OUTPUT_DIR"/*-workspace; do
[[ -d "$ws" ]] && echo " $(basename "$ws")"
done
fi
[[ $failed -gt 0 ]] && return 1 || return 0
}
run_compare() {
echo "============================================================"
echo "COMPARISON MODE: skill vs no-skill"
echo "============================================================"
echo ""
local backend_suffix=$([[ "$BACKEND" == "crush" ]] && echo "-crush" || echo "")
# Run with skill
echo -e "${GREEN}>>> Running WITH skill...${NC}"
echo ""
USE_SKILL=1
export USE_SKILL
main
local skill_exit=$?
echo ""
echo ""
# Run without skill
echo -e "${BLUE}>>> Running WITHOUT skill...${NC}"
echo ""
USE_SKILL=0
export USE_SKILL
main
local noskill_exit=$?
# Build comparison from result files
echo ""
echo ""
echo "============================================================"
echo "COMPARISON REPORT"
echo "============================================================"
echo -e "Backend: ${CYAN}$BACKEND${NC}"
echo ""
local eval_count=$(jq length "$EVAL_FILE")
# Header
printf " %-40s %-18s %-18s\n" "Evaluation" "With Skill" "Without Skill"
printf " %-40s %-18s %-18s\n" "$(printf '%0.s─' {1..40})" "$(printf '%0.s─' {1..18})" "$(printf '%0.s─' {1..18})"
local skill_passed=0 skill_oneshot=0 skill_total=0
local noskill_passed=0 noskill_oneshot=0 noskill_total=0
for i in $(seq 0 $((eval_count - 1))); do
local eval_id=$(jq -r ".[$i].id" "$EVAL_FILE")
local eval_name=$(jq -r ".[$i].name" "$EVAL_FILE")
# Filter check
if [[ -n "$FILTER" && "$eval_id" != *"$FILTER"* && "$eval_name" != *"$FILTER"* ]]; then
continue
fi
local skill_result_file="$OUTPUT_DIR/$eval_id$backend_suffix.result"
local noskill_result_file="$OUTPUT_DIR/$eval_id-noskill$backend_suffix.result"
local skill_label noskill_label
# Parse skill result
((skill_total++))
if [[ -f "$skill_result_file" ]]; then
local s_result=$(cat "$skill_result_file")
local s_status=$(echo "$s_result" | cut -d'|' -f1)
local s_attempt=$(echo "$s_result" | cut -d'|' -f3)
if [[ "$s_status" == "PASS" ]]; then
((skill_passed++))
if [[ "$s_attempt" == "1" ]]; then
((skill_oneshot++))
skill_label="${GREEN}one-shot${NC}"
else
skill_label="${YELLOW}attempt $s_attempt${NC}"
fi
else
local s_reason=$(echo "$s_result" | cut -d'|' -f4)
skill_label="${RED}FAIL ($s_reason)${NC}"
fi
else
skill_label="${RED}TIMEOUT${NC}"
fi
# Parse no-skill result
((noskill_total++))
if [[ -f "$noskill_result_file" ]]; then
local n_result=$(cat "$noskill_result_file")
local n_status=$(echo "$n_result" | cut -d'|' -f1)
local n_attempt=$(echo "$n_result" | cut -d'|' -f3)
if [[ "$n_status" == "PASS" ]]; then
((noskill_passed++))
if [[ "$n_attempt" == "1" ]]; then
((noskill_oneshot++))
noskill_label="${GREEN}one-shot${NC}"
else
noskill_label="${YELLOW}attempt $n_attempt${NC}"
fi
else
local n_reason=$(echo "$n_result" | cut -d'|' -f4)
noskill_label="${RED}FAIL ($n_reason)${NC}"
fi
else
noskill_label="${RED}TIMEOUT${NC}"
fi
# Fixed-width id column via printf; status labels are tab-separated because
# ANSI color codes break printf's width padding
local display_id="$eval_id"
[[ ${#display_id} -gt 40 ]] && display_id="${display_id:0:37}..."
printf " %-40s " "$display_id"
# Print skill result, then a tab, then the no-skill result
echo -ne "$skill_label"
printf "\t"
echo -e "$noskill_label"
done
echo ""
printf " %-40s %-18s %-18s\n" "$(printf '%0.s─' {1..40})" "$(printf '%0.s─' {1..18})" "$(printf '%0.s─' {1..18})"
echo -e " Passed: ${GREEN}$skill_passed / $skill_total${NC} ${BLUE}$noskill_passed / $noskill_total${NC}"
echo -e " One-shot: ${GREEN}$skill_oneshot / $skill_total${NC} ${BLUE}$noskill_oneshot / $noskill_total${NC}"
if [[ $skill_total -gt 0 ]]; then
local skill_pct=$(( skill_oneshot * 100 / skill_total ))
local noskill_pct=$(( noskill_oneshot * 100 / noskill_total ))
local delta=$(( skill_pct - noskill_pct ))
echo ""
echo -e " One-shot rate: ${GREEN}${skill_pct}%${NC} ${BLUE}${noskill_pct}%${NC}"
if [[ $delta -gt 0 ]]; then
echo -e " Skill improvement: ${GREEN}+${delta}pp${NC}"
elif [[ $delta -lt 0 ]]; then
echo -e " Skill improvement: ${RED}${delta}pp${NC}"
else
echo -e " Skill improvement: 0pp (no difference)"
fi
fi
echo ""
echo "Outputs: $OUTPUT_DIR"
[[ $skill_exit -ne 0 || $noskill_exit -ne 0 ]] && exit 1
exit 0
}
if [[ $COMPARE -eq 1 ]]; then
run_compare
else
main
exit $?
fi