Created
February 11, 2026 23:50
-
-
Save realark/8db2a76218ff4f35ff67d363c7d602f9 to your computer and use it in GitHub Desktop.
trace-claude-code for evals
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { Eval } from "braintrust"; | |
| import { spawn } from "child_process"; | |
/** When true (env `DEBUG=1`), the agent's stdout/stderr are streamed to this process's console. */
const DEBUG = process.env.DEBUG === "1";

/** Optional plugin directory; when set, it is passed to `claude` via `--plugin-dir`. */
const CLAUDE_PLUGIN_DIR = process.env.CLAUDE_PLUGIN_DIR;
/** Shape of one eval case's `input`; matches the `{ prompt }` objects supplied in the `Eval` data. */
interface TaskInput {
  /** Prompt text to send to the agent. */
  prompt: string;
}
/**
 * Identifiers of the eval span under which the CLI run is traced.
 * `runClaudeCode` forwards them to the CLI as `CC_*` entries in the settings env.
 */
interface SpanContext {
  /** Forwarded as `CC_PARENT_SPAN_ID`. */
  parentSpanId: string;
  /** Forwarded as `CC_ROOT_SPAN_ID`. */
  rootSpanId: string;
  /** Forwarded as `CC_EXPERIMENT_ID`; omitted from the settings when unset or empty. */
  experimentId?: string;
}
/** Result of one `claude` CLI invocation, as returned by `runClaudeCode`. */
interface TaskOutput {
  /** Everything the child wrote to stdout. */
  stdout: string;
  /** Everything the child wrote to stderr (plus the error message when spawning fails). */
  stderr: string;
  /** Exit code reported by the child's `close` event, or 1 when the process emitted `error`. */
  exitCode: number | null;
  /** Each stdout line that parsed as JSON, in output order. */
  messages: unknown[];
}
/**
 * Parses newline-delimited JSON, keeping every line that is valid JSON.
 * Blank lines and lines that fail to parse are skipped.
 */
function parseJsonLines(text: string): unknown[] {
  const parsed: unknown[] = [];
  for (const line of text.split("\n")) {
    if (!line.trim()) continue;
    try {
      parsed.push(JSON.parse(line));
    } catch {
      // Skip non-JSON lines
    }
  }
  return parsed;
}

/**
 * Runs Claude Code (the `claude` CLI) in print mode with the given prompt.
 *
 * The prompt is written to the child's stdin, stdout and stderr are captured
 * in full, and every stdout line that parses as JSON is collected into
 * `messages` (the CLI is started with `--output-format stream-json`).
 * Braintrust tracing variables are handed to the CLI through `--settings`.
 *
 * The returned promise never rejects: if the process emits `error` (for
 * example because it could not be spawned) the result has `exitCode: 1`,
 * the error message appended to `stderr`, and no messages.
 *
 * @param opts.prompt - Text sent to the CLI on stdin.
 * @param opts.spanContext - Optional span identifiers forwarded as `CC_*` settings env vars.
 * @param opts.timeoutMs - Milliseconds before the child is sent SIGTERM (default 60000).
 * @returns The captured output, the exit code, and the parsed JSON messages.
 */
async function runClaudeCode(opts: {
  prompt: string;
  spanContext?: SpanContext;
  timeoutMs?: number;
}): Promise<TaskOutput> {
  const { prompt, spanContext, timeoutMs = 60_000 } = opts;

  return new Promise((resolve) => {
    // Build settings with tracing env vars
    const settings = JSON.stringify({
      env: {
        BRAINTRUST_CC_PROJECT: "hello-claude-code",
        TRACE_TO_BRAINTRUST: "true",
        ...(spanContext
          ? {
              CC_PARENT_SPAN_ID: spanContext.parentSpanId,
              CC_ROOT_SPAN_ID: spanContext.rootSpanId,
              ...(spanContext.experimentId
                ? { CC_EXPERIMENT_ID: spanContext.experimentId }
                : {}),
            }
          : {}),
      },
    });

    const args = [
      "-p",
      "--output-format", "stream-json",
      "--verbose",
      "--settings", settings,
    ];
    // Add --plugin-dir if CLAUDE_PLUGIN_DIR is set
    if (CLAUDE_PLUGIN_DIR) {
      args.push("--plugin-dir", CLAUDE_PLUGIN_DIR);
    }

    const proc = spawn("claude", args, {
      stdio: ["pipe", "pipe", "pipe"],
      env: process.env,
    });

    let stdout = "";
    let stderr = "";

    // Decode at the stream level so a multi-byte UTF-8 character that is
    // split across two chunks is reassembled instead of being corrupted.
    proc.stdout.setEncoding("utf8");
    proc.stderr.setEncoding("utf8");
    proc.stdout.on("data", (chunk: string) => {
      stdout += chunk;
      if (DEBUG) process.stdout.write(chunk);
    });
    proc.stderr.on("data", (chunk: string) => {
      stderr += chunk;
      if (DEBUG) process.stderr.write(chunk);
    });

    const timeout = setTimeout(() => {
      proc.kill("SIGTERM");
    }, timeoutMs);

    // "error" and "close" can both fire for one process; the first one wins.
    let settled = false;
    const finish = (result: TaskOutput): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timeout);
      resolve(result);
    };

    proc.on("close", (exitCode) => {
      finish({ stdout, stderr, exitCode, messages: parseJsonLines(stdout) });
    });
    proc.on("error", (err) => {
      finish({ stdout, stderr: stderr + err.message, exitCode: 1, messages: [] });
    });

    // If the child goes away before it has read the prompt, the write fails
    // with an "error" event on stdin. Without a listener that event would be
    // thrown as an uncaught exception and take down the whole eval run; the
    // failure itself is already reported through the handlers above.
    proc.stdin.on("error", (err: Error) => {
      if (DEBUG) process.stderr.write(`claude stdin error: ${err.message}\n`);
    });
    proc.stdin.end(prompt);
  });
}
/**
 * Basic scorer that checks whether Claude Code completed successfully.
 *
 * @param args.output - Result of the CLI run.
 * @returns A "completion" score of 1 when the exit code is exactly 0 and 0
 *   otherwise (including a `null` exit code), with the exit code as metadata.
 */
function completionScorer({ output }: { output: TaskOutput }) {
  const { exitCode } = output;
  return {
    name: "completion",
    score: exitCode === 0 ? 1 : 0,
    metadata: {
      exitCode,
    },
  };
}
// Registers the "hello-claude-code" eval: each case's prompt is run through
// the claude CLI, traced under the eval's span, and scored on whether the
// CLI exited successfully.
Eval("hello-claude-code", {
  data: () => [
    {
      input: { prompt: "Hi, what's your name?" },
    },
  ],
  task: async (input, hooks): Promise<TaskOutput> => {
    const { span } = hooks;

    // Capture span context for tracing
    const parentSpanId = span.spanId;
    const rootSpanId = span.rootSpanId;
    const parentInfo = span.getParentInfo?.();

    // objectType 1 = experiment in the SpanObjectTypeV3 enum
    const EXPERIMENT_OBJECT_TYPE = 1;
    const experimentId =
      parentInfo?.objectType === EXPERIMENT_OBJECT_TYPE
        ? await parentInfo.objectId.get()
        : undefined;

    return runClaudeCode({
      prompt: input.prompt,
      spanContext: {
        parentSpanId,
        rootSpanId,
        experimentId,
      },
    });
  },
  scores: [completionScorer],
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment