Skip to content

Instantly share code, notes, and snippets.

@realark
Created February 11, 2026 23:50
Show Gist options
  • Select an option

  • Save realark/8db2a76218ff4f35ff67d363c7d602f9 to your computer and use it in GitHub Desktop.

Select an option

Save realark/8db2a76218ff4f35ff67d363c7d602f9 to your computer and use it in GitHub Desktop.
trace-claude-code for evals
import { Eval } from "braintrust";
import { spawn } from "child_process";
/**
 * Debug switch: true only when the DEBUG environment variable is exactly "1".
 * When enabled, the agent's stdout/stderr are echoed to this process's console.
 */
const DEBUG = process.env["DEBUG"] === "1";

/**
 * Optional plugin directory taken from the CLAUDE_PLUGIN_DIR environment
 * variable. When set (and non-empty) it is forwarded to the `claude` CLI via
 * `--plugin-dir`.
 */
const CLAUDE_PLUGIN_DIR: string | undefined = process.env["CLAUDE_PLUGIN_DIR"];
/** Shape of a single eval datum's input: the prompt text sent to the agent. */
type TaskInput = {
  prompt: string;
};
/**
 * Identifiers of the span that the agent run should be attached to.
 * They are forwarded to the agent process as tracing environment variables.
 */
type SpanContext = {
  /** Id of the span that becomes the parent of the agent's trace. */
  parentSpanId: string;
  /** Id of the root span of the enclosing trace. */
  rootSpanId: string;
  /** Id of the owning experiment, when the parent is an experiment. */
  experimentId?: string;
};
/** Everything captured from one run of the agent process. */
type TaskOutput = {
  /** Full text the process wrote to stdout. */
  stdout: string;
  /** Full text the process wrote to stderr. */
  stderr: string;
  /** Process exit code; may be null when no exit code was reported. */
  exitCode: null | number;
  /** Values parsed from the stdout lines that contained valid JSON. */
  messages: Array<unknown>;
};
/**
 * Parses newline-delimited output into JSON values.
 * Blank lines and lines that are not valid JSON are skipped.
 */
function parseJsonLines(raw: string): unknown[] {
  const messages: unknown[] = [];
  for (const line of raw.split("\n")) {
    if (!line.trim()) continue;
    try {
      messages.push(JSON.parse(line));
    } catch {
      // Not a JSON line (e.g. plain log output mixed into stdout) — skip it.
    }
  }
  return messages;
}

/**
 * Runs Claude Code (the `claude` CLI) with the given prompt.
 *
 * The prompt is written to the child's stdin. Tracing settings (project name,
 * trace flag and, when provided, the span/experiment ids) are passed to the
 * CLI as a JSON `--settings` argument. `--plugin-dir` is appended when
 * CLAUDE_PLUGIN_DIR is set.
 *
 * @param opts.prompt      Text sent to the CLI on stdin.
 * @param opts.spanContext Optional span ids to forward as tracing env vars.
 * @param opts.timeoutMs   Time allowed before the child is sent SIGTERM
 *                         (default 60 000 ms). If it is still running a few
 *                         seconds later it is sent SIGKILL.
 * @returns Captured stdout/stderr, the exit code, and every stdout line that
 *          parsed as JSON. The promise never rejects: a spawn failure resolves
 *          with `exitCode: 1` and the error message appended to `stderr`.
 */
async function runClaudeCode(opts: {
  prompt: string;
  spanContext?: SpanContext;
  timeoutMs?: number;
}): Promise<TaskOutput> {
  const { prompt, spanContext, timeoutMs = 60_000 } = opts;
  // How long to wait after SIGTERM before escalating to SIGKILL.
  const killGraceMs = 5_000;

  return new Promise((resolve) => {
    // Build settings with tracing env vars
    const settings = JSON.stringify({
      env: {
        BRAINTRUST_CC_PROJECT: "hello-claude-code",
        TRACE_TO_BRAINTRUST: "true",
        ...(spanContext && {
          CC_PARENT_SPAN_ID: spanContext.parentSpanId,
          CC_ROOT_SPAN_ID: spanContext.rootSpanId,
          ...(spanContext.experimentId && {
            CC_EXPERIMENT_ID: spanContext.experimentId,
          }),
        }),
      },
    });

    const args = [
      "-p",
      "--output-format", "stream-json",
      "--verbose",
      "--settings", settings,
    ];
    // Add --plugin-dir if CLAUDE_PLUGIN_DIR is set
    if (CLAUDE_PLUGIN_DIR) {
      args.push("--plugin-dir", CLAUDE_PLUGIN_DIR);
    }

    const proc = spawn("claude", args, {
      stdio: ["pipe", "pipe", "pipe"],
      env: process.env,
    });

    let stdout = "";
    let stderr = "";

    // Decode at the stream level so a multi-byte UTF-8 character that is
    // split across two chunks is not corrupted by per-chunk decoding.
    proc.stdout.setEncoding("utf8");
    proc.stderr.setEncoding("utf8");
    proc.stdout.on("data", (chunk: string) => {
      stdout += chunk;
      if (DEBUG) process.stdout.write(chunk);
    });
    proc.stderr.on("data", (chunk: string) => {
      stderr += chunk;
      if (DEBUG) process.stderr.write(chunk);
    });

    // Without a listener, an EPIPE on stdin (spawn failed, or the child exited
    // before reading the prompt) is an unhandled 'error' event and would crash
    // the whole eval. Record it instead; the exit code still reports failure.
    proc.stdin.on("error", (err: Error) => {
      stderr += `\n[runClaudeCode] stdin error: ${err.message}\n`;
    });
    proc.stdin.end(prompt);

    let forceKillTimer: ReturnType<typeof setTimeout> | undefined;
    const timeout = setTimeout(() => {
      stderr += `\n[runClaudeCode] timed out after ${timeoutMs}ms; sending SIGTERM\n`;
      proc.kill("SIGTERM");
      // Escalate if the child ignores SIGTERM, otherwise 'close' never fires
      // and the returned promise would stay pending forever.
      forceKillTimer = setTimeout(() => {
        proc.kill("SIGKILL");
      }, killGraceMs);
    }, timeoutMs);

    // Shared settle path: stop both timers, then resolve. Only the first call
    // has any effect on the promise.
    const finish = (result: TaskOutput): void => {
      clearTimeout(timeout);
      if (forceKillTimer !== undefined) clearTimeout(forceKillTimer);
      resolve(result);
    };

    proc.on("close", (exitCode) => {
      finish({ stdout, stderr, exitCode, messages: parseJsonLines(stdout) });
    });
    proc.on("error", (err) => {
      finish({
        stdout,
        stderr: stderr + err.message,
        exitCode: 1,
        // Keep whatever was captured before the failure.
        messages: parseJsonLines(stdout),
      });
    });
  });
}
/**
 * Pass/fail scorer for a Claude Code run: scores 1 when the process exited
 * with code 0 and 0 for any other exit code (including null). The raw exit
 * code is echoed back in the score metadata.
 */
function completionScorer({ output }: { output: TaskOutput }) {
  const { exitCode } = output;
  const succeeded = exitCode === 0;
  return {
    name: "completion",
    score: succeeded ? 1 : 0,
    metadata: { exitCode },
  };
}
// Registers the "hello-claude-code" eval: one greeting prompt, run through the
// claude CLI and scored on whether the process exited cleanly.
Eval("hello-claude-code", {
  data: () => [{ input: { prompt: "Hi, what's your name?" } }],
  task: async (input, hooks): Promise<TaskOutput> => {
    // Read the ids of the current eval span so the agent's trace can be
    // attached beneath it.
    const { span } = hooks;
    const { spanId: parentSpanId, rootSpanId } = span;

    // getParentInfo is called optionally because it may be absent on the span.
    const parent = span.getParentInfo?.();

    return runClaudeCode({
      prompt: input.prompt,
      spanContext: {
        parentSpanId,
        rootSpanId,
        // objectType 1 = experiment in the SpanObjectTypeV3 enum; only then
        // is the parent's object id resolved and passed on.
        experimentId:
          parent?.objectType === 1 ? await parent.objectId.get() : undefined,
      },
    });
  },
  scores: [completionScorer],
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment