realark · February 11, 2026 23:50
diff --git a/hello-claude-code.eval.ts b/hello-claude-code.eval.ts
 import { Eval } from "braintrust";
 import { spawn } from "child_process";

 // Set DEBUG=1 to also echo the agent's stdout/stderr chunks to this process's
 // console as they arrive. The output is captured in the task result either way.
 const DEBUG = process.env.DEBUG === "1";

 // Optional: set CLAUDE_PLUGIN_DIR to pass --plugin-dir to claude.
 // When unset (or empty), no --plugin-dir argument is added.
 const CLAUDE_PLUGIN_DIR = process.env.CLAUDE_PLUGIN_DIR;

 /** Shape of one eval case's input (matches the `input` objects in the Eval data). */
 interface TaskInput {
  /** Prompt text sent to the claude CLI. */
  prompt: string;
 }

 /** Span identifiers forwarded to the claude CLI through the `--settings` env block. */
 interface SpanContext {
  /** Forwarded as CC_PARENT_SPAN_ID. */
  parentSpanId: string;
  /** Forwarded as CC_ROOT_SPAN_ID. */
  rootSpanId: string;
  /** Forwarded as CC_EXPERIMENT_ID; omitted from the settings when unset or empty. */
  experimentId?: string;
 }

 /** Result of a single claude CLI run, as produced by runClaudeCode. */
 interface TaskOutput {
  /** Everything the child wrote to stdout, concatenated. */
  stdout: string;
  /** Everything the child wrote to stderr (plus the spawn error message, if any). */
  stderr: string;
  /** Exit code reported by the child's `close` event; 1 when spawning itself failed. */
  exitCode: number | null;
  /** Each non-blank stdout line that parsed as JSON, in order. */
  messages: unknown[];
 }

 /**
  * Runs Claude Code (claude CLI) with the given prompt.
  *
  * The child is started as `claude -p --output-format stream-json --verbose`
  * with a `--settings` JSON carrying the Braintrust tracing env vars (and the
  * span context, when given). The prompt is written to the child's stdin.
  * Every non-blank stdout line that parses as JSON is collected in `messages`.
  *
  * The returned promise never rejects: a spawn failure resolves with
  * `exitCode: 1` and the error message appended to `stderr`.
  *
  * @param opts.prompt - Text written to the child's stdin.
  * @param opts.spanContext - Optional span ids forwarded as CC_* settings env vars.
  * @param opts.timeoutMs - Milliseconds before the child is sent SIGTERM (default 60_000).
  * @returns Captured stdout/stderr, the exit code from the `close` event, and parsed messages.
  */
 async function runClaudeCode(opts: {
   prompt: string;
   spanContext?: SpanContext;
   timeoutMs?: number;
 }): Promise<TaskOutput> {
   const { prompt, spanContext, timeoutMs = 60_000 } = opts;
   // Grace period between SIGTERM and SIGKILL for a child that ignores the timeout signal.
   const killGraceMs = 5_000;

   return new Promise((resolve) => {
     // Build settings with tracing env vars
     const settings = JSON.stringify({
       env: {
         BRAINTRUST_CC_PROJECT: "hello-claude-code",
         TRACE_TO_BRAINTRUST: "true",
         ...(spanContext && {
           CC_PARENT_SPAN_ID: spanContext.parentSpanId,
           CC_ROOT_SPAN_ID: spanContext.rootSpanId,
           ...(spanContext.experimentId && {
             CC_EXPERIMENT_ID: spanContext.experimentId,
           }),
         }),
       },
     });

     const args = [
       "-p",
       "--output-format", "stream-json",
       "--verbose",
       "--settings", settings,
     ];

     // Add --plugin-dir if CLAUDE_PLUGIN_DIR is set
     if (CLAUDE_PLUGIN_DIR) {
       args.push("--plugin-dir", CLAUDE_PLUGIN_DIR);
     }

     const proc = spawn("claude", args, {
       stdio: ["pipe", "pipe", "pipe"],
       env: process.env,
     });

     let stdout = "";
     let stderr = "";

     // Decode at the stream level so a multi-byte UTF-8 character that is split
     // across two chunks is reassembled instead of being turned into U+FFFD.
     proc.stdout.setEncoding("utf8");
     proc.stderr.setEncoding("utf8");

     proc.stdout.on("data", (chunk: string) => {
       stdout += chunk;
       if (DEBUG) process.stdout.write(chunk);
     });

     proc.stderr.on("data", (chunk: string) => {
       stderr += chunk;
       if (DEBUG) process.stderr.write(chunk);
     });

     // Without a listener, a stdin failure (e.g. EPIPE when the child exits
     // before reading, or a failed spawn) is an unhandled 'error' event that
     // would take down the whole eval process. Record it and let the
     // 'close'/'error' handlers below settle the promise.
     proc.stdin.on("error", (err: Error) => {
       stderr += `stdin error: ${err.message}\n`;
     });
     proc.stdin.write(prompt);
     proc.stdin.end();

     let killTimer: ReturnType<typeof setTimeout> | undefined;
     const timeout = setTimeout(() => {
       proc.kill("SIGTERM");
       // Escalate if the child ignores SIGTERM, otherwise 'close' never fires
       // and this promise would stay pending forever.
       killTimer = setTimeout(() => {
         proc.kill("SIGKILL");
       }, killGraceMs);
     }, timeoutMs);

     const clearTimers = (): void => {
       clearTimeout(timeout);
       clearTimeout(killTimer);
     };

     proc.on("close", (exitCode) => {
       clearTimers();
       const messages: unknown[] = [];
       for (const line of stdout.split("\n")) {
         if (line.trim()) {
           try {
             messages.push(JSON.parse(line));
           } catch {
             // Skip non-JSON lines
           }
         }
       }
       resolve({ stdout, stderr, exitCode, messages });
     });

     // Emitted when the process could not be spawned (or killed); report it as
     // a failed run rather than rejecting.
     proc.on("error", (err) => {
       clearTimers();
       resolve({ stdout, stderr: stderr + err.message, exitCode: 1, messages: [] });
     });
   });
 }

 /**
  * Pass/fail scorer for a Claude Code run.
  *
  * Yields a score of 1 only when the process exit code is exactly 0; any other
  * value (non-zero or null) scores 0. The raw exit code is echoed in metadata.
  */
 function completionScorer({ output }: { output: TaskOutput }) {
   const { exitCode } = output;
   const exitedCleanly = exitCode === 0;

   return {
     name: "completion",
     score: exitedCleanly ? 1 : 0,
     metadata: { exitCode },
   };
 }

 // Registers the "hello-claude-code" eval: a single greeting prompt, run through
 // the claude CLI and scored on whether the process exited successfully.
 Eval("hello-claude-code", {
   data: () => [{ input: { prompt: "Hi, what's your name?" } }],
   task: async (input, hooks): Promise<TaskOutput> => {
     const { span } = hooks;

     // Snapshot the span ids so the CLI's trace can be attached under this span.
     const ids = { parentSpanId: span.spanId, rootSpanId: span.rootSpanId };

     // Only an experiment parent contributes an experiment id
     // (objectType 1 = experiment in the SpanObjectTypeV3 enum).
     const parent = span.getParentInfo?.();
     let experimentId: string | undefined;
     if (parent?.objectType === 1) {
       experimentId = await parent.objectId.get();
     }

     return runClaudeCode({
       prompt: input.prompt,
       spanContext: { ...ids, experimentId },
     });
   },
   scores: [completionScorer],
 });
	import { Eval } from "braintrust";
	import { spawn } from "child_process";

	// Set DEBUG=1 to also echo the agent's stdout/stderr chunks to this process's
	// console as they arrive. The output is captured in the task result either way.
	const DEBUG = process.env.DEBUG === "1";

	// Optional: set CLAUDE_PLUGIN_DIR to pass --plugin-dir to claude.
	// When unset (or empty), no --plugin-dir argument is added.
	const CLAUDE_PLUGIN_DIR = process.env.CLAUDE_PLUGIN_DIR;

	/** Shape of one eval case's input (matches the `input` objects in the Eval data). */
	interface TaskInput {
	  /** Prompt text sent to the claude CLI. */
	  prompt: string;
	}

	/** Span identifiers forwarded to the claude CLI through the `--settings` env block. */
	interface SpanContext {
	  /** Forwarded as CC_PARENT_SPAN_ID. */
	  parentSpanId: string;
	  /** Forwarded as CC_ROOT_SPAN_ID. */
	  rootSpanId: string;
	  /** Forwarded as CC_EXPERIMENT_ID; omitted from the settings when unset or empty. */
	  experimentId?: string;
	}

	/** Result of a single claude CLI run, as produced by runClaudeCode. */
	interface TaskOutput {
	  /** Everything the child wrote to stdout, concatenated. */
	  stdout: string;
	  /** Everything the child wrote to stderr (plus the spawn error message, if any). */
	  stderr: string;
	  /** Exit code reported by the child's `close` event; 1 when spawning itself failed. */
	  exitCode: number | null;
	  /** Each non-blank stdout line that parsed as JSON, in order. */
	  messages: unknown[];
	}

	/**
	 * Runs Claude Code (claude CLI) with the given prompt.
	 *
	 * The child is started as `claude -p --output-format stream-json --verbose`
	 * with a `--settings` JSON carrying the Braintrust tracing env vars (and the
	 * span context, when given). The prompt is written to the child's stdin.
	 * Every non-blank stdout line that parses as JSON is collected in `messages`.
	 *
	 * The returned promise never rejects: a spawn failure resolves with
	 * `exitCode: 1` and the error message appended to `stderr`.
	 *
	 * @param opts.prompt - Text written to the child's stdin.
	 * @param opts.spanContext - Optional span ids forwarded as CC_* settings env vars.
	 * @param opts.timeoutMs - Milliseconds before the child is sent SIGTERM (default 60_000).
	 * @returns Captured stdout/stderr, the exit code from the `close` event, and parsed messages.
	 */
	async function runClaudeCode(opts: {
	  prompt: string;
	  spanContext?: SpanContext;
	  timeoutMs?: number;
	}): Promise<TaskOutput> {
	  const { prompt, spanContext, timeoutMs = 60_000 } = opts;
	  // Grace period between SIGTERM and SIGKILL for a child that ignores the timeout signal.
	  const killGraceMs = 5_000;

	  return new Promise((resolve) => {
	    // Build settings with tracing env vars
	    const settings = JSON.stringify({
	      env: {
	        BRAINTRUST_CC_PROJECT: "hello-claude-code",
	        TRACE_TO_BRAINTRUST: "true",
	        ...(spanContext && {
	          CC_PARENT_SPAN_ID: spanContext.parentSpanId,
	          CC_ROOT_SPAN_ID: spanContext.rootSpanId,
	          ...(spanContext.experimentId && {
	            CC_EXPERIMENT_ID: spanContext.experimentId,
	          }),
	        }),
	      },
	    });

	    const args = [
	      "-p",
	      "--output-format", "stream-json",
	      "--verbose",
	      "--settings", settings,
	    ];

	    // Add --plugin-dir if CLAUDE_PLUGIN_DIR is set
	    if (CLAUDE_PLUGIN_DIR) {
	      args.push("--plugin-dir", CLAUDE_PLUGIN_DIR);
	    }

	    const proc = spawn("claude", args, {
	      stdio: ["pipe", "pipe", "pipe"],
	      env: process.env,
	    });

	    let stdout = "";
	    let stderr = "";

	    // Decode at the stream level so a multi-byte UTF-8 character that is split
	    // across two chunks is reassembled instead of being turned into U+FFFD.
	    proc.stdout.setEncoding("utf8");
	    proc.stderr.setEncoding("utf8");

	    proc.stdout.on("data", (chunk: string) => {
	      stdout += chunk;
	      if (DEBUG) process.stdout.write(chunk);
	    });

	    proc.stderr.on("data", (chunk: string) => {
	      stderr += chunk;
	      if (DEBUG) process.stderr.write(chunk);
	    });

	    // Without a listener, a stdin failure (e.g. EPIPE when the child exits
	    // before reading, or a failed spawn) is an unhandled 'error' event that
	    // would take down the whole eval process. Record it and let the
	    // 'close'/'error' handlers below settle the promise.
	    proc.stdin.on("error", (err: Error) => {
	      stderr += `stdin error: ${err.message}\n`;
	    });
	    proc.stdin.write(prompt);
	    proc.stdin.end();

	    let killTimer: ReturnType<typeof setTimeout> | undefined;
	    const timeout = setTimeout(() => {
	      proc.kill("SIGTERM");
	      // Escalate if the child ignores SIGTERM, otherwise 'close' never fires
	      // and this promise would stay pending forever.
	      killTimer = setTimeout(() => {
	        proc.kill("SIGKILL");
	      }, killGraceMs);
	    }, timeoutMs);

	    const clearTimers = (): void => {
	      clearTimeout(timeout);
	      clearTimeout(killTimer);
	    };

	    proc.on("close", (exitCode) => {
	      clearTimers();
	      const messages: unknown[] = [];
	      for (const line of stdout.split("\n")) {
	        if (line.trim()) {
	          try {
	            messages.push(JSON.parse(line));
	          } catch {
	            // Skip non-JSON lines
	          }
	        }
	      }
	      resolve({ stdout, stderr, exitCode, messages });
	    });

	    // Emitted when the process could not be spawned (or killed); report it as
	    // a failed run rather than rejecting.
	    proc.on("error", (err) => {
	      clearTimers();
	      resolve({ stdout, stderr: stderr + err.message, exitCode: 1, messages: [] });
	    });
	  });
	}

	/**
	 * Pass/fail scorer for a Claude Code run.
	 *
	 * Yields a score of 1 only when the process exit code is exactly 0; any other
	 * value (non-zero or null) scores 0. The raw exit code is echoed in metadata.
	 */
	function completionScorer({ output }: { output: TaskOutput }) {
	  const { exitCode } = output;
	  const exitedCleanly = exitCode === 0;

	  return {
	    name: "completion",
	    score: exitedCleanly ? 1 : 0,
	    metadata: { exitCode },
	  };
	}

	// Registers the "hello-claude-code" eval: a single greeting prompt, run through
	// the claude CLI and scored on whether the process exited successfully.
	Eval("hello-claude-code", {
	  data: () => [{ input: { prompt: "Hi, what's your name?" } }],
	  task: async (input, hooks): Promise<TaskOutput> => {
	    const { span } = hooks;

	    // Snapshot the span ids so the CLI's trace can be attached under this span.
	    const ids = { parentSpanId: span.spanId, rootSpanId: span.rootSpanId };

	    // Only an experiment parent contributes an experiment id
	    // (objectType 1 = experiment in the SpanObjectTypeV3 enum).
	    const parent = span.getParentInfo?.();
	    let experimentId: string | undefined;
	    if (parent?.objectType === 1) {
	      experimentId = await parent.objectId.get();
	    }

	    return runClaudeCode({
	      prompt: input.prompt,
	      spanContext: { ...ids, experimentId },
	    });
	  },
	  scores: [completionScorer],
	});
No results found