pdulapalli · December 14, 2025 22:04
diff --git a/eval_example_part_2.py b/eval_example_part_2.py
 import os
 from typing import TypedDict
 import anthropic
 import json
 import re


 class TestCaseRequired(TypedDict):
    """Keys that every evaluation test case must provide."""

    # The customer message sent to the model as the user turn.
    input: str
    # Labels copied unchanged into the result record for the case.
    tags: list[str]


 class TestCase(TestCaseRequired, total=False):
    """A test case: the required keys plus optional keyword expectations.

    ``total=False`` makes the keys declared here optional.
    """

    # Substrings that must all appear in the response (case-insensitive).
    expected_keywords: list[str]
    # Substrings that must not appear in the response (case-insensitive).
    expected_absent: list[str]


 # Shared API client. The key is looked up in the ANTHROPIC_API_KEY environment
 # variable; os.environ.get yields None when the variable is unset.
 client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

 # System prompt passed with every request made by get_response.
 SYSTEM_PROMPT = "You are a helpful customer support agent for an online store."


 # Evaluation inputs. Each case is sent to the model, run through the keyword
 # checks, and graded; "expected_keywords" / "expected_absent" are optional.
 test_cases: list[TestCase] = [
    {
        "input": "Where is my order #12345?",
        "expected_keywords": ["order", "12345", "tracking"],
        "tags": ["orders"],
    },
    {
        "input": "I want a refund for my broken item",
        "expected_keywords": ["refund", "return", "sorry"],
        "tags": ["refunds"],
    },
    {
        "input": "How long does shipping take to California?",
        "expected_keywords": ["shipping", "days", "delivery"],
        "tags": ["shipping"],
    },
    {
        "input": "What's the capital of France?",
        "expected_keywords": [],
        "expected_absent": ["Paris"],  # Should stay on topic: the trivia answer must not appear
        "tags": ["off-topic"],
    },
 ]


 def get_response(user_input: str) -> str:
    """Send one customer message to the model and return the reply text.

    The request uses the module-level ``client`` and ``SYSTEM_PROMPT`` and a
    single user turn containing ``user_input``.

    Args:
        user_input: The customer's message.

    Returns:
        The ``text`` of the first content block in the API response.
    """
    conversation = [{"role": "user", "content": user_input}]
    reply = client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=256,
        system=SYSTEM_PROMPT,
        messages=conversation,
    )
    first_block = reply.content[0]
    return first_block.text


 def check_keywords(response: str, case: TestCase) -> tuple[bool, str]:
    """Run the substring checks declared on a test case against a response.

    Matching is case-insensitive. Required keywords are checked first, then
    forbidden ones; only the first violation found is reported.

    Args:
        response: The model's reply text.
        case: Test case whose optional "expected_keywords" and
            "expected_absent" lists drive the checks.

    Returns:
        ``(True, "All checks passed")`` when nothing is violated, otherwise
        ``(False, message)`` where the message names the offending keyword.
    """
    haystack = response.lower()

    required = case.get("expected_keywords", [])
    missing = next((kw for kw in required if kw.lower() not in haystack), None)
    # Compare against None (not truthiness) so an empty-string keyword is
    # still reported exactly as a plain loop would report it.
    if missing is not None:
        return False, f"Missing keyword: {missing}"

    forbidden = case.get("expected_absent", [])
    present = next((kw for kw in forbidden if kw.lower() in haystack), None)
    if present is not None:
        return False, f"Should not contain: {present}"

    return True, "All checks passed"


 def grade_response(user_input: str, bot_response: str) -> dict:
    """Ask the model to grade a support reply and return the parsed verdict.

    Builds a rubric prompt (helpfulness, tone, accuracy) around the customer
    message and the bot's reply, sends it through the module-level ``client``,
    and decodes the JSON found in the answer.

    Args:
        user_input: The customer's original message.
        bot_response: The reply being graded.

    Returns:
        The decoded JSON value. The prompt asks for an object with "score"
        (1-5) and "reasoning" keys; the shape is not validated here.

    Raises:
        json.JSONDecodeError: If no parseable JSON is found in the answer.
    """
    grading_prompt = f"""Rate this customer support response on a 1-5 scale.

 Customer message: {user_input}
 Bot response: {bot_response}

 Criteria:
 - Helpfulness: Does it address the customer's actual question?
 - Tone: Is it professional and appropriately empathetic?
 - Accuracy: Is any policy/factual information correct?

 Return JSON with: {{"score": <1-5>, "reasoning": "<brief explanation>"}}"""

    # NOTE(review): an answer cut off at max_tokens would leave the JSON
    # unterminated and fail to parse -- consider a larger budget.
    response = client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=150,
        messages=[{"role": "user", "content": grading_prompt}],
    )

    text = response.content[0].text
    # Prefer the contents of a markdown code fence when one is present.
    json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
    if json_match:
        text = json_match.group(1)
    candidate = text.strip()

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # The answer may carry prose around the object without a fence.
        # Retry on the outermost {...} span; if there is none, surface the
        # original decoding error unchanged.
        start, end = candidate.find("{"), candidate.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(candidate[start : end + 1])


 # Evaluate every test case: fetch the model's reply, run the keyword checks,
 # obtain an LLM grade, and collect one record per case.
 results = []
 for case in test_cases:
    response = get_response(case["input"])
    keyword_passed, keyword_message = check_keywords(response, case)
    grade = grade_response(case["input"], response)

    keyword_check = {
        "passed": keyword_passed,
        "message": keyword_message,
    }
    record = {
        "input": case["input"],
        "response": response,
        "tags": case["tags"],
        "keyword_check": keyword_check,
        "llm_grade": grade,
    }
    results.append(record)

 # Emit the full report as pretty-printed JSON on stdout.
 report = json.dumps(results, indent=2)
 print(report)
diff --git a/eval_example_part_2_output.json b/eval_example_part_2_output.json
 [
  {
    "input": "Where is my order #12345?",
    "response": "I'd be happy to help you track order #12345! However, I don't have access to your order information in our system right now.\n\nTo help you locate your order, I can:\n\n1. **Check your email** - Look for an order confirmation or shipping notification from us that includes tracking information\n2. **Direct you to track it yourself** - Visit our website's order tracking page and enter your order number and email address\n3. **Escalate to our fulfillment team** - I can connect you with someone who has direct access to our order system\n\nCould you also provide:\n- The email address associated with your order\n- Approximate order date\n\nThis will help me assist you more effectively!",
    "tags": [
      "orders"
    ],
    "keyword_check": {
      "passed": true,
      "message": "All checks passed"
    },
    "llm_grade": {
      "score": 2,
      "reasoning": "While the tone is professional and friendly, the response fails to address the customer's direct question about their order location. The bot admits it cannot access order information, then provides workarounds that put the burden back on the customer (check your own email, track it yourself). For a simple 'where is my order' question, the customer expects a direct answer or at least an immediate escalation. The response is overly verbose and creates unnecessary friction by asking for information the system should already have (email address for an order number they possess). This would likely frustrate a customer seeking quick information."
    }
  },
  {
    "input": "I want a refund for my broken item",
    "response": "I'd be happy to help you with a refund for your broken item. To process this for you, I'll need a few details:\n\n1. **Order number** - This helps me locate your purchase\n2. **What item was broken?** - The product name or description\n3. **When did you receive it?** - To verify it's within our return window\n4. **How was it damaged?** - Was it broken upon arrival, or did it break during use?\n\nDo you also have any photos of the damage? While not always required, they can help speed up the refund process.\n\nOnce I have this information, I can guide you through the next steps to get your refund processed quickly.",
    "tags": [
      "refunds"
    ],
    "keyword_check": {
      "passed": false,
      "message": "Missing keyword: sorry"
    },
    "llm_grade": {
      "score": 5,
      "reasoning": "This response excels in all criteria. It directly addresses the refund request with empathy ('I'd be happy to help'), asks all necessary questions in a clear, organized format, explains why each piece of information is needed (building trust), and proactively mentions photos as optional but helpful. The tone is professional yet warm, and it ends with reassurance about quick processing. Without access to actual company policy, no factual errors are apparent. This is a model support response."
    }
  },
  {
    "input": "How long does shipping take to California?",
    "response": "For shipping to California, delivery times typically depend on the shipping method you choose:\n\n- **Standard Shipping**: 5-7 business days\n- **Expedited Shipping**: 3-4 business days\n- **Express Shipping**: 1-2 business days\n\nPlease note that these are estimates from the date your order ships (not the order date). Processing time before shipment is usually 1-2 business days.\n\nIs there anything else you'd like to know about shipping or your order?",
    "tags": [
      "shipping"
    ],
    "keyword_check": {
      "passed": true,
      "message": "All checks passed"
    },
    "llm_grade": {
      "score": 5,
      "reasoning": "This response excellently addresses the customer's question with comprehensive, well-organized information. It provides specific timeframes for multiple shipping options, includes an important clarification about processing time vs. shipping time, and maintains a professional yet friendly tone. The formatting with bullet points makes it easy to scan, and the follow-up question shows good customer service practice by offering additional help. Assuming the information is accurate to company policy, this is an exemplary support response."
    }
  },
  {
    "input": "What's the capital of France?",
    "response": "The capital of France is **Paris**.\n\nHowever, I noticed you've reached out to customer support for an online store. Is there anything I can help you with regarding your order, account, products, or any other store-related questions? I'm here to assist you! \ud83d\ude0a",
    "tags": [
      "off-topic"
    ],
    "keyword_check": {
      "passed": false,
      "message": "Should not contain: Paris"
    },
    "llm_grade": {
      "score": 4,
      "reasoning": "The response is helpful (correctly answers the question), accurate (Paris is correct), and professional with a friendly tone. It appropriately redirects the customer to store-related inquiries. Loses one point because the customer may have been testing the bot or asking legitimately, and the redirect, while appropriate, could feel slightly dismissive. Overall, this is a well-balanced response that handles an off-topic question gracefully."
    }
  }
 ]
	import os
	from typing import TypedDict
	import anthropic
	import json
	import re


	class TestCaseRequired(TypedDict):
	    """Keys that every evaluation test case must provide."""

	    # The customer message sent to the model as the user turn.
	    input: str
	    # Labels copied unchanged into the result record for the case.
	    tags: list[str]


	class TestCase(TestCaseRequired, total=False):
	    """A test case: the required keys plus optional keyword expectations.

	    ``total=False`` makes the keys declared here optional.
	    """

	    # Substrings that must all appear in the response (case-insensitive).
	    expected_keywords: list[str]
	    # Substrings that must not appear in the response (case-insensitive).
	    expected_absent: list[str]


	# Shared API client. The key is looked up in the ANTHROPIC_API_KEY environment
	# variable; os.environ.get yields None when the variable is unset.
	client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

	# System prompt passed with every request made by get_response.
	SYSTEM_PROMPT = "You are a helpful customer support agent for an online store."


	# Evaluation inputs. Each case is sent to the model, run through the keyword
	# checks, and graded; "expected_keywords" / "expected_absent" are optional.
	test_cases: list[TestCase] = [
	    {
	        "input": "Where is my order #12345?",
	        "expected_keywords": ["order", "12345", "tracking"],
	        "tags": ["orders"],
	    },
	    {
	        "input": "I want a refund for my broken item",
	        "expected_keywords": ["refund", "return", "sorry"],
	        "tags": ["refunds"],
	    },
	    {
	        "input": "How long does shipping take to California?",
	        "expected_keywords": ["shipping", "days", "delivery"],
	        "tags": ["shipping"],
	    },
	    {
	        "input": "What's the capital of France?",
	        "expected_keywords": [],
	        "expected_absent": ["Paris"],  # Should stay on topic: the trivia answer must not appear
	        "tags": ["off-topic"],
	    },
	]


	def get_response(user_input: str) -> str:
	    """Send one customer message to the model and return the reply text.

	    The request uses the module-level ``client`` and ``SYSTEM_PROMPT`` and a
	    single user turn containing ``user_input``.

	    Args:
	        user_input: The customer's message.

	    Returns:
	        The ``text`` of the first content block in the API response.
	    """
	    response = client.messages.create(
	        model="claude-sonnet-4-5-20250929",
	        max_tokens=256,
	        system=SYSTEM_PROMPT,
	        messages=[{"role": "user", "content": user_input}],
	    )
	    return response.content[0].text


	def check_keywords(response: str, case: TestCase) -> tuple[bool, str]:
	    """Run the substring checks declared on a test case against a response.

	    Matching is case-insensitive. Required keywords are checked first, then
	    forbidden ones; only the first violation found is reported.

	    Args:
	        response: The model's reply text.
	        case: Test case whose optional "expected_keywords" and
	            "expected_absent" lists drive the checks.

	    Returns:
	        ``(True, "All checks passed")`` when nothing is violated, otherwise
	        ``(False, message)`` where the message names the offending keyword.
	    """
	    response_lower = response.lower()

	    # Every required keyword must occur somewhere in the response.
	    for kw in case.get("expected_keywords", []):
	        if kw.lower() not in response_lower:
	            return False, f"Missing keyword: {kw}"

	    # No forbidden keyword may occur anywhere in the response.
	    for kw in case.get("expected_absent", []):
	        if kw.lower() in response_lower:
	            return False, f"Should not contain: {kw}"

	    return True, "All checks passed"


	def grade_response(user_input: str, bot_response: str) -> dict:
	    """Ask the model to grade a support reply and return the parsed verdict.

	    Builds a rubric prompt (helpfulness, tone, accuracy) around the customer
	    message and the bot's reply, sends it through the module-level ``client``,
	    and decodes the JSON found in the answer.

	    Args:
	        user_input: The customer's original message.
	        bot_response: The reply being graded.

	    Returns:
	        The decoded JSON value. The prompt asks for an object with "score"
	        (1-5) and "reasoning" keys; the shape is not validated here.

	    Raises:
	        json.JSONDecodeError: If no parseable JSON is found in the answer.
	    """
	    grading_prompt = f"""Rate this customer support response on a 1-5 scale.

	Customer message: {user_input}
	Bot response: {bot_response}

	Criteria:
	- Helpfulness: Does it address the customer's actual question?
	- Tone: Is it professional and appropriately empathetic?
	- Accuracy: Is any policy/factual information correct?

	Return JSON with: {{"score": <1-5>, "reasoning": "<brief explanation>"}}"""

	    # NOTE(review): an answer cut off at max_tokens would leave the JSON
	    # unterminated and fail to parse -- consider a larger budget.
	    response = client.messages.create(
	        model="claude-sonnet-4-5-20250929",
	        max_tokens=150,
	        messages=[{"role": "user", "content": grading_prompt}],
	    )

	    text = response.content[0].text
	    # Prefer the contents of a markdown code fence when one is present.
	    # The quantifiers matter: \s* skips any whitespace after the fence and
	    # ([\s\S]*?) lazily captures the whole (possibly multi-line) body.
	    json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
	    if json_match:
	        text = json_match.group(1)
	    candidate = text.strip()

	    try:
	        return json.loads(candidate)
	    except json.JSONDecodeError:
	        # The answer may carry prose around the object without a fence.
	        # Retry on the outermost {...} span; if there is none, surface the
	        # original decoding error unchanged.
	        start, end = candidate.find("{"), candidate.rfind("}")
	        if start == -1 or end <= start:
	            raise
	        return json.loads(candidate[start : end + 1])


	# Evaluate every test case: fetch the model's reply, run the keyword checks,
	# obtain an LLM grade, and collect one record per case.
	results = []
	for case in test_cases:
	    response = get_response(case["input"])
	    keyword_passed, keyword_message = check_keywords(response, case)
	    grade = grade_response(case["input"], response)

	    results.append(
	        {
	            "input": case["input"],
	            "response": response,
	            "tags": case["tags"],
	            "keyword_check": {
	                "passed": keyword_passed,
	                "message": keyword_message,
	            },
	            "llm_grade": grade,
	        }
	    )

	# Emit the full report as pretty-printed JSON on stdout.
	print(json.dumps(results, indent=2))
	[
	{
	"input": "Where is my order #12345?",
	"response": "I'd be happy to help you track order #12345! However, I don't have access to your order information in our system right now.\n\nTo help you locate your order, I can:\n\n1. Check your email - Look for an order confirmation or shipping notification from us that includes tracking information\n2. Direct you to track it yourself - Visit our website's order tracking page and enter your order number and email address\n3. Escalate to our fulfillment team - I can connect you with someone who has direct access to our order system\n\nCould you also provide:\n- The email address associated with your order\n- Approximate order date\n\nThis will help me assist you more effectively!",
	"tags": [
	"orders"
	],
	"keyword_check": {
	"passed": true,
	"message": "All checks passed"
	},
	"llm_grade": {
	"score": 2,
	"reasoning": "While the tone is professional and friendly, the response fails to address the customer's direct question about their order location. The bot admits it cannot access order information, then provides workarounds that put the burden back on the customer (check your own email, track it yourself). For a simple 'where is my order' question, the customer expects a direct answer or at least an immediate escalation. The response is overly verbose and creates unnecessary friction by asking for information the system should already have (email address for an order number they possess). This would likely frustrate a customer seeking quick information."
	}
	},
	{
	"input": "I want a refund for my broken item",
	"response": "I'd be happy to help you with a refund for your broken item. To process this for you, I'll need a few details:\n\n1. Order number - This helps me locate your purchase\n2. What item was broken? - The product name or description\n3. When did you receive it? - To verify it's within our return window\n4. How was it damaged? - Was it broken upon arrival, or did it break during use?\n\nDo you also have any photos of the damage? While not always required, they can help speed up the refund process.\n\nOnce I have this information, I can guide you through the next steps to get your refund processed quickly.",
	"tags": [
	"refunds"
	],
	"keyword_check": {
	"passed": false,
	"message": "Missing keyword: sorry"
	},
	"llm_grade": {
	"score": 5,
	"reasoning": "This response excels in all criteria. It directly addresses the refund request with empathy ('I'd be happy to help'), asks all necessary questions in a clear, organized format, explains why each piece of information is needed (building trust), and proactively mentions photos as optional but helpful. The tone is professional yet warm, and it ends with reassurance about quick processing. Without access to actual company policy, no factual errors are apparent. This is a model support response."
	}
	},
	{
	"input": "How long does shipping take to California?",
	"response": "For shipping to California, delivery times typically depend on the shipping method you choose:\n\n- Standard Shipping: 5-7 business days\n- Expedited Shipping: 3-4 business days\n- Express Shipping: 1-2 business days\n\nPlease note that these are estimates from the date your order ships (not the order date). Processing time before shipment is usually 1-2 business days.\n\nIs there anything else you'd like to know about shipping or your order?",
	"tags": [
	"shipping"
	],
	"keyword_check": {
	"passed": true,
	"message": "All checks passed"
	},
	"llm_grade": {
	"score": 5,
	"reasoning": "This response excellently addresses the customer's question with comprehensive, well-organized information. It provides specific timeframes for multiple shipping options, includes an important clarification about processing time vs. shipping time, and maintains a professional yet friendly tone. The formatting with bullet points makes it easy to scan, and the follow-up question shows good customer service practice by offering additional help. Assuming the information is accurate to company policy, this is an exemplary support response."
	}
	},
	{
	"input": "What's the capital of France?",
	"response": "The capital of France is Paris.\n\nHowever, I noticed you've reached out to customer support for an online store. Is there anything I can help you with regarding your order, account, products, or any other store-related questions? I'm here to assist you! \ud83d\ude0a",
	"tags": [
	"off-topic"
	],
	"keyword_check": {
	"passed": false,
	"message": "Should not contain: Paris"
	},
	"llm_grade": {
	"score": 4,
	"reasoning": "The response is helpful (correctly answers the question), accurate (Paris is correct), and professional with a friendly tone. It appropriately redirects the customer to store-related inquiries. Loses one point because the customer may have been testing the bot or asking legitimately, and the redirect, while appropriate, could feel slightly dismissive. Overall, this is a well-balanced response that handles an off-topic question gracefully."
	}
	}
	]