Last active
December 14, 2025 22:04
-
-
Save pdulapalli/d932bdb0c60b6f77bd8750c46e49ca93 to your computer and use it in GitHub Desktop.
eval_example_part_1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| import os | |
| from typing import TypedDict | |
| import anthropic | |
class TestCaseRequired(TypedDict):
    """Keys that every evaluation test case must provide."""

    # The user message sent to the model for this case.
    input: str
    # Labels copied verbatim into the matching result entry.
    tags: list[str]
class TestCase(TestCaseRequired, total=False):
    """A test case: the required keys plus optional keyword expectations."""

    # Substrings that must all appear in the reply (matched case-insensitively).
    expected_keywords: list[str]
    # Substrings that must not appear in the reply (matched case-insensitively).
    expected_absent: list[str]
class Result(TypedDict):
    """Outcome of running one test case, as serialized in the final report."""

    # The user message that was sent.
    input: str
    # The reply text that was graded.
    response: str
    # True when every keyword check succeeded.
    passed: bool
    # Human-readable explanation of the verdict.
    reason: str
    # Labels carried over from the originating test case.
    tags: list[str]
# Shared API client; the key is read from the ANTHROPIC_API_KEY environment
# variable (None is passed when the variable is unset).
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# System prompt sent with every request made by get_response().
SYSTEM_PROMPT = (
    "You are a helpful customer support agent for an online store."
)
# Evaluation inputs. Each entry pairs a customer message with the substrings
# the reply must (or must not) contain.
test_cases: list[TestCase] = [
    # Order-status question.
    {
        "input": "Where is my order #12345?",
        "tags": ["orders"],
        "expected_keywords": ["order", "12345", "tracking"],
    },
    # Refund request.
    {
        "input": "I want a refund for my broken item",
        "tags": ["refunds"],
        "expected_keywords": ["refund", "return", "sorry"],
    },
    # Shipping-time question.
    {
        "input": "How long does shipping take to California?",
        "tags": ["shipping"],
        "expected_keywords": ["shipping", "days", "delivery"],
    },
    # Off-topic question: the agent should stay on topic, so the factual
    # answer must not appear in the reply.
    {
        "input": "What's the capital of France?",
        "tags": ["off-topic"],
        "expected_keywords": [],
        "expected_absent": ["Paris"],
    },
]
def get_response(user_input: str) -> str:
    """Send one user message to the model and return the reply text.

    Args:
        user_input: The customer message, sent as the only user turn
            alongside SYSTEM_PROMPT.

    Returns:
        The concatenated text of every text block in the reply, or an
        empty string when the reply contains no text block.
    """
    response = client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=256,
        system=SYSTEM_PROMPT,
        messages=[{"role": "user", "content": user_input}],
    )
    # The reply content is a list of blocks. Indexing [0].text would raise on
    # an empty list or a non-text first block, and would drop any later text
    # blocks, so gather the text blocks explicitly instead.
    return "".join(
        block.text for block in response.content if block.type == "text"
    )
def check_keywords(response: str, case: TestCase) -> tuple[bool, str]:
    """Grade a reply against a test case's keyword expectations.

    Matching is case-insensitive substring containment. Required keywords
    are checked first, in order, then forbidden ones; the first violation
    found decides the verdict.

    Args:
        response: The reply text to grade.
        case: The test case; "expected_keywords" and "expected_absent" are
            optional and treated as empty when missing.

    Returns:
        A (passed, reason) pair, where reason names the first offending
        keyword or states that all checks passed.
    """
    haystack = response.lower()

    # First required keyword that the reply lacks, if any.
    missing = next(
        (
            word
            for word in case.get("expected_keywords", [])
            if word.lower() not in haystack
        ),
        None,
    )
    if missing is not None:
        return False, f"Missing keyword: {missing}"

    # First forbidden keyword that the reply contains, if any.
    forbidden = next(
        (
            word
            for word in case.get("expected_absent", [])
            if word.lower() in haystack
        ),
        None,
    )
    if forbidden is not None:
        return False, f"Should not contain: {forbidden}"

    return True, "All checks passed"
# Run every test case through the model, grade the reply, and collect one
# Result per case in the same order as test_cases.
results: list[Result] = []
for case in test_cases:
    reply = get_response(case["input"])
    ok, why = check_keywords(reply, case)
    results.append(
        Result(
            input=case["input"],
            response=reply,
            passed=ok,
            reason=why,
            tags=case["tags"],
        )
    )

# Emit the full report as pretty-printed JSON on stdout.
print(json.dumps(results, indent=2))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [ | |
| { | |
| "input": "Where is my order #12345?", | |
| "response": "I'd be happy to help you track order #12345! However, I don't have access to your order information in our system right now.\n\nTo check your order status, you can:\n\n1. **Check your email** - Look for an order confirmation with tracking details\n2. **Log into your account** on our website and go to \"Order History\"\n3. **Contact our support team directly** with your order number - they'll have access to real-time tracking information\n\nIf you'd like, I can help you with:\n- General shipping timeframes\n- What to do if your order is delayed\n- How to update delivery preferences\n\nIs there anything specific about your order you're concerned about, or would you like help with something else?", | |
| "passed": true, | |
| "reason": "All checks passed", | |
| "tags": [ | |
| "orders" | |
| ] | |
| }, | |
| { | |
| "input": "I want a refund for my broken item", | |
| "response": "I'd be happy to help you with a refund for your broken item. To process this for you, I'll need a few details:\n\n1. **Order number** - This helps me locate your purchase\n2. **What item was broken?** - Please describe which product\n3. **When did you receive it?** - Approximate date is fine\n4. **What kind of damage occurred?** - This helps us improve our packaging and shipping\n\nOnce I have this information, I can start the refund process for you right away. Most refunds are processed within 5-7 business days.\n\nIs the item still in your possession, or would you like to include a return as part of this refund?", | |
| "passed": false, | |
| "reason": "Missing keyword: sorry", | |
| "tags": [ | |
| "refunds" | |
| ] | |
| }, | |
| { | |
| "input": "How long does shipping take to California?", | |
| "response": "Standard shipping to California typically takes **3-7 business days** from the time your order is processed.\n\nWe also offer expedited shipping options:\n- **2-Day Shipping**: Delivered within 2 business days\n- **Overnight Shipping**: Next business day delivery\n\n**Please note:**\n- Processing time (1-2 business days) is in addition to shipping time\n- Delivery times may vary based on your specific location in California\n- Orders placed on weekends/holidays are processed the next business day\n\nYou can see exact shipping options and costs for your specific address at checkout. Is there anything else you'd like to know about shipping?", | |
| "passed": true, | |
| "reason": "All checks passed", | |
| "tags": [ | |
| "shipping" | |
| ] | |
| }, | |
| { | |
| "input": "What's the capital of France?", | |
| "response": "The capital of France is **Paris**.\n\nHowever, I notice you've reached out to customer support for an online store. Is there anything I can help you with regarding your order, products, account, shipping, or any other store-related questions? I'm here to assist! \ud83d\ude0a", | |
| "passed": false, | |
| "reason": "Should not contain: Paris", | |
| "tags": [ | |
| "off-topic" | |
| ] | |
| } | |
| ] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment