Created
October 24, 2024 00:39
-
-
Save caleb-kaiser/1df38e5268f80f4fc46316a0cbba2f39 to your computer and use it in GitHub Desktop.
4-custom-metric.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/caleb-kaiser/1df38e5268f80f4fc46316a0cbba2f39/4-custom-metric.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "<img src=\"https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-logo.svg\" width=\"250\"/>" | |
| ], | |
| "metadata": { | |
| "id": "O9mgL3KF9m2k" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "mDqAkheK9j4j" | |
| }, | |
| "source": [ | |
| "# Defining a Custom Metric in Opik.\n", | |
| "\n", | |
| "In this lesson, we will define a custom metric called Factuality. You can use OpenAI or open-source models via LiteLLM." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Imports & Configuration" | |
| ], | |
| "metadata": { | |
| "id": "5psT3jEC9e83" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "! pip install comet-ml opik openai litellm --quiet" | |
| ], | |
| "metadata": { | |
| "id": "ZBF2E0EIyikz" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "0fvHKpAd9j4o" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from opik import Opik, track, DatasetItem\n", | |
| "from opik.evaluation import evaluate\n", | |
| "from opik.integrations.openai import track_openai\n", | |
| "from opik.evaluation.metrics import base_metric, score_result\n", | |
| "import openai\n", | |
| "import os\n", | |
| "from datetime import datetime\n", | |
| "from getpass import getpass\n", | |
| "import litellm\n", | |
| "\n", | |
| "\n", | |
| "# Define project name to enable tracing\n", | |
| "os.environ[\"OPIK_PROJECT_NAME\"] = \"food_chatbot_eval\"\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# opik configs\n", | |
| "if \"OPIK_API_KEY\" not in os.environ:\n", | |
| " os.environ[\"OPIK_API_KEY\"] = getpass(\"Enter your Opik API key: \")\n", | |
| "\n" | |
| ], | |
| "metadata": { | |
| "id": "JrFiw0joyD04" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# openai configs\n", | |
| "if \"OPENAI_API_KEY\" not in os.environ:\n", | |
| " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI API key: \")\n" | |
| ], | |
| "metadata": { | |
| "id": "a00iWl7XyK6u" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import opik\n", | |
| "\n", | |
| "# Configure the Opik SDK with use_local=False (presumably the hosted platform rather than a local deployment - confirm)\n", | |
| "opik.configure(use_local=False)" | |
| ], | |
| "metadata": { | |
| "id": "L7hGe7avyMWr" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Templates & Context" | |
| ], | |
| "metadata": { | |
| "id": "_-5O4uA5wpnU" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# menu items\n", | |
| "menu_items = \"\"\"\n", | |
| "Menu: Kids Menu\n", | |
| "Food Item: Mini Cheeseburger\n", | |
| "Price: $6.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 4/5\n", | |
| "Included: Mini beef patty, cheese, lettuce, tomato, and fries.\n", | |
| "\n", | |
| "Menu: Appetizers\n", | |
| "Food Item: Loaded Potato Skins\n", | |
| "Price: $8.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 3/5\n", | |
| "Included: Crispy potato skins filled with cheese, bacon bits, and served with sour cream.\n", | |
| "\n", | |
| "Menu: Appetizers\n", | |
| "Food Item: Bruschetta\n", | |
| "Price: $7.99\n", | |
| "Vegan: Y\n", | |
| "Popularity: 4/5\n", | |
| "Included: Toasted baguette slices topped with fresh tomatoes, basil, garlic, and balsamic glaze.\n", | |
| "\n", | |
| "Menu: Main Menu\n", | |
| "Food Item: Grilled Chicken Caesar Salad\n", | |
| "Price: $12.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 4/5\n", | |
| "Included: Grilled chicken breast, romaine lettuce, Parmesan cheese, croutons, and Caesar dressing.\n", | |
| "\n", | |
| "Menu: Main Menu\n", | |
| "Food Item: Classic Cheese Pizza\n", | |
| "Price: $10.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 5/5\n", | |
| "Included: Thin-crust pizza topped with tomato sauce, mozzarella cheese, and fresh basil.\n", | |
| "\n", | |
| "Menu: Main Menu\n", | |
| "Food Item: Spaghetti Bolognese\n", | |
| "Price: $14.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 4/5\n", | |
| "Included: Pasta tossed in a savory meat sauce made with ground beef, tomatoes, onions, and herbs.\n", | |
| "\n", | |
| "Menu: Vegan Options\n", | |
| "Food Item: Veggie Wrap\n", | |
| "Price: $9.99\n", | |
| "Vegan: Y\n", | |
| "Popularity: 3/5\n", | |
| "Included: Grilled vegetables, hummus, mixed greens, and a wrap served with a side of sweet potato fries.\n", | |
| "\n", | |
| "Menu: Vegan Options\n", | |
| "Food Item: Vegan Beyond Burger\n", | |
| "Price: $11.99\n", | |
| "Vegan: Y\n", | |
| "Popularity: 4/5\n", | |
| "Included: Plant-based patty, vegan cheese, lettuce, tomato, onion, and a choice of regular or sweet potato fries.\n", | |
| "\n", | |
| "Menu: Desserts\n", | |
| "Food Item: Chocolate Lava Cake\n", | |
| "Price: $6.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 5/5\n", | |
| "Included: Warm chocolate cake with a gooey molten center, served with vanilla ice cream.\n", | |
| "\n", | |
| "Menu: Desserts\n", | |
| "Food Item: Fresh Berry Parfait\n", | |
| "Price: $5.99\n", | |
| "Vegan: Y\n", | |
| "Popularity: 4/5\n", | |
| "Included: Layers of mixed berries, granola, and vegan coconut yogurt.\n", | |
| "\"\"\"\n" | |
| ], | |
| "metadata": { | |
| "id": "P8vs5LU8wrvl" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# prompt template for the Factuality metric\n", | |
| "prompt_template = \"\"\"\n", | |
| "###INSTRUCTIONS###\n", | |
| "\n", | |
| "You are a helpful assistant who should evaluate if a food chatbot's response is factual given user requests and a menu (delimited by +++++). Output 1 if the chatbot response is factually answering the user message and 0 if it doesn't.\n", | |
| "\n", | |
| "+++++\n", | |
| "{menu_items}\n", | |
| "+++++\n", | |
| "\n", | |
| "###EXAMPLE OUTPUT FORMAT###\n", | |
| "{{\n", | |
| " \"value\": 0,\n", | |
| " \"reason\": \"The response is not factually answering the user question.\"\n", | |
| "}}\n", | |
| "\n", | |
| "###INPUTS:###\n", | |
| "{user_message}\n", | |
| "\n", | |
| "###RESPONSE:###\n", | |
| "{chatbot_response}\n", | |
| "\"\"\"\n" | |
| ], | |
| "metadata": { | |
| "id": "33y7keDlwtjo" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "question_template = \"\"\"Answer a question about the following menu:\n", | |
| "\n", | |
| "# MENU\n", | |
| "{menu}\n", | |
| "\n", | |
| "# QUESTION\n", | |
| "{question}\n", | |
| "\"\"\"" | |
| ], | |
| "metadata": { | |
| "id": "mI1sS8Awwzu2" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Dataset" | |
| ], | |
| "metadata": { | |
| "id": "yWxSUH-X1q7x" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Create or get the dataset\n", | |
| "dataset = client.get_or_create_dataset(name=\"foodchatbot_eval\")" | |
| ], | |
| "metadata": { | |
| "id": "NL8CIFRl1qac" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Optional: Download Dataset From Comet" | |
| ], | |
| "metadata": { | |
| "id": "XdPwkTV01zgu" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "If you have not previously created the `foodchatbot_eval` dataset in your Opik workspace, run the following code to download the dataset as a Comet Artifact and populate your Opik dataset.\n", | |
| "\n", | |
| "If you have already created the `foodchatbot_eval` dataset, you can skip to the next section." | |
| ], | |
| "metadata": { | |
| "id": "hWV6j7MM2X38" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import comet_ml" | |
| ], | |
| "metadata": { | |
| "id": "1L8re4Wu1yDv" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "experiment = comet_ml.start(project_name=\"foodchatbot_eval\")\n", | |
| "\n", | |
| "logged_artifact = experiment.get_artifact(artifact_name=\"foodchatbot_eval\",\n", | |
| " workspace=\"examples\")\n", | |
| "local_artifact = logged_artifact.download(\"./\")\n", | |
| "experiment.end()" | |
| ], | |
| "metadata": { | |
| "id": "5_naIp7_12on" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import csv\n", | |
| "import json\n", | |
| "# Read the CSV file and insert items into the dataset\n", | |
| "with open('./foodchatbot_clean_eval_dataset.csv', newline='') as csvfile:\n", | |
| " reader = csv.reader(csvfile)\n", | |
| " for row in reader:\n", | |
| " index, question, response = row\n", | |
| " item = {\n", | |
| " \"index\": index,\n", | |
| " \"question\": question,\n", | |
| " \"response\": response\n", | |
| " }\n", | |
| "\n", | |
| " dataset.insert([item])" | |
| ], | |
| "metadata": { | |
| "id": "NtuhLA0X12mN" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Build Your Application" | |
| ], | |
| "metadata": { | |
| "id": "3nmTICPOw2XV" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Simple little client class for using different LLM APIs (OpenAI or LiteLLM)\n", | |
| "class LLMClient:\n", | |
| " def __init__(self, client_type: str =\"openai\", model: str =\"gpt-4o-mini\"):\n", | |
| " self.client_type = client_type\n", | |
| " self.model = model\n", | |
| "\n", | |
| " if self.client_type == \"openai\":\n", | |
| " self.client = track_openai(openai.OpenAI())\n", | |
| "\n", | |
| " else:\n", | |
| " self.client = None\n", | |
| "\n", | |
| " # LiteLLM query function\n", | |
| " def _get_litellm_response(self, query: str, system: str = \"You are a helpful assistant.\"):\n", | |
| " messages = [\n", | |
| " {\"role\": \"system\", \"content\": system },\n", | |
| " { \"role\": \"user\", \"content\": query }\n", | |
| " ]\n", | |
| "\n", | |
| " response = litellm.completion(\n", | |
| " model=self.model,\n", | |
| " messages=messages\n", | |
| " )\n", | |
| "\n", | |
| " return response.choices[0].message.content\n", | |
| "\n", | |
| " # OpenAI query function - use **kwargs to pass arguments like temperature\n", | |
| " def _get_openai_response(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n", | |
| " messages = [\n", | |
| " {\"role\": \"system\", \"content\": system },\n", | |
| " { \"role\": \"user\", \"content\": query }\n", | |
| " ]\n", | |
| "\n", | |
| " response = self.client.chat.completions.create(\n", | |
| " model=self.model,\n", | |
| " messages=messages,\n", | |
| " **kwargs\n", | |
| " )\n", | |
| "\n", | |
| " return response.choices[0].message.content\n", | |
| "\n", | |
| "\n", | |
| " def query(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n", | |
| " if self.client_type == 'openai':\n", | |
| " return self._get_openai_response(query, system, **kwargs)\n", | |
| "\n", | |
| " else:\n", | |
| " return self._get_litellm_response(query, system)\n", | |
| "\n", | |
| "\n", | |
| "\n" | |
| ], | |
| "metadata": { | |
| "id": "bZ1hCID4vaAE" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Initialize your client! (LLMClient defaults: client_type=\"openai\", model=\"gpt-4o-mini\")\n", | |
| "\n", | |
| "llm_client = LLMClient()" | |
| ], | |
| "metadata": { | |
| "id": "aiLj5YNUyTXZ" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Evaluation" | |
| ], | |
| "metadata": { | |
| "id": "9a9UlyKZ9ZNl" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Define the Factuality Metric\n", | |
| "class Factuality(base_metric.BaseMetric):\n", | |
| " def __init__(self, name: str):\n", | |
| " self.name = name\n", | |
| "\n", | |
| " def score(self, input: str, output: str, context: str, reference: str):\n", | |
| " response = llm_client.query(prompt_template.format(menu_items=context, user_message=input, chatbot_response=output))\n", | |
| "\n", | |
| " response = eval(response)\n", | |
| "\n", | |
| " return score_result.ScoreResult(\n", | |
| " value=response[\"value\"],\n", | |
| " name=self.name,\n", | |
| " reason=response[\"reason\"]\n", | |
| " )\n" | |
| ], | |
| "metadata": { | |
| "id": "vwmKvx8_vT2M" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "dI-rxB3iwSlR" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "8MJp8lQP9j4r" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "@track\n", | |
| "def chatbot_application(input: str) -> str:\n", | |
| " response = llm_client.query(question_template.format(menu=menu_items, question=input))\n", | |
| " return response\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Define the evaluation task\n", | |
| "def evaluation_task(x: DatasetItem):\n", | |
| " return {\n", | |
| " \"input\": x['question'],\n", | |
| " \"output\": chatbot_application(x['question']),\n", | |
| " \"context\": menu_items,\n", | |
| " \"reference\": x['response']\n", | |
| " }\n" | |
| ], | |
| "metadata": { | |
| "id": "Gvj3h1CixJ2K" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Opik client handle; the Dataset section calls get_or_create_dataset on an object of this kind\n", | |
| "client = Opik()" | |
| ], | |
| "metadata": { | |
| "id": "EtYYa2La2mzx" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Define the metrics: a single Factuality judge; the argument is the name attached to each score result\n", | |
| "metrics = [Factuality(\"Factuality\")]" | |
| ], | |
| "metadata": { | |
| "id": "WU5yRdaQxKo1" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Run evaluation\n", | |
| "experiment_name = \"gpt-4o-mini\" + \"_\" + dataset.name + \"_\" + datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")\n", | |
| "\n", | |
| "evaluation = evaluate(\n", | |
| " experiment_name=experiment_name,\n", | |
| " dataset=dataset,\n", | |
| " task=evaluation_task,\n", | |
| " scoring_metrics=metrics,\n", | |
| " experiment_config={\n", | |
| " \"model\": \"gpt-4o-mini\"\n", | |
| " }\n", | |
| ")" | |
| ], | |
| "metadata": { | |
| "id": "ebvqkbRDxKly" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "FdUvk7lvxKiK" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "comet-eval", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.15" | |
| }, | |
| "colab": { | |
| "provenance": [], | |
| "collapsed_sections": [ | |
| "5psT3jEC9e83", | |
| "_-5O4uA5wpnU", | |
| "yWxSUH-X1q7x", | |
| "3nmTICPOw2XV", | |
| "9a9UlyKZ9ZNl" | |
| ], | |
| "include_colab_link": true | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment