@caleb-kaiser
Created October 24, 2024 00:39
4-custom-metric.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/caleb-kaiser/1df38e5268f80f4fc46316a0cbba2f39/4-custom-metric.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"<img src=\"https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-logo.svg\" width=\"250\"/>"
],
"metadata": {
"id": "O9mgL3KF9m2k"
}
},
{
"cell_type": "markdown",
"metadata": {
"id": "mDqAkheK9j4j"
},
"source": [
"# Defining a Custom Metric in Opik.\n",
"\n",
"In this lesson, we will define a custom metric called Factuality. You can use OpenAI or open source models via LiteLLM."
]
},
{
"cell_type": "markdown",
"source": [
"# Imports & Configuration"
],
"metadata": {
"id": "5psT3jEC9e83"
}
},
{
"cell_type": "code",
"source": [
"! pip install comet-ml opik openai litellm --quiet"
],
"metadata": {
"id": "ZBF2E0EIyikz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0fvHKpAd9j4o"
},
"outputs": [],
"source": [
"from opik import Opik, track, DatasetItem\n",
"from opik.evaluation import evaluate\n",
"from opik.integrations.openai import track_openai\n",
"from opik.evaluation.metrics import base_metric, score_result\n",
"import openai\n",
"import os\n",
"from datetime import datetime\n",
"from getpass import getpass\n",
"import litellm\n",
"\n",
"\n",
"# Define project name to enable tracing\n",
"os.environ[\"OPIK_PROJECT_NAME\"] = \"food_chatbot_eval\"\n",
"\n"
]
},
{
"cell_type": "code",
"source": [
"# opik configs\n",
"if \"OPIK_API_KEY\" not in os.environ:\n",
" os.environ[\"OPIK_API_KEY\"] = getpass(\"Enter your Opik API key: \")\n",
"\n"
],
"metadata": {
"id": "JrFiw0joyD04"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# openai configs\n",
"if \"OPENAI_API_KEY\" not in os.environ:\n",
" os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI API key: \")\n"
],
"metadata": {
"id": "a00iWl7XyK6u"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import opik\n",
"\n",
"opik.configure(use_local=False)"
],
"metadata": {
"id": "L7hGe7avyMWr"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Templates & Context"
],
"metadata": {
"id": "_-5O4uA5wpnU"
}
},
{
"cell_type": "code",
"source": [
"# menu items\n",
"menu_items = \"\"\"\n",
"Menu: Kids Menu\n",
"Food Item: Mini Cheeseburger\n",
"Price: $6.99\n",
"Vegan: N\n",
"Popularity: 4/5\n",
"Included: Mini beef patty, cheese, lettuce, tomato, and fries.\n",
"\n",
"Menu: Appetizers\n",
"Food Item: Loaded Potato Skins\n",
"Price: $8.99\n",
"Vegan: N\n",
"Popularity: 3/5\n",
"Included: Crispy potato skins filled with cheese, bacon bits, and served with sour cream.\n",
"\n",
"Menu: Appetizers\n",
"Food Item: Bruschetta\n",
"Price: $7.99\n",
"Vegan: Y\n",
"Popularity: 4/5\n",
"Included: Toasted baguette slices topped with fresh tomatoes, basil, garlic, and balsamic glaze.\n",
"\n",
"Menu: Main Menu\n",
"Food Item: Grilled Chicken Caesar Salad\n",
"Price: $12.99\n",
"Vegan: N\n",
"Popularity: 4/5\n",
"Included: Grilled chicken breast, romaine lettuce, Parmesan cheese, croutons, and Caesar dressing.\n",
"\n",
"Menu: Main Menu\n",
"Food Item: Classic Cheese Pizza\n",
"Price: $10.99\n",
"Vegan: N\n",
"Popularity: 5/5\n",
"Included: Thin-crust pizza topped with tomato sauce, mozzarella cheese, and fresh basil.\n",
"\n",
"Menu: Main Menu\n",
"Food Item: Spaghetti Bolognese\n",
"Price: $14.99\n",
"Vegan: N\n",
"Popularity: 4/5\n",
"Included: Pasta tossed in a savory meat sauce made with ground beef, tomatoes, onions, and herbs.\n",
"\n",
"Menu: Vegan Options\n",
"Food Item: Veggie Wrap\n",
"Price: $9.99\n",
"Vegan: Y\n",
"Popularity: 3/5\n",
"Included: Grilled vegetables, hummus, mixed greens, and a wrap served with a side of sweet potato fries.\n",
"\n",
"Menu: Vegan Options\n",
"Food Item: Vegan Beyond Burger\n",
"Price: $11.99\n",
"Vegan: Y\n",
"Popularity: 4/5\n",
"Included: Plant-based patty, vegan cheese, lettuce, tomato, onion, and a choice of regular or sweet potato fries.\n",
"\n",
"Menu: Desserts\n",
"Food Item: Chocolate Lava Cake\n",
"Price: $6.99\n",
"Vegan: N\n",
"Popularity: 5/5\n",
"Included: Warm chocolate cake with a gooey molten center, served with vanilla ice cream.\n",
"\n",
"Menu: Desserts\n",
"Food Item: Fresh Berry Parfait\n",
"Price: $5.99\n",
"Vegan: Y\n",
"Popularity: 4/5\n",
"Included: Layers of mixed berries, granola, and vegan coconut yogurt.\n",
"\"\"\"\n"
],
"metadata": {
"id": "P8vs5LU8wrvl"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# prompt template for the Factuality metric\n",
"prompt_template = \"\"\"\n",
"###INSTRUCTIONS###\n",
"\n",
"You are a helpful assistant who should evaluate if a food chatbot's response is factual given user requests and a menu (delimited by +++++). Output 1 if the chatbot response is factually answering the user message and 0 if it doesn't.\n",
"\n",
"+++++\n",
"{menu_items}\n",
"+++++\n",
"\n",
"###EXAMPLE OUTPUT FORMAT###\n",
"{{\n",
" \"value\": 0,\n",
" \"reason\": \"The response is not factually answering the user question.\"\n",
"}}\n",
"\n",
"###INPUTS:###\n",
"{user_message}\n",
"\n",
"###RESPONSE:###\n",
"{chatbot_response}\n",
"\"\"\"\n"
],
"metadata": {
"id": "33y7keDlwtjo"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"question_template = \"\"\"Answer a question about the following menu:\n",
"\n",
"# MENU\n",
"{menu}\n",
"\n",
"# QUESTION\n",
"{question}\n",
"\"\"\""
],
"metadata": {
"id": "mI1sS8Awwzu2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Dataset"
],
"metadata": {
"id": "yWxSUH-X1q7x"
}
},
{
"cell_type": "code",
"source": [
"# Create or get the dataset\n",
"dataset = client.get_or_create_dataset(name=\"foodchatbot_eval\")"
],
"metadata": {
"id": "NL8CIFRl1qac"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Optional: Download Dataset From Comet"
],
"metadata": {
"id": "XdPwkTV01zgu"
}
},
{
"cell_type": "markdown",
"source": [
"If you have not previously created the `foodchatbot_eval` dataset in your Opik workspace, run the following code to download the dataset as a Comet Artifact and populate your Opik dataset.\n",
"\n",
"If you have already created the `foodchatbot_eval` dataset, you can skip to the next section"
],
"metadata": {
"id": "hWV6j7MM2X38"
}
},
{
"cell_type": "code",
"source": [
"import comet_ml"
],
"metadata": {
"id": "1L8re4Wu1yDv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"experiment = comet_ml.start(project_name=\"foodchatbot_eval\")\n",
"\n",
"logged_artifact = experiment.get_artifact(artifact_name=\"foodchatbot_eval\",\n",
" workspace=\"examples\")\n",
"local_artifact = logged_artifact.download(\"./\")\n",
"experiment.end()"
],
"metadata": {
"id": "5_naIp7_12on"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import csv\n",
"import json\n",
"# Read the CSV file and insert items into the dataset\n",
"with open('./foodchatbot_clean_eval_dataset.csv', newline='') as csvfile:\n",
" reader = csv.reader(csvfile)\n",
" for row in reader:\n",
" index, question, response = row\n",
" item = {\n",
" \"index\": index,\n",
" \"question\": question,\n",
" \"response\": response\n",
" }\n",
"\n",
" dataset.insert([item])"
],
"metadata": {
"id": "NtuhLA0X12mN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Build Your Application"
],
"metadata": {
"id": "3nmTICPOw2XV"
}
},
{
"cell_type": "code",
"source": [
"# Simple little client class for using different LLM APIs (OpenAI or LiteLLM)\n",
"class LLMClient:\n",
" def __init__(self, client_type: str =\"openai\", model: str =\"gpt-4o-mini\"):\n",
" self.client_type = client_type\n",
" self.model = model\n",
"\n",
" if self.client_type == \"openai\":\n",
" self.client = track_openai(openai.OpenAI())\n",
"\n",
" else:\n",
" self.client = None\n",
"\n",
" # LiteLLM query function\n",
" def _get_litellm_response(self, query: str, system: str = \"You are a helpful assistant.\"):\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system },\n",
" { \"role\": \"user\", \"content\": query }\n",
" ]\n",
"\n",
" response = litellm.completion(\n",
" model=self.model,\n",
" messages=messages\n",
" )\n",
"\n",
" return response.choices[0].message.content\n",
"\n",
" # OpenAI query function - use **kwargs to pass arguments like temperature\n",
" def _get_openai_response(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system },\n",
" { \"role\": \"user\", \"content\": query }\n",
" ]\n",
"\n",
" response = self.client.chat.completions.create(\n",
" model=self.model,\n",
" messages=messages,\n",
" **kwargs\n",
" )\n",
"\n",
" return response.choices[0].message.content\n",
"\n",
"\n",
" def query(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n",
" if self.client_type == 'openai':\n",
" return self._get_openai_response(query, system, **kwargs)\n",
"\n",
" else:\n",
" return self._get_litellm_response(query, system)\n",
"\n",
"\n",
"\n"
],
"metadata": {
"id": "bZ1hCID4vaAE"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Initialize your client!\n",
"\n",
"llm_client = LLMClient()"
],
"metadata": {
"id": "aiLj5YNUyTXZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Evaluation"
],
"metadata": {
"id": "9a9UlyKZ9ZNl"
}
},
{
"cell_type": "code",
"source": [
"# Define the Factuality Metric\n",
"class Factuality(base_metric.BaseMetric):\n",
" def __init__(self, name: str):\n",
" self.name = name\n",
"\n",
" def score(self, input: str, output: str, context: str, reference: str):\n",
" response = llm_client.query(prompt_template.format(menu_items=context, user_message=input, chatbot_response=output))\n",
"\n",
" response = eval(response)\n",
"\n",
" return score_result.ScoreResult(\n",
" value=response[\"value\"],\n",
" name=self.name,\n",
" reason=response[\"reason\"]\n",
" )\n"
],
"metadata": {
"id": "vwmKvx8_vT2M"
},
"execution_count": null,
"outputs": []
},
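{
"cell_type": "markdown",
"source": [
"As a quick smoke test (an illustrative sketch, not part of the original lesson), you can score the metric on a single hand-written example before running the full evaluation. The sample question, response, and reference below are hypothetical."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Score one hypothetical example against the menu context\n",
"factuality_check = Factuality(\"Factuality\")\n",
"result = factuality_check.score(\n",
"    input=\"How much is the Bruschetta?\",\n",
"    output=\"The Bruschetta costs $7.99.\",\n",
"    context=menu_items,\n",
"    reference=\"The Bruschetta is $7.99.\"\n",
")\n",
"print(result.value, result.reason)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},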
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8MJp8lQP9j4r"
},
"outputs": [],
"source": [
"@track\n",
"def chatbot_application(input: str) -> str:\n",
" response = llm_client.query(question_template.format(menu=menu_items, question=input))\n",
" return response\n"
]
},
{
"cell_type": "code",
"source": [
"# Define the evaluation task\n",
"def evaluation_task(x: DatasetItem):\n",
" return {\n",
" \"input\": x['question'],\n",
" \"output\": chatbot_application(x['question']),\n",
" \"context\": menu_items,\n",
" \"reference\": x['response']\n",
" }\n"
],
"metadata": {
"id": "Gvj3h1CixJ2K"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"client = Opik()"
],
"metadata": {
"id": "EtYYa2La2mzx"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Define the metrics\n",
"metrics = [Factuality(\"Factuality\")]"
],
"metadata": {
"id": "WU5yRdaQxKo1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Run evaluation\n",
"experiment_name = \"gpt-4o-mini\" + \"_\" + dataset.name + \"_\" + datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
"\n",
"evaluation = evaluate(\n",
" experiment_name=experiment_name,\n",
" dataset=dataset,\n",
" task=evaluation_task,\n",
" scoring_metrics=metrics,\n",
" experiment_config={\n",
" \"model\": \"gpt-4o-mini\"\n",
" }\n",
")"
],
"metadata": {
"id": "ebvqkbRDxKly"
},
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"display_name": "comet-eval",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
},
"colab": {
"provenance": [],
"collapsed_sections": [
"5psT3jEC9e83",
"_-5O4uA5wpnU",
"yWxSUH-X1q7x",
"3nmTICPOw2XV",
"9a9UlyKZ9ZNl"
],
"include_colab_link": true
}
},
"nbformat": 4,
"nbformat_minor": 0
}