3-2-evaluation-llm-based.ipynb
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/caleb-kaiser/63a115156d140aec9c29fc655a4da38d/3-2-evaluation-llm-based.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"<img src=\"https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-logo.svg\" width=\"250\"/>"
],
"metadata": {
"id": "VyT73jSw8nQJ"
}
},
{
"cell_type": "markdown",
"metadata": {
"id": "4tnPHsVuzY2O"
},
"source": [
"# LLM-Based Evaluation with Opik"
]
},
{
"cell_type": "markdown",
"source": [
| "In this exercise, you'll be evaluationg LLM applications with LLM-as-a-judge metrics. You can use OpenAI or open source models via LiteLLM. To make the exercise a little more exciting, you'll be running your evaluations using HaluBench, the popular hallucination dataset." | |
],
"metadata": {
"id": "oCCyTFCia3A2"
}
},
{
"cell_type": "markdown",
"source": [
"# Imports & Configuration"
],
"metadata": {
"id": "rkMMwPik0obY"
}
},
{
"cell_type": "code",
"source": [
"%pip install opik openai comet_ml litellm --quiet"
],
"metadata": {
"id": "jqGjzWRc0k0n"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MfaBKoVnzY2R"
},
"outputs": [],
"source": [
"import opik\n",
"from opik import Opik, track\n",
"from opik.evaluation import evaluate\n",
"from opik.evaluation.metrics import Hallucination, AnswerRelevance\n",
"from opik.integrations.openai import track_openai\n",
"import openai\n",
"import os\n",
"from datetime import datetime\n",
"from getpass import getpass\n",
"import litellm\n",
"\n",
"# Define project name to enable tracing\n",
"os.environ[\"OPIK_PROJECT_NAME\"] = \"llm-based-eval\"\n"
]
},
{
"cell_type": "code",
"source": [
"# opik configs\n",
"if \"OPIK_API_KEY\" not in os.environ:\n",
"    os.environ[\"OPIK_API_KEY\"] = getpass(\"Enter your Opik API key: \")\n",
"\n",
"opik.configure()"
],
"metadata": {
"id": "BNDJe4iZ1Ogd"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# OpenAI configuration (ignore if you're using LiteLLM)\n",
"if \"OPENAI_API_KEY\" not in os.environ:\n",
"    os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI API key: \")"
],
"metadata": {
"id": "cffXwutl1PBe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"MODEL = \"gpt-4o-mini\""
],
"metadata": {
"id": "8g_BBxJz0zzj"
},
"execution_count": null,
"outputs": []
},
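{
"cell_type": "code",
"source": [
"# Optional: to use an open source model via LiteLLM instead of OpenAI, point\n",
"# MODEL at a LiteLLM provider/model string and pass client_type=\"litellm\"\n",
"# when constructing LLMClient below. The exact string depends on your\n",
"# provider; the one here is only an illustrative example.\n",
"# MODEL = \"ollama/llama3\""
],
"metadata": {},
"execution_count": null,
"outputs": []
},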
{
"cell_type": "code",
"source": [
"client = Opik()"
],
"metadata": {
"id": "BiM11ddP1JQj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Prompts & Templates"
],
"metadata": {
"id": "hrSbwEqA14y2"
}
},
{
"cell_type": "code",
"source": [
"prompt_template = \"\"\"Use the following context to answer my question:\n",
"\n",
"### CONTEXT:\n",
"{context}\n",
"\n",
"### QUESTION:\n",
"{question}\n",
"\"\"\""
],
"metadata": {
"id": "Ch0EkZW317Qk"
},
"execution_count": null,
"outputs": []
},
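{
"cell_type": "code",
"source": [
"# Quick sanity check with toy, illustrative inputs: render the template to\n",
"# see exactly what the model will receive.\n",
"print(prompt_template.format(\n",
"    context=\"Paris is the capital of France.\",\n",
"    question=\"What is the capital of France?\"\n",
"))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},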
{
"cell_type": "markdown",
"source": [
"# Dataset"
],
"metadata": {
"id": "kG5cyliF1G6y"
}
},
{
"cell_type": "code",
"source": [
"# Create dataset\n",
"dataset = client.get_or_create_dataset(\n",
"    name=\"HaluBench\", description=\"HaluBench dataset\"\n",
")"
],
"metadata": {
"id": "URvNSIYq20Vq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_parquet(\n",
"    \"hf://datasets/PatronusAI/HaluBench/data/test-00000-of-00001.parquet\"\n",
")"
],
"metadata": {
"id": "SiatWxGI3NCy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df.head()"
],
"metadata": {
"id": "BK_DY_5_3T0N"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Keep the first 100 rows and drop the columns the evaluation doesn't use,\n",
"# leaving just 'passage' and 'question'\n",
"cleaned_ds = df.drop(['answer', 'label', 'source_ds', 'id'], axis=1).iloc[0:100]"
],
"metadata": {
"id": "cGs9V96R5B4f"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"dataset.insert(cleaned_ds.to_dict('records'))"
],
"metadata": {
"id": "5qbQottd8d2-"
},
"execution_count": null,
"outputs": []
},
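{
"cell_type": "code",
"source": [
"# Sanity check (plain pandas, no Opik calls): each record should contain only\n",
"# the fields the evaluation task expects ('passage' and 'question').\n",
"records = cleaned_ds.to_dict('records')\n",
"print(f\"Inserted {len(records)} records. Sample record:\")\n",
"records[0]"
],
"metadata": {},
"execution_count": null,
"outputs": []
},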
{
"cell_type": "markdown",
"source": [
"# LLM Application"
],
"metadata": {
"id": "oOMXQEh_01_u"
}
},
{
"cell_type": "code",
"source": [
"# Simple little client class for using different LLM APIs (OpenAI or LiteLLM)\n",
"class LLMClient:\n",
"    def __init__(self, client_type: str = \"openai\", model: str = \"gpt-4\"):\n",
"        self.client_type = client_type\n",
"        self.model = model\n",
"\n",
"        if self.client_type == \"openai\":\n",
"            # Wrap the OpenAI client so its calls are traced in Opik\n",
"            self.client = track_openai(openai.OpenAI())\n",
"        else:\n",
"            # LiteLLM exposes a module-level completion(), so no client object is needed\n",
"            self.client = None\n",
"\n",
"    # LiteLLM query function - use **kwargs to pass arguments like temperature\n",
"    def _get_litellm_response(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n",
"        messages = [\n",
"            {\"role\": \"system\", \"content\": system},\n",
"            {\"role\": \"user\", \"content\": query}\n",
"        ]\n",
"\n",
"        response = litellm.completion(\n",
"            model=self.model,\n",
"            messages=messages,\n",
"            **kwargs\n",
"        )\n",
"\n",
"        return response.choices[0].message.content\n",
"\n",
"    # OpenAI query function - use **kwargs to pass arguments like temperature\n",
"    def _get_openai_response(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n",
"        messages = [\n",
"            {\"role\": \"system\", \"content\": system},\n",
"            {\"role\": \"user\", \"content\": query}\n",
"        ]\n",
"\n",
"        response = self.client.chat.completions.create(\n",
"            model=self.model,\n",
"            messages=messages,\n",
"            **kwargs\n",
"        )\n",
"\n",
"        return response.choices[0].message.content\n",
"\n",
"    def query(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n",
"        if self.client_type == \"openai\":\n",
"            return self._get_openai_response(query, system, **kwargs)\n",
"        else:\n",
"            return self._get_litellm_response(query, system, **kwargs)\n"
],
"metadata": {
"id": "6pElcVpW1qUb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"llm_client = LLMClient(model=MODEL)"
],
"metadata": {
"id": "OTzgFYWk1qRT"
},
"execution_count": null,
"outputs": []
},
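{
"cell_type": "code",
"source": [
"# Smoke test (makes one real API call): verify the client works end-to-end\n",
"# before running the full evaluation. The prompt is just an example.\n",
"llm_client.query(\"In one sentence, what is an LLM hallucination?\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},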
{
"cell_type": "code",
"source": [
"@track\n",
"def chatbot_application(question: str, context: str) -> str:\n",
"    response = llm_client.query(prompt_template.format(context=context, question=question))\n",
"    return response\n"
],
"metadata": {
"id": "-0E5u9Zr1qO0"
},
"execution_count": null,
"outputs": []
},
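{
"cell_type": "code",
"source": [
"# Try the application on a toy example (illustrative inputs); the @track\n",
"# decorator will also log this call as a trace in the Opik project.\n",
"chatbot_application(\n",
"    question=\"What year did the mission launch?\",\n",
"    context=\"The Cassini mission launched in 1997 and reached Saturn in 2004.\"\n",
")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},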
{
"cell_type": "markdown",
"metadata": {
"id": "jA2XRQDQzY2T"
},
"source": [
"# Evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CXoa_D87zY2T"
},
"outputs": [],
"source": [
"# Define the evaluation task: map each dataset item to the inputs the metrics expect\n",
"def evaluation_task(x):\n",
"    return {\n",
"        \"input\": x['question'],\n",
"        \"output\": chatbot_application(x['question'], x['passage']),\n",
"        \"context\": x['passage']\n",
"    }\n"
]
},
{
"cell_type": "code",
"source": [
"# Re-create the client and retrieve the dataset (both already exist above,\n",
"# but this is how you'd fetch them in a fresh session)\n",
"client = Opik()\n",
"dataset = client.get_dataset(name=\"HaluBench\")"
],
"metadata": {
"id": "XlQ-dA9A7SpC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Define the LLM-as-a-judge metrics\n",
"metrics = [Hallucination(), AnswerRelevance()]\n",
"\n",
"# Build a unique, descriptive experiment name\n",
"experiment_name = MODEL + \"_\" + dataset.name + \"_\" + datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
"\n",
"# Run the evaluation\n",
"evaluation = evaluate(\n",
"    experiment_name=experiment_name,\n",
"    dataset=dataset,\n",
"    task=evaluation_task,\n",
"    scoring_metrics=metrics,\n",
"    experiment_config={\n",
"        \"model\": MODEL\n",
"    }\n",
")"
],
"metadata": {
"id": "LW7YCZI67T7k"
},
"execution_count": null,
"outputs": []
},
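{
"cell_type": "code",
"source": [
"# You can also score a single example directly with a metric, which is handy\n",
"# for debugging judge behavior. This sketch assumes the score() signature\n",
"# (input/output/context) used by Opik's LLM-as-a-judge metrics; the example\n",
"# inputs are illustrative and deliberately contain a hallucination.\n",
"result = Hallucination().score(\n",
"    input=\"What is the capital of France?\",\n",
"    output=\"The capital of France is Berlin.\",\n",
"    context=[\"Paris is the capital of France.\"]\n",
")\n",
"print(result.value, result.reason)"
],
"metadata": {},
"execution_count": null,
"outputs": []
}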
],
"metadata": {
"kernelspec": {
"display_name": "comet-eval",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
},
"colab": {
"provenance": [],
"collapsed_sections": [
"rkMMwPik0obY",
"hrSbwEqA14y2",
"kG5cyliF1G6y",
"oOMXQEh_01_u",
"jA2XRQDQzY2T"
],
"include_colab_link": true
}
},
"nbformat": 4,
"nbformat_minor": 0
}