{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/caleb-kaiser/63a115156d140aec9c29fc655a4da38d/3-2-evaluation-llm-based.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"<img src=\"https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-logo.svg\" width=\"250\"/>"
],
"metadata": {
"id": "VyT73jSw8nQJ"
}
},
{
"cell_type": "markdown",
"metadata": {
"id": "4tnPHsVuzY2O"
},
"source": [
"# LLM-Based Evaluation with Opik"
]
},
{
"cell_type": "markdown",
"source": [
"In this exercise, you'll be evaluationg LLM applications with LLM-as-a-judge metrics. You can use OpenAI or open source models via LiteLLM. To make the exercise a little more exciting, you'll be running your evaluations using HaluBench, the popular hallucination dataset."
],
"metadata": {
"id": "oCCyTFCia3A2"
}
},
{
"cell_type": "markdown",
"source": [
"# Imports & Configuration"
],
"metadata": {
"id": "rkMMwPik0obY"
}
},
{
"cell_type": "code",
"source": [
"%pip install opik openai comet_ml litellm --quiet"
],
"metadata": {
"id": "jqGjzWRc0k0n"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MfaBKoVnzY2R"
},
"outputs": [],
"source": [
"import opik\n",
"from opik import Opik, track\n",
"from opik.evaluation import evaluate\n",
"from opik.evaluation.metrics import (Hallucination, AnswerRelevance)\n",
"from opik.integrations.openai import track_openai\n",
"import openai\n",
"import os\n",
"from datetime import datetime\n",
"from getpass import getpass\n",
"import litellm\n",
"\n",
"# Define project name to enable tracing\n",
"os.environ[\"OPIK_PROJECT_NAME\"] = \"llm-based-eval\"\n"
]
},
{
"cell_type": "code",
"source": [
"# opik configs\n",
"if \"OPIK_API_KEY\" not in os.environ:\n",
" os.environ[\"OPIK_API_KEY\"] = getpass(\"Enter your Opik API key: \")\n",
"\n",
"opik.configure()"
],
"metadata": {
"id": "BNDJe4iZ1Ogd"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# OpenAI configuration (ignore if you're using LiteLLM)\n",
"if \"OPENAI_API_KEY\" not in os.environ:\n",
" os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI API key: \")"
],
"metadata": {
"id": "cffXwutl1PBe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"MODEL = \"gpt-4o-mini\""
],
"metadata": {
"id": "8g_BBxJz0zzj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"client = Opik()"
],
"metadata": {
"id": "BiM11ddP1JQj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Prompts & Templates"
],
"metadata": {
"id": "hrSbwEqA14y2"
}
},
{
"cell_type": "code",
"source": [
"prompt_template = \"\"\"Use the following context to answer my question:\n",
"\n",
"### CONTEXT:\n",
"{context}\n",
"\n",
"### QUESTION:\n",
"{question}\n",
"\"\"\""
],
"metadata": {
"id": "Ch0EkZW317Qk"
},
"execution_count": null,
"outputs": []
},
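{
"cell_type": "markdown",
"source": [
"As a quick sanity check, you can fill the template with toy values to see exactly what the model will receive. The `context` and `question` strings below are made up purely for illustration."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Render the template with placeholder values to inspect the final prompt\n",
"print(prompt_template.format(\n",
"    context=\"Paris is the capital of France.\",\n",
"    question=\"What is the capital of France?\"\n",
"))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},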
{
"cell_type": "markdown",
"source": [
"# Dataset"
],
"metadata": {
"id": "kG5cyliF1G6y"
}
},
{
"cell_type": "code",
"source": [
"# Create dataset\n",
"dataset = client.get_or_create_dataset(\n",
" name=\"HaluBench\", description=\"HaluBench dataset\"\n",
")"
],
"metadata": {
"id": "URvNSIYq20Vq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_parquet(\n",
" \"hf://datasets/PatronusAI/HaluBench/data/test-00000-of-00001.parquet\"\n",
")"
],
"metadata": {
"id": "SiatWxGI3NCy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df.head()"
],
"metadata": {
"id": "BK_DY_5_3T0N"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"cleaned_ds = df.drop(['answer', 'label', 'source_ds', 'id'], axis=1).iloc[0:100]"
],
"metadata": {
"id": "cGs9V96R5B4f"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"dataset.insert(cleaned_ds.to_dict('records'))"
],
"metadata": {
"id": "5qbQottd8d2-"
},
"execution_count": null,
"outputs": []
},
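{
"cell_type": "markdown",
"source": [
"It's worth peeking at one record to confirm the shape of the data: after dropping the extra columns, each item should contain just a `passage` and a `question`, which is what the application and evaluation task below expect."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Inspect a single record to confirm the fields the evaluation task will use\n",
"cleaned_ds.to_dict('records')[0]"
],
"metadata": {},
"execution_count": null,
"outputs": []
},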
{
"cell_type": "markdown",
"source": [
"# LLM Application"
],
"metadata": {
"id": "oOMXQEh_01_u"
}
},
{
"cell_type": "code",
"source": [
"# Simple little client class for using different LLM APIs (OpenAI or LiteLLM)\n",
"class LLMClient:\n",
" def __init__(self, client_type: str =\"openai\", model: str =\"gpt-4\"):\n",
" self.client_type = client_type\n",
" self.model = model\n",
"\n",
" if self.client_type == \"openai\":\n",
" self.client = track_openai(openai.OpenAI())\n",
"\n",
" else:\n",
" self.client = None\n",
"\n",
" # LiteLLM query function\n",
" def _get_litellm_response(self, query: str, system: str = \"You are a helpful assistant.\"):\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system },\n",
" { \"role\": \"user\", \"content\": query }\n",
" ]\n",
"\n",
" response = litellm.completion(\n",
" model=self.model,\n",
" messages=messages\n",
" )\n",
"\n",
" return response.choices[0].message.content\n",
"\n",
" # OpenAI query function - use **kwargs to pass arguments like temperature\n",
" def _get_openai_response(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": system },\n",
" { \"role\": \"user\", \"content\": query }\n",
" ]\n",
"\n",
" response = self.client.chat.completions.create(\n",
" model=self.model,\n",
" messages=messages,\n",
" **kwargs\n",
" )\n",
"\n",
" return response.choices[0].message.content\n",
"\n",
"\n",
" def query(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n",
" if self.client_type == 'openai':\n",
" return self._get_openai_response(query, system, **kwargs)\n",
"\n",
" else:\n",
" return self._get_litellm_response(query, system)\n"
],
"metadata": {
"id": "6pElcVpW1qUb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"llm_client = LLMClient(model=MODEL)"
],
"metadata": {
"id": "OTzgFYWk1qRT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"@track\n",
"def chatbot_application(question: str, context: str) -> str:\n",
" response = llm_client.query(prompt_template.format(context=context, question=question))\n",
" return response\n"
],
"metadata": {
"id": "-0E5u9Zr1qO0"
},
"execution_count": null,
"outputs": []
},
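{
"cell_type": "markdown",
"source": [
"Before kicking off a full evaluation run, it's cheap to smoke-test the application on a single record from the dataframe. This makes one real LLM call, so it also verifies your API key and tracing setup."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Smoke test: run the application on one HaluBench record\n",
"sample = cleaned_ds.to_dict('records')[0]\n",
"print(chatbot_application(sample['question'], sample['passage']))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},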
{
"cell_type": "markdown",
"metadata": {
"id": "jA2XRQDQzY2T"
},
"source": [
"# Evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CXoa_D87zY2T"
},
"outputs": [],
"source": [
"# Define the evaluation task\n",
"def evaluation_task(x):\n",
" return {\n",
" \"input\": x['question'],\n",
" \"output\": chatbot_application(x['question'], x['passage']),\n",
" \"context\": x['passage']\n",
" }\n"
]
},
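{
"cell_type": "markdown",
"source": [
"Before scoring the whole dataset, you can try a judge metric on a single hand-written example. The sketch below assumes the `Hallucination` metric exposes a `score(input=..., output=..., context=[...])` method returning a result with `value` and `reason` fields, per the Opik docs; the example strings themselves are made up, with the output deliberately contradicting the context."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Score one deliberately hallucinated answer with the LLM-as-a-judge metric\n",
"hallucination_metric = Hallucination()\n",
"result = hallucination_metric.score(\n",
"    input=\"What is the capital of France?\",\n",
"    output=\"The capital of France is Berlin.\",\n",
"    context=[\"Paris is the capital of France.\"],\n",
")\n",
"print(result.value)\n",
"print(result.reason)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},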
{
"cell_type": "code",
"source": [
"# Retrieve the dataset\n",
"client = Opik()"
],
"metadata": {
"id": "XlQ-dA9A7SpC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Define the metrics\n",
"metrics = [Hallucination(), AnswerRelevance()]\n",
"\n",
"# experiment_name\n",
"experiment_name = MODEL + \"_\" + dataset.name + \"_\" + datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")\n",
"\n",
"# run evaluation\n",
"evaluation = evaluate(\n",
" experiment_name=experiment_name,\n",
" dataset=dataset,\n",
" task=evaluation_task,\n",
" scoring_metrics=metrics,\n",
" experiment_config={\n",
" \"model\": MODEL\n",
" }\n",
")"
],
"metadata": {
"id": "LW7YCZI67T7k"
},
"execution_count": null,
"outputs": []
},
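{
"cell_type": "markdown",
"source": [
"Once the run completes, the per-item scores and traces are logged to Opik as an experiment. The cell below just prints the generated experiment name so you can find the run in the Opik UI; the `evaluation` object returned by `evaluate()` also holds the results if you'd rather inspect them programmatically."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Look up this experiment name in the Opik UI to review the judge scores\n",
"print(experiment_name)"
],
"metadata": {},
"execution_count": null,
"outputs": []
}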
],
"metadata": {
"kernelspec": {
"display_name": "comet-eval",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
},
"colab": {
"provenance": [],
"collapsed_sections": [
"rkMMwPik0obY",
"hrSbwEqA14y2",
"kG5cyliF1G6y",
"oOMXQEh_01_u",
"jA2XRQDQzY2T"
],
"include_colab_link": true
}
},
"nbformat": 4,
"nbformat_minor": 0
}