lukehinds · December 13, 2025 16:23
diff --git a/datasets.ipynb b/datasets.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7ed1c49",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q datasets transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fcf1a44d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2c7fedb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ==============================================================================\n",
    "# STANDARD: SEQUENTIAL MULTI-STEP TOOL CALLING\n",
    "# ==============================================================================\n",
    "# This dataset example demonstrates one user request that the assistant answers\n",
    "# with three different tool calls issued one after another, each followed by its\n",
    "# tool result, and then a final summary message.\n",
    "#\n",
    "# `messages` is a list of dataset records; each record stores its conversation\n",
    "# under the \"messages\" key.\n",
    "#\n",
    "# CONSTRAINTS:\n",
    "# 1. 'id' field is MANDATORY.\n",
    "# 2. 'id' must be exactly 9 alphanumeric characters (Regex: ^[a-zA-Z0-9]{9}$).\n",
    "# 3. 'arguments' in 'assistant' block are DICTIONARIES (not strings).\n",
    "#    A pre-serialized JSON string gets encoded a second time by the template.\n",
    "# ==============================================================================\n",
    "\n",
    "messages = [\n",
    "    {\n",
    "        \"messages\": [\n",
    "            {\"role\": \"user\", \"content\": \"Investigate issue #342 in acme-corp/web-platform - get the issue details, find related commits, and show me any PRs that address it\"},\n",
    "\n",
    "            {\"role\": \"assistant\", \"content\": \"\", \"tool_calls\": [{\"function\": {\"name\": \"issue_read\", \"arguments\": {\"owner\": \"acme-corp\", \"repo\": \"web-platform\", \"issue_number\": 342, \"method\": \"get\"}}, \"id\": \"call00001\", \"type\": \"function\"}]},\n",
    "            {\"role\": \"tool\", \"tool_call_id\": \"call00001\", \"content\": \"{...issue #342 details...}\"},\n",
    "\n",
    "            {\"role\": \"assistant\", \"content\": \"\", \"tool_calls\": [{\"function\": {\"name\": \"list_commits\", \"arguments\": {\"owner\": \"acme-corp\", \"repo\": \"web-platform\"}}, \"id\": \"call00002\", \"type\": \"function\"}]},\n",
    "            {\"role\": \"tool\", \"tool_call_id\": \"call00002\", \"content\": \"{...commits...}\"},\n",
    "\n",
    "            {\"role\": \"assistant\", \"content\": \"\", \"tool_calls\": [{\"function\": {\"name\": \"list_pull_requests\", \"arguments\": {\"owner\": \"acme-corp\", \"repo\": \"web-platform\"}}, \"id\": \"call00003\", \"type\": \"function\"}]},\n",
    "            {\"role\": \"tool\", \"tool_call_id\": \"call00003\", \"content\": \"{...PRs...}\"},\n",
    "\n",
    "            {\"role\": \"assistant\", \"content\": \"Here's my complete investigation of issue #342:\\n\\n**Issue Details:**\\nTitle: Authentication fails silently...\\n\\n**Related Commits:**\\n- abc123: feat(auth): implement token refresh...\\n\\n**Related PRs:**\\n- PR #345: Fix authentication silent failure\\n\\nThe issue appears to be addressed by PR #345.\"}\n",
    "        ]\n",
    "    }\n",
    "]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5528837a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ==============================================================================\n",
    "# TOOL DEFINITIONS (SCHEMA)\n",
    "# ==============================================================================\n",
    "# Required by chat templates to generate the system prompt instructions.\n",
    "\n",
    "tools = [\n",
    "    {\n",
    "        \"type\": \"function\",\n",
    "        \"function\": {\n",
    "            \"name\": \"issue_read\",\n",
    "            \"description\": \"Get details of a specific issue in a repository\",\n",
    "            \"parameters\": {\n",
    "                \"type\": \"object\",\n",
    "                \"properties\": {\n",
    "                    \"owner\": {\"type\": \"string\"},\n",
    "                    \"repo\": {\"type\": \"string\"},\n",
    "                    \"issue_number\": {\"type\": \"integer\"},\n",
    "                    \"method\": {\"type\": \"string\", \"enum\": [\"get\"]}\n",
    "                },\n",
    "                \"required\": [\"owner\", \"repo\", \"issue_number\", \"method\"]\n",
    "            }\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"type\": \"function\",\n",
    "        \"function\": {\n",
    "            \"name\": \"list_commits\",\n",
    "            \"description\": \"Get the list of commits for a repository\",\n",
    "            \"parameters\": {\n",
    "                \"type\": \"object\",\n",
    "                \"properties\": {\n",
    "                    \"owner\": {\"type\": \"string\"},\n",
    "                    \"repo\": {\"type\": \"string\"}\n",
    "                },\n",
    "                \"required\": [\"owner\", \"repo\"]\n",
    "            }\n",
    "        }\n",
    "    },\n",
    "    {\n",
    "        \"type\": \"function\",\n",
    "        \"function\": {\n",
    "            \"name\": \"list_pull_requests\",\n",
    "            \"description\": \"Get the list of pull requests for a repository\",\n",
    "            \"parameters\": {\n",
    "                \"type\": \"object\",\n",
    "                \"properties\": {\n",
    "                    \"owner\": {\"type\": \"string\"},\n",
    "                    \"repo\": {\"type\": \"string\"}\n",
    "                },\n",
    "                \"required\": [\"owner\", \"repo\"]\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "668bbd80",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== Formatted Chat Template ===\n",
      "<|im_start|>system\n",
      "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n",
      "\n",
      "# Tools\n",
      "\n",
      "You may call one or more functions to assist with the user query.\n",
      "\n",
      "You are provided with function signatures within <tools></tools> XML tags:\n",
      "<tools>\n",
      "{\"type\": \"function\", \"function\": {\"name\": \"issue_read\", \"description\": \"Get details of a specific issue in a repository\", \"parameters\": {\"type\": \"object\", \"properties\": {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}, \"issue_number\": {\"type\": \"integer\"}, \"method\": {\"type\": \"string\", \"enum\": [\"get\"]}}, \"required\": [\"owner\", \"repo\", \"issue_number\", \"method\"]}}}\n",
      "{\"type\": \"function\", \"function\": {\"name\": \"list_commits\", \"description\": \"Get the list of commits for a repository\", \"parameters\": {\"type\": \"object\", \"properties\": {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}}, \"required\": [\"owner\", \"repo\"]}}}\n",
      "{\"type\": \"function\", \"function\": {\"name\": \"list_pull_requests\", \"description\": \"Get the list of pull requests for a repository\", \"parameters\": {\"type\": \"object\", \"properties\": {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}}, \"required\": [\"owner\", \"repo\"]}}}\n",
      "</tools>\n",
      "\n",
      "For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n",
      "<tool_call>\n",
      "{\"name\": <function-name>, \"arguments\": <args-json-object>}\n",
      "</tool_call><|im_end|>\n",
      "<|im_start|>user\n",
      "Investigate issue #342 in acme-corp/web-platform - get the issue details, find related commits, and show me any PRs that address it<|im_end|>\n",
      "<|im_start|>assistant\n",
      "<tool_call>\n",
      "{\"name\": \"issue_read\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\",\\\"issue_number\\\":342,\\\"method\\\":\\\"get\\\"}\"}\n",
      "</tool_call><|im_end|>\n",
      "<|im_start|>user\n",
      "<tool_response>\n",
      "{...issue #342 details...}\n",
      "</tool_response><|im_end|>\n",
      "<|im_start|>assistant\n",
      "<tool_call>\n",
      "{\"name\": \"list_commits\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\"}\"}\n",
      "</tool_call><|im_end|>\n",
      "<|im_start|>user\n",
      "<tool_response>\n",
      "{...commits...}\n",
      "</tool_response><|im_end|>\n",
      "<|im_start|>assistant\n",
      "<tool_call>\n",
      "{\"name\": \"list_pull_requests\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\"}\"}\n",
      "</tool_call><|im_end|>\n",
      "<|im_start|>user\n",
      "<tool_response>\n",
      "{...PRs...}\n",
      "</tool_response><|im_end|>\n",
      "<|im_start|>assistant\n",
      "Here's my complete investigation of issue #342:\n",
      "\n",
      "**Issue Details:**\n",
      "Title: Authentication fails silently...\n",
      "\n",
      "**Related Commits:**\n",
      "- abc123: feat(auth): implement token refresh...\n",
      "\n",
      "**Related PRs:**\n",
      "- PR #345: Fix authentication silent failure\n",
      "\n",
      "The issue appears to be addressed by PR #345.<|im_end|>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# ==============================================================================\n",
    "# Load a single example into the tokenizer's chat template from variable\n",
    "# ==============================================================================\n",
    "\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "\n",
    "# Extract the actual conversation\n",
    "conversation = messages[0][\"messages\"]\n",
    "\n",
    "result = tokenizer.apply_chat_template(\n",
    "    conversation,\n",
    "    tools=tools,\n",
    "    tokenize=False\n",
    ")\n",
    "\n",
    "# Print the formatted chat template\n",
    "print(\"=== Formatted Chat Template ===\")\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92cd4544",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ==============================================================================\n",
    "# From huggingface dataset\n",
    "# ==============================================================================\n",
    "\n",
    "from datasets import load_dataset  # noqa: F811\n",
    "\n",
    "# Placeholder name: replace with the dataset to format.\n",
    "dataset = load_dataset(\"your-dataset-name\")\n",
    "\n",
    "# Apply to single example\n",
    "result = tokenizer.apply_chat_template(\n",
    "    dataset[\"train\"][0][\"messages\"], # type: ignore\n",
    "    tools=tools,\n",
    "    tokenize=False)\n",
    "\n",
    "# Or map over entire dataset\n",
    "def format_conversation(example):\n",
    "    \"\"\"Render one example's \"messages\" into a string stored under \"text\".\n",
    "\n",
    "    The tool schema is passed here as well, so mapped rows are rendered with\n",
    "    the same arguments as the single-example call above.\n",
    "    \"\"\"\n",
    "    text = tokenizer.apply_chat_template(\n",
    "        example[\"messages\"],\n",
    "        tools=tools,\n",
    "        tokenize=False,\n",
    "    )\n",
    "    return {\"text\": text}\n",
    "\n",
    "formatted_dataset = dataset.map(format_conversation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d8ec47c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ==============================================================================\n",
    "# For processing many examples from a dataset\n",
    "# ==============================================================================\n",
    "\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "# Placeholder name: replace with the dataset to format.\n",
    "dataset = load_dataset(\"your-dataset-name\")\n",
    "\n",
    "# Method 1: per-example map (recommended for large datasets)\n",
    "def format_conversation(example):\n",
    "    \"\"\"Render one example's \"messages\" into a string stored under \"text\".\"\"\"\n",
    "    template_kwargs = {\"tokenize\": False, \"add_generation_prompt\": False}\n",
    "    return {\"text\": tokenizer.apply_chat_template(example[\"messages\"], **template_kwargs)}\n",
    "\n",
    "formatted_dataset = dataset.map(format_conversation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87805c00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ==============================================================================\n",
    "# Using batched map (faster)\n",
    "# ==============================================================================\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "\n",
    "# Placeholder name: replace with the dataset to format.\n",
    "dataset = load_dataset(\"your-dataset-name\")\n",
    "\n",
    "# Method 2: batched map (faster)\n",
    "def format_conversations_batched(examples):\n",
    "    \"\"\"Render every conversation in examples[\"messages\"]; return them under \"text\".\"\"\"\n",
    "    rendered = []\n",
    "    for conversation in examples[\"messages\"]:\n",
    "        rendered.append(\n",
    "            tokenizer.apply_chat_template(\n",
    "                conversation,\n",
    "                tokenize=False,\n",
    "                add_generation_prompt=False,\n",
    "            )\n",
    "        )\n",
    "    return {\"text\": rendered}\n",
    "\n",
    "formatted_dataset = dataset.map(format_conversations_batched, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c878ad1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ==============================================================================\n",
    "# Splitting train/test from files\n",
    "# ==============================================================================\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "\n",
    "# Option A: train/test split from separate files.\n",
    "# Passing a dict of data_files yields one split per key (\"train\" and \"test\").\n",
    "dataset = load_dataset(\"json\", data_files={\n",
    "    \"train\": \"train.json\",\n",
    "    \"test\": \"test.json\"\n",
    "})\n",
    "\n",
    "# Option B: carve an eval set out of a single split.\n",
    "# train_test_split is a method of a single Dataset, not of the DatasetDict\n",
    "# returned above, so select the \"train\" split before splitting.\n",
    "splits = dataset[\"train\"].train_test_split(test_size=0.1, seed=42)\n",
    "train_ds = splits[\"train\"]\n",
    "eval_ds = splits[\"test\"]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4eb4d9e0-5eeb-4b08-9f61-549998758ddf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. TEST ACROSS ARCHITECTURES\n",
    "# ---------------------------------------------------------\n",
    "# Render the same conversation through several chat templates to see which\n",
    "# model families accept the dataset format.\n",
    "model_map = {\n",
    "    \"Llama-3\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    \"Qwen-2.5\": \"Qwen/Qwen2.5-7B-Instruct\",\n",
    "    \"Mistral\": \"mistralai/Mistral-Nemo-Instruct-2407\",\n",
    "    \"Command-R\": \"estrogen/c4ai-command-r7b-12-2024\"\n",
    "}\n",
    "\n",
    "# `messages` is a list of dataset records, not a conversation. The chat\n",
    "# template needs the list of role/content turns stored under \"messages\".\n",
    "conversation = messages[0][\"messages\"]\n",
    "\n",
    "print(f\"{'MODEL':<15} | {'STATUS':<50}\")\n",
    "print(\"-\" * 65)\n",
    "\n",
    "for name, model_id in model_map.items():\n",
    "    try:\n",
    "        tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "\n",
    "        # NOTE: Mistral requires valid tool definitions passed to apply_chat_template\n",
    "        # to trigger the tool-use logic correctly.\n",
    "        prompt = tokenizer.apply_chat_template(\n",
    "            conversation,\n",
    "            tools=tools,\n",
    "            tokenize=False,\n",
    "            add_generation_prompt=True\n",
    "        )\n",
    "\n",
    "        # Printing just the tool call section for brevity\n",
    "        print(f\"\\n[{name}] Success! Preview of Tool Call:\")\n",
    "\n",
    "        # Substring expected to introduce the tool call for this model family.\n",
    "        if \"Mistral\" in name:\n",
    "            marker = \"[TOOL_CALLS]\"\n",
    "        elif \"Llama\" in name:\n",
    "            marker = \"<|python_tag|>\"\n",
    "        else:\n",
    "            marker = None\n",
    "\n",
    "        start = prompt.find(marker) if marker else -1\n",
    "        if start >= 0:\n",
    "            print(prompt[start:start+150] + \"...\")\n",
    "        else:\n",
    "            # No marker for this family, or marker not found: find() returned -1,\n",
    "            # and slicing from -1 would print an empty preview. Show the tail instead.\n",
    "            print(prompt[-300:])\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"\\n[{name}] Error: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cee35917",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loading this tokenizer without extra flags prints a warning (it is not an error):\n",
    "# \"The tokenizer you are loading from 'mistralai/Mistral-Nemo-Instruct-2407' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e.  This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.\"\n",
    "\n",
    "model_id = \"mistralai/Mistral-Nemo-Instruct-2407\"\n",
    "\n",
    "# `messages` is a list of dataset records, not a conversation. The chat\n",
    "# template needs the list of role/content turns stored under \"messages\".\n",
    "conversation = messages[0][\"messages\"]\n",
    "\n",
    "try:\n",
    "    print(f\"Validating standard against tokenizer: {model_id}...\")\n",
    "\n",
    "    # 'fix_mistral_regex=True' is the flag the warning above asks for.\n",
    "    tokenizer = AutoTokenizer.from_pretrained(model_id, fix_mistral_regex=True)\n",
    "\n",
    "    # Render the prompt\n",
    "    prompt = tokenizer.apply_chat_template(\n",
    "        conversation,\n",
    "        tools=tools,\n",
    "        tokenize=False,\n",
    "        add_generation_prompt=True\n",
    "    )\n",
    "\n",
    "    print(\"\\n[VALIDATION SUCCESSFUL] Output Template Preview:\\n\")\n",
    "    print(\"-\" * 60)\n",
    "\n",
    "    # Show the span from the first tool call to the end of the last tool result.\n",
    "    # find()/rfind() return -1 on a miss, so fall back to the whole prompt\n",
    "    # instead of slicing with a bogus index.\n",
    "    end_marker = \"[/TOOL_RESULTS]\"\n",
    "    start_calls = prompt.find(\"[TOOL_CALLS]\")\n",
    "    end_results = prompt.rfind(end_marker)\n",
    "\n",
    "    if start_calls >= 0 and end_results >= start_calls:\n",
    "        print(prompt[start_calls:end_results + len(end_marker)])\n",
    "    else:\n",
    "        print(prompt)\n",
    "    print(\"-\" * 60)\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"\\n[VALIDATION FAILED] Error: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fbf01bc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "a7ed1c49",
	"metadata": {},
	"outputs": [],
	"source": [
	"%%capture\n",
	"!pip install -q datasets transformers"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "fcf1a44d",
	"metadata": {},
	"outputs": [],
	"source": [
	"from transformers import AutoTokenizer\n",
	"import json"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "e2c7fedb",
	"metadata": {},
	"outputs": [],
	"source": [
	"# ==============================================================================\n",
	"# STANDARD: PARALLEL HETEROGENEOUS TOOL CALLING\n",
	"# ==============================================================================\n",
	"# This dataset example demonstrates a single turn where the model must fire \n",
	"# two distinct tools simultaneously.\n",
	"#\n",
	"# CONSTRAINTS:\n",
	"# 1. 'id' field is MANDATORY. \n",
	"# 2. 'id' must be exactly 9 alphanumeric characters (Regex: ^[a-zA-Z0-9]{9}$).\n",
	"# 3. 'arguments' in 'assistant' block are DICTIONARIES (not strings).\n",
	"# ==============================================================================\n",
	"\n",
	"messages = [\n",
	" {\n",
	" \"messages\": [\n",
	" {\"role\": \"user\", \"content\": \"Investigate issue #342 in acme-corp/web-platform - get the issue details, find related commits, and show me any PRs that address it\"},\n",
	" \n",
	" {\"role\": \"assistant\", \"content\": \"\", \"tool_calls\": [{\"function\": {\"name\": \"issue_read\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\",\\\"issue_number\\\":342,\\\"method\\\":\\\"get\\\"}\"}, \"id\": \"call1\", \"type\": \"function\"}]},\n",
	" {\"role\": \"tool\", \"tool_call_id\": \"call1\", \"content\": \"{...issue #342 details...}\"},\n",
	" \n",
	" {\"role\": \"assistant\", \"content\": \"\", \"tool_calls\": [{\"function\": {\"name\": \"list_commits\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\"}\"}, \"id\": \"call2\", \"type\": \"function\"}]},\n",
	" {\"role\": \"tool\", \"tool_call_id\": \"call2\", \"content\": \"{...commits...}\"},\n",
	" \n",
	" {\"role\": \"assistant\", \"content\": \"\", \"tool_calls\": [{\"function\": {\"name\": \"list_pull_requests\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\"}\"}, \"id\": \"call3\", \"type\": \"function\"}]},\n",
	" {\"role\": \"tool\", \"tool_call_id\": \"call3\", \"content\": \"{...PRs...}\"},\n",
	" \n",
	" {\"role\": \"assistant\", \"content\": \"Here's my complete investigation of issue #342:\\n\\nIssue Details:\\nTitle: Authentication fails silently...\\n\\nRelated Commits:\\n- abc123: feat(auth): implement token refresh...\\n\\nRelated PRs:\\n- PR #345: Fix authentication silent failure\\n\\nThe issue appears to be addressed by PR #345.\"}\n",
	" ]\n",
	" }\n",
	"]\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "5528837a",
	"metadata": {},
	"outputs": [],
	"source": [
	"# ==============================================================================\n",
	"# TOOL DEFINITIONS (SCHEMA)\n",
	"# ==============================================================================\n",
	"# Required by chat templates to generate the system prompt instructions.\n",
	"\n",
	"tools = [\n",
	" {\n",
	" \"type\": \"function\",\n",
	" \"function\": {\n",
	" \"name\": \"issue_read\",\n",
	" \"description\": \"Get details of a specific issue in a repository\",\n",
	" \"parameters\": {\n",
	" \"type\": \"object\",\n",
	" \"properties\": {\n",
	" \"owner\": {\"type\": \"string\"},\n",
	" \"repo\": {\"type\": \"string\"},\n",
	" \"issue_number\": {\"type\": \"integer\"},\n",
	" \"method\": {\"type\": \"string\", \"enum\": [\"get\"]}\n",
	" },\n",
	" \"required\": [\"owner\", \"repo\", \"issue_number\", \"method\"]\n",
	" }\n",
	" }\n",
	" },\n",
	" {\n",
	" \"type\": \"function\",\n",
	" \"function\": {\n",
	" \"name\": \"list_commits\",\n",
	" \"description\": \"Get the list of commits for a repository\",\n",
	" \"parameters\": {\n",
	" \"type\": \"object\",\n",
	" \"properties\": {\n",
	" \"owner\": {\"type\": \"string\"},\n",
	" \"repo\": {\"type\": \"string\"}\n",
	" },\n",
	" \"required\": [\"owner\", \"repo\"]\n",
	" }\n",
	" }\n",
	" },\n",
	" {\n",
	" \"type\": \"function\",\n",
	" \"function\": {\n",
	" \"name\": \"list_pull_requests\",\n",
	" \"description\": \"Get the list of pull requests for a repository\",\n",
	" \"parameters\": {\n",
	" \"type\": \"object\",\n",
	" \"properties\": {\n",
	" \"owner\": {\"type\": \"string\"},\n",
	" \"repo\": {\"type\": \"string\"}\n",
	" },\n",
	" \"required\": [\"owner\", \"repo\"]\n",
	" }\n",
	" }\n",
	" }\n",
	"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "668bbd80",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"=== Formatted Chat Template ===\n",
	"<|im_start|>system\n",
	"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n",
	"\n",
	"# Tools\n",
	"\n",
	"You may call one or more functions to assist with the user query.\n",
	"\n",
	"You are provided with function signatures within <tools></tools> XML tags:\n",
	"<tools>\n",
	"{\"type\": \"function\", \"function\": {\"name\": \"issue_read\", \"description\": \"Get details of a specific issue in a repository\", \"parameters\": {\"type\": \"object\", \"properties\": {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}, \"issue_number\": {\"type\": \"integer\"}, \"method\": {\"type\": \"string\", \"enum\": [\"get\"]}}, \"required\": [\"owner\", \"repo\", \"issue_number\", \"method\"]}}}\n",
	"{\"type\": \"function\", \"function\": {\"name\": \"list_commits\", \"description\": \"Get the list of commits for a repository\", \"parameters\": {\"type\": \"object\", \"properties\": {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}}, \"required\": [\"owner\", \"repo\"]}}}\n",
	"{\"type\": \"function\", \"function\": {\"name\": \"list_pull_requests\", \"description\": \"Get the list of pull requests for a repository\", \"parameters\": {\"type\": \"object\", \"properties\": {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}}, \"required\": [\"owner\", \"repo\"]}}}\n",
	"</tools>\n",
	"\n",
	"For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n",
	"<tool_call>\n",
	"{\"name\": <function-name>, \"arguments\": <args-json-object>}\n",
	"</tool_call><|im_end|>\n",
	"<|im_start|>user\n",
	"Investigate issue #342 in acme-corp/web-platform - get the issue details, find related commits, and show me any PRs that address it<|im_end|>\n",
	"<|im_start|>assistant\n",
	"<tool_call>\n",
	"{\"name\": \"issue_read\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\",\\\"issue_number\\\":342,\\\"method\\\":\\\"get\\\"}\"}\n",
	"</tool_call><|im_end|>\n",
	"<|im_start|>user\n",
	"<tool_response>\n",
	"{...issue #342 details...}\n",
	"</tool_response><|im_end|>\n",
	"<|im_start|>assistant\n",
	"<tool_call>\n",
	"{\"name\": \"list_commits\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\"}\"}\n",
	"</tool_call><|im_end|>\n",
	"<|im_start|>user\n",
	"<tool_response>\n",
	"{...commits...}\n",
	"</tool_response><|im_end|>\n",
	"<|im_start|>assistant\n",
	"<tool_call>\n",
	"{\"name\": \"list_pull_requests\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\"}\"}\n",
	"</tool_call><|im_end|>\n",
	"<|im_start|>user\n",
	"<tool_response>\n",
	"{...PRs...}\n",
	"</tool_response><|im_end|>\n",
	"<|im_start|>assistant\n",
	"Here's my complete investigation of issue #342:\n",
	"\n",
	"Issue Details:\n",
	"Title: Authentication fails silently...\n",
	"\n",
	"Related Commits:\n",
	"- abc123: feat(auth): implement token refresh...\n",
	"\n",
	"Related PRs:\n",
	"- PR #345: Fix authentication silent failure\n",
	"\n",
	"The issue appears to be addressed by PR #345.<|im_end|>\n",
	"\n"
	]
	}
	],
	"source": [
	"# ==============================================================================\n",
	"# Load a single example into the tokenizer's chat template from variable\n",
	"# ==============================================================================\n",
	"\n",
	"from datasets import load_dataset\n",
	"from transformers import AutoTokenizer\n",
	"\n",
	"model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
	"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
	"\n",
	"# Extract the actual conversation\n",
	"conversation = messages[0][\"messages\"]\n",
	"\n",
	"result = tokenizer.apply_chat_template(\n",
	" conversation,\n",
	" tools=tools,\n",
	" tokenize=False\n",
	")\n",
	"\n",
	"# Print the formatted chat template\n",
	"print(\"=== Formatted Chat Template ===\")\n",
	"print(result)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "92cd4544",
	"metadata": {},
	"outputs": [],
	"source": [
	"# ==============================================================================\n",
	"# From huggingface dataset\n",
	"# ==============================================================================\n",
	"\n",
	"from datasets import load_dataset # noqa: F811\n",
	"\n",
	"dataset = load_dataset(\"your-dataset-name\")\n",
	"\n",
	"# Apply to single example\n",
	"result = tokenizer.apply_chat_template(\n",
	" dataset[\"train\"][0][\"messages\"], # type: ignore\n",
	" tools=tools,\n",
	" tokenize=False)\n",
	"\n",
	"# Or map over entire dataset\n",
	"def format_conversation(example):\n",
	" return {\"text\": tokenizer.apply_chat_template(example[\"messages\"], tokenize=False)}\n",
	"\n",
	"formatted_dataset = dataset.map(format_conversation)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "7d8ec47c",
	"metadata": {},
	"outputs": [],
	"source": [
	"# ==============================================================================\n",
	"# For processing many examples from a dataset\n",
	"# ==============================================================================\n",
	"\n",
	"from datasets import load_dataset\n",
	"from transformers import AutoTokenizer\n",
	"\n",
	"model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
	"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
	"\n",
	"# Load your dataset\n",
	"dataset = load_dataset(\"your-dataset-name\")\n",
	"\n",
	"# Method 1: Using map (recommended for large datasets)\n",
	"def format_conversation(example):\n",
	" text = tokenizer.apply_chat_template(\n",
	" example[\"messages\"], \n",
	" tokenize=False,\n",
	" add_generation_prompt=False\n",
	" )\n",
	" return {\"text\": text}\n",
	"\n",
	"formatted_dataset = dataset.map(format_conversation)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "87805c00",
	"metadata": {},
	"outputs": [],
	"source": [
	"# ==============================================================================\n",
	"# Using batched map (faster)\n",
	"# ==============================================================================\n",
	"from datasets import load_dataset\n",
	"from transformers import AutoTokenizer\n",
	"\n",
	"model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
	"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
	"\n",
	"# Placeholder name: replace with the dataset to format.\n",
	"dataset = load_dataset(\"your-dataset-name\")\n",
	"\n",
	"# Method 2: batched map (faster)\n",
	"def format_conversations_batched(examples):\n",
	"    \"\"\"Render every conversation in examples[\"messages\"]; return them under \"text\".\"\"\"\n",
	"    rendered = []\n",
	"    for conversation in examples[\"messages\"]:\n",
	"        rendered.append(\n",
	"            tokenizer.apply_chat_template(\n",
	"                conversation,\n",
	"                tokenize=False,\n",
	"                add_generation_prompt=False,\n",
	"            )\n",
	"        )\n",
	"    return {\"text\": rendered}\n",
	"\n",
	"formatted_dataset = dataset.map(format_conversations_batched, batched=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "c878ad1e",
	"metadata": {},
	"outputs": [],
	"source": [
	"# ==============================================================================\n",
	"# Splitting train/test from files\n",
	"# ==============================================================================\n",
	"from datasets import load_dataset\n",
	"from transformers import AutoTokenizer\n",
	"\n",
	"model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
	"# Loaded without trust_remote_code, like the other cells that load this same\n",
	"# tokenizer; the flag would allow repository code to run locally.\n",
	"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
	"\n",
	"# Train/test split from separate files\n",
	"dataset = load_dataset(\"json\", data_files={\n",
	"    \"train\": \"train.json\",\n",
	"    \"test\": \"test.json\",\n",
	"})\n",
	"\n",
	"# Or carve an eval set out of a single split.\n",
	"# load_dataset(..., data_files={...}) returns a DatasetDict keyed by split name;\n",
	"# train_test_split is a method of an individual Dataset, so select the split\n",
	"# first (calling it on the DatasetDict raises AttributeError).\n",
	"splits = dataset[\"train\"].train_test_split(test_size=0.1, seed=42)\n",
	"train_ds = splits[\"train\"]\n",
	"eval_ds = splits[\"test\"]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "4eb4d9e0-5eeb-4b08-9f61-549998758ddf",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"MODEL | STATUS \n",
	"-----------------------------------------------------------------\n",
	"\n",
	"[Llama-3] Success! Preview of Tool Call:\n",
	"...\n",
	"\n",
	"[Qwen-2.5] Success! Preview of Tool Call:\n",
	" {\"type\": \"string\"}}, \"required\": [\"owner\", \"repo\"]}}}\n",
	"</tools>\n",
	"\n",
	"For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n",
	"<tool_call>\n",
	"{\"name\": <function-name>, \"arguments\": <args-json-object>}\n",
	"</tool_call><|im_end|>\n",
	"<|im_start|>assistant\n",
	"\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"The tokenizer you are loading from 'mistralai/Mistral-Nemo-Instruct-2407' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"\n",
	"[Mistral] Error: After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\n",
	"\n",
	"[Command-R] Success! Preview of Tool Call:\n",
	"the code.\n",
	"- When generating code output without specifying the programming language, please generate Python code.\n",
	"- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>\n"
	]
	}
	],
	"source": [
	"# 3. TEST ACROSS ARCHITECTURES\n",
	"# ---------------------------------------------------------\n",
	"# Renders the same `messages` / `tools` (defined in earlier cells) with each\n",
	"# model's chat template and prints a short preview of the result.\n",
	"model_map = {\n",
	"    \"Llama-3\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
	"    \"Qwen-2.5\": \"Qwen/Qwen2.5-7B-Instruct\",\n",
	"    \"Mistral\": \"mistralai/Mistral-Nemo-Instruct-2407\",\n",
	"    \"Command-R\": \"estrogen/c4ai-command-r7b-12-2024\"\n",
	"}\n",
	"\n",
	"# Substring of the model label -> marker searched for in the rendered prompt.\n",
	"# Labels matching no entry get a tail preview instead.\n",
	"tool_call_markers = {\n",
	"    \"Mistral\": \"[TOOL_CALLS]\",\n",
	"    \"Llama\": \"<|python_tag|>\",\n",
	"}\n",
	"\n",
	"print(f\"{'MODEL':<15} | {'STATUS':<50}\")\n",
	"print(\"-\" * 65)\n",
	"\n",
	"for name, model_id in model_map.items():\n",
	"    try:\n",
	"        tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
	"\n",
	"        # NOTE: Mistral requires valid tool definitions passed to apply_chat_template\n",
	"        # to trigger the tool-use logic correctly.\n",
	"        prompt = tokenizer.apply_chat_template(\n",
	"            messages,\n",
	"            tools=tools,\n",
	"            tokenize=False,\n",
	"            add_generation_prompt=True,\n",
	"        )\n",
	"\n",
	"        # Printing just the tool call section for brevity\n",
	"        print(f\"\\n[{name}] Success! Preview of Tool Call:\")\n",
	"\n",
	"        marker = next(\n",
	"            (m for family, m in tool_call_markers.items() if family in name),\n",
	"            None,\n",
	"        )\n",
	"        start = prompt.find(marker) if marker is not None else -1\n",
	"\n",
	"        if start >= 0:\n",
	"            # Marker present: show the 150 characters starting at it.\n",
	"            print(prompt[start:start + 150] + \"...\")\n",
	"        else:\n",
	"            if marker is not None:\n",
	"                # str.find returns -1 when the marker is absent; slicing from -1\n",
	"                # would print a near-empty, misleading preview, so say so and\n",
	"                # fall back to the tail of the prompt.\n",
	"                print(f\"(marker {marker!r} not found; showing the end of the prompt)\")\n",
	"            # Fallback preview\n",
	"            print(prompt[-300:])\n",
	"\n",
	"    except Exception as e:\n",
	"        # Deliberately broad: one model failing to load or render must not\n",
	"        # stop the comparison across the remaining models.\n",
	"        print(f\"\\n[{name}] Error: {e}\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "cee35917",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Validating standard against tokenizer: mistralai/Mistral-Nemo-Instruct-2407...\n",
	"\n",
	"[VALIDATION FAILED] Error: After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\n"
	]
	}
	],
	"source": [
	"# Loaded without the flag, this Mistral tokenizer logs a warning (not an exception): \"The tokenizer you are loading from 'mistralai/Mistral-Nemo-Instruct-2407' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.\"\n",
	"\n",
	"model_id = \"mistralai/Mistral-Nemo-Instruct-2407\"\n",
	"\n",
	"try:\n",
	"    print(f\"Validating standard against tokenizer: {model_id}...\")\n",
	"\n",
	"    # fix_mistral_regex=True is the flag the warning quoted above asks for.\n",
	"    tokenizer = AutoTokenizer.from_pretrained(model_id, fix_mistral_regex=True)\n",
	"\n",
	"    # Render the prompt (`messages` and `tools` come from earlier cells)\n",
	"    prompt = tokenizer.apply_chat_template(\n",
	"        messages,\n",
	"        tools=tools,\n",
	"        tokenize=False,\n",
	"        add_generation_prompt=True,\n",
	"    )\n",
	"\n",
	"    print(\"\\n[VALIDATION SUCCESSFUL] Output Template Preview:\\n\")\n",
	"    print(\"-\" * 60)\n",
	"\n",
	"    # Extract the span from the first tool call to the end of the last tool result\n",
	"    calls_marker = \"[TOOL_CALLS]\"\n",
	"    results_marker = \"[/TOOL_RESULTS]\"\n",
	"    start_calls = prompt.find(calls_marker)\n",
	"    last_results = prompt.rfind(results_marker)\n",
	"\n",
	"    if start_calls == -1 or last_results < start_calls:\n",
	"        # find/rfind return -1 when a marker is absent; slicing with those\n",
	"        # values would print an empty or unrelated fragment, so show the\n",
	"        # whole prompt instead.\n",
	"        print(f\"(could not locate {calls_marker} ... {results_marker}; showing full prompt)\")\n",
	"        print(prompt)\n",
	"    else:\n",
	"        end_results = last_results + len(results_marker)\n",
	"        print(prompt[start_calls:end_results])\n",
	"    print(\"-\" * 60)\n",
	"\n",
	"except Exception as e:\n",
	"    # Report the failure instead of raising so the cell always completes.\n",
	"    print(f\"\\n[VALIDATION FAILED] Error: {e}\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "5fbf01bc",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": ".venv",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found