Skip to content

Instantly share code, notes, and snippets.

@lukehinds
Created December 13, 2025 16:23
Show Gist options
  • Select an option

  • Save lukehinds/ce12f7c6625416687aad50ee54aaebb7 to your computer and use it in GitHub Desktop.

Select an option

Save lukehinds/ce12f7c6625416687aad50ee54aaebb7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "a7ed1c49",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# Use the %pip magic (not !pip) so packages are installed into the environment\n",
"# of the running kernel rather than whichever `pip` is first on the shell PATH.\n",
"%pip install -q datasets transformers"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fcf1a44d",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e2c7fedb",
"metadata": {},
"outputs": [],
"source": [
"# ==============================================================================\n",
"# STANDARD: MULTI-STEP (SEQUENTIAL) TOOL CALLING\n",
"# ==============================================================================\n",
"# This dataset example demonstrates one conversation in which the assistant\n",
"# calls three distinct tools one after another (each call is followed by its\n",
"# tool result) and then writes a final summary.\n",
"#\n",
"# CONSTRAINTS:\n",
"# 1. 'id' field is MANDATORY and is echoed as 'tool_call_id' by the tool result.\n",
"# 2. 'id' must be exactly 9 alphanumeric characters (Regex: ^[a-zA-Z0-9]{9}$).\n",
"# 3. 'arguments' in 'assistant' block are DICTIONARIES (not JSON strings);\n",
"#    a pre-serialized string is rendered double-encoded by the chat template.\n",
"# ==============================================================================\n",
"\n",
"messages = [\n",
"    {\n",
"        \"messages\": [\n",
"            {\"role\": \"user\", \"content\": \"Investigate issue #342 in acme-corp/web-platform - get the issue details, find related commits, and show me any PRs that address it\"},\n",
"\n",
"            # Step 1: fetch the issue\n",
"            {\"role\": \"assistant\", \"content\": \"\", \"tool_calls\": [{\"function\": {\"name\": \"issue_read\", \"arguments\": {\"owner\": \"acme-corp\", \"repo\": \"web-platform\", \"issue_number\": 342, \"method\": \"get\"}}, \"id\": \"call00001\", \"type\": \"function\"}]},\n",
"            {\"role\": \"tool\", \"tool_call_id\": \"call00001\", \"content\": \"{...issue #342 details...}\"},\n",
"\n",
"            # Step 2: list commits\n",
"            {\"role\": \"assistant\", \"content\": \"\", \"tool_calls\": [{\"function\": {\"name\": \"list_commits\", \"arguments\": {\"owner\": \"acme-corp\", \"repo\": \"web-platform\"}}, \"id\": \"call00002\", \"type\": \"function\"}]},\n",
"            {\"role\": \"tool\", \"tool_call_id\": \"call00002\", \"content\": \"{...commits...}\"},\n",
"\n",
"            # Step 3: list pull requests\n",
"            {\"role\": \"assistant\", \"content\": \"\", \"tool_calls\": [{\"function\": {\"name\": \"list_pull_requests\", \"arguments\": {\"owner\": \"acme-corp\", \"repo\": \"web-platform\"}}, \"id\": \"call00003\", \"type\": \"function\"}]},\n",
"            {\"role\": \"tool\", \"tool_call_id\": \"call00003\", \"content\": \"{...PRs...}\"},\n",
"\n",
"            # Final answer summarizing the three tool results\n",
"            {\"role\": \"assistant\", \"content\": \"Here's my complete investigation of issue #342:\\n\\n**Issue Details:**\\nTitle: Authentication fails silently...\\n\\n**Related Commits:**\\n- abc123: feat(auth): implement token refresh...\\n\\n**Related PRs:**\\n- PR #345: Fix authentication silent failure\\n\\nThe issue appears to be addressed by PR #345.\"}\n",
"        ]\n",
"    }\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5528837a",
"metadata": {},
"outputs": [],
"source": [
"# ==============================================================================\n",
"# TOOL DEFINITIONS (SCHEMA)\n",
"# ==============================================================================\n",
"# Handed to apply_chat_template(tools=...) so the chat template can list the\n",
"# callable functions in the rendered system prompt.\n",
"\n",
"def _function_tool(name, description, properties, required):\n",
"    \"\"\"Wrap a parameter spec in the {\"type\": \"function\", \"function\": {...}} envelope.\n",
"\n",
"    Key order is kept stable (type, function / name, description, parameters)\n",
"    because it determines the order in which the schema is serialized.\n",
"    \"\"\"\n",
"    return {\n",
"        \"type\": \"function\",\n",
"        \"function\": {\n",
"            \"name\": name,\n",
"            \"description\": description,\n",
"            \"parameters\": {\n",
"                \"type\": \"object\",\n",
"                \"properties\": properties,\n",
"                \"required\": required,\n",
"            },\n",
"        },\n",
"    }\n",
"\n",
"\n",
"tools = [\n",
"    _function_tool(\n",
"        \"issue_read\",\n",
"        \"Get details of a specific issue in a repository\",\n",
"        {\n",
"            \"owner\": {\"type\": \"string\"},\n",
"            \"repo\": {\"type\": \"string\"},\n",
"            \"issue_number\": {\"type\": \"integer\"},\n",
"            \"method\": {\"type\": \"string\", \"enum\": [\"get\"]},\n",
"        },\n",
"        [\"owner\", \"repo\", \"issue_number\", \"method\"],\n",
"    ),\n",
"    _function_tool(\n",
"        \"list_commits\",\n",
"        \"Get the list of commits for a repository\",\n",
"        {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}},\n",
"        [\"owner\", \"repo\"],\n",
"    ),\n",
"    _function_tool(\n",
"        \"list_pull_requests\",\n",
"        \"Get the list of pull requests for a repository\",\n",
"        {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}},\n",
"        [\"owner\", \"repo\"],\n",
"    ),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "668bbd80",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Formatted Chat Template ===\n",
"<|im_start|>system\n",
"You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n",
"\n",
"# Tools\n",
"\n",
"You may call one or more functions to assist with the user query.\n",
"\n",
"You are provided with function signatures within <tools></tools> XML tags:\n",
"<tools>\n",
"{\"type\": \"function\", \"function\": {\"name\": \"issue_read\", \"description\": \"Get details of a specific issue in a repository\", \"parameters\": {\"type\": \"object\", \"properties\": {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}, \"issue_number\": {\"type\": \"integer\"}, \"method\": {\"type\": \"string\", \"enum\": [\"get\"]}}, \"required\": [\"owner\", \"repo\", \"issue_number\", \"method\"]}}}\n",
"{\"type\": \"function\", \"function\": {\"name\": \"list_commits\", \"description\": \"Get the list of commits for a repository\", \"parameters\": {\"type\": \"object\", \"properties\": {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}}, \"required\": [\"owner\", \"repo\"]}}}\n",
"{\"type\": \"function\", \"function\": {\"name\": \"list_pull_requests\", \"description\": \"Get the list of pull requests for a repository\", \"parameters\": {\"type\": \"object\", \"properties\": {\"owner\": {\"type\": \"string\"}, \"repo\": {\"type\": \"string\"}}, \"required\": [\"owner\", \"repo\"]}}}\n",
"</tools>\n",
"\n",
"For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n",
"<tool_call>\n",
"{\"name\": <function-name>, \"arguments\": <args-json-object>}\n",
"</tool_call><|im_end|>\n",
"<|im_start|>user\n",
"Investigate issue #342 in acme-corp/web-platform - get the issue details, find related commits, and show me any PRs that address it<|im_end|>\n",
"<|im_start|>assistant\n",
"<tool_call>\n",
"{\"name\": \"issue_read\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\",\\\"issue_number\\\":342,\\\"method\\\":\\\"get\\\"}\"}\n",
"</tool_call><|im_end|>\n",
"<|im_start|>user\n",
"<tool_response>\n",
"{...issue #342 details...}\n",
"</tool_response><|im_end|>\n",
"<|im_start|>assistant\n",
"<tool_call>\n",
"{\"name\": \"list_commits\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\"}\"}\n",
"</tool_call><|im_end|>\n",
"<|im_start|>user\n",
"<tool_response>\n",
"{...commits...}\n",
"</tool_response><|im_end|>\n",
"<|im_start|>assistant\n",
"<tool_call>\n",
"{\"name\": \"list_pull_requests\", \"arguments\": \"{\\\"owner\\\":\\\"acme-corp\\\",\\\"repo\\\":\\\"web-platform\\\"}\"}\n",
"</tool_call><|im_end|>\n",
"<|im_start|>user\n",
"<tool_response>\n",
"{...PRs...}\n",
"</tool_response><|im_end|>\n",
"<|im_start|>assistant\n",
"Here's my complete investigation of issue #342:\n",
"\n",
"**Issue Details:**\n",
"Title: Authentication fails silently...\n",
"\n",
"**Related Commits:**\n",
"- abc123: feat(auth): implement token refresh...\n",
"\n",
"**Related PRs:**\n",
"- PR #345: Fix authentication silent failure\n",
"\n",
"The issue appears to be addressed by PR #345.<|im_end|>\n",
"\n"
]
}
],
"source": [
"# ==============================================================================\n",
"# Load a single example into the tokenizer's chat template from variable\n",
"# ==============================================================================\n",
"\n",
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer\n",
"\n",
"model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
"# trust_remote_code is deliberately not set: the other cells load this same\n",
"# checkpoint without it, and the flag would permit Hub-hosted code to run locally.\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"\n",
"# `messages` is a list of dataset rows; the turns live under each row's \"messages\" key\n",
"conversation = messages[0][\"messages\"]\n",
"\n",
"# tokenize=False returns the rendered prompt as a string instead of token ids\n",
"result = tokenizer.apply_chat_template(\n",
"    conversation,\n",
"    tools=tools,\n",
"    tokenize=False\n",
")\n",
"\n",
"# Print the formatted chat template\n",
"print(\"=== Formatted Chat Template ===\")\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92cd4544",
"metadata": {},
"outputs": [],
"source": [
"# ==============================================================================\n",
"# From huggingface dataset\n",
"# ==============================================================================\n",
"\n",
"from datasets import load_dataset  # noqa: F811\n",
"\n",
"dataset = load_dataset(\"your-dataset-name\")\n",
"\n",
"# Apply to single example\n",
"result = tokenizer.apply_chat_template(\n",
"    dataset[\"train\"][0][\"messages\"],  # type: ignore\n",
"    tools=tools,\n",
"    tokenize=False,\n",
")\n",
"\n",
"# Or map over entire dataset\n",
"def format_conversation(example):\n",
"    \"\"\"Render one row's ``messages`` into a prompt string stored under ``text``.\n",
"\n",
"    ``tools`` is passed here as well, so the mapped text carries the same tool\n",
"    schema as the single-example call above.\n",
"    \"\"\"\n",
"    return {\"text\": tokenizer.apply_chat_template(example[\"messages\"], tools=tools, tokenize=False)}\n",
"\n",
"formatted_dataset = dataset.map(format_conversation)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d8ec47c",
"metadata": {},
"outputs": [],
"source": [
"# ==============================================================================\n",
"# For processing many examples from a dataset\n",
"# ==============================================================================\n",
"\n",
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer\n",
"\n",
"model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"\n",
"# Load your dataset\n",
"dataset = load_dataset(\"your-dataset-name\")\n",
"\n",
"# Method 1: Using map (recommended for large datasets)\n",
"def format_conversation(example):\n",
"    \"\"\"Render one row's ``messages`` into a prompt string stored under ``text``.\n",
"\n",
"    The module-level ``tools`` schema is passed so the rendered text includes\n",
"    the tool definitions; no generation prompt is appended.\n",
"    \"\"\"\n",
"    text = tokenizer.apply_chat_template(\n",
"        example[\"messages\"],\n",
"        tools=tools,\n",
"        tokenize=False,\n",
"        add_generation_prompt=False,\n",
"    )\n",
"    return {\"text\": text}\n",
"\n",
"formatted_dataset = dataset.map(format_conversation)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87805c00",
"metadata": {},
"outputs": [],
"source": [
"# ==============================================================================\n",
"# Using batched map (faster)\n",
"# ==============================================================================\n",
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer\n",
"\n",
"model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"\n",
"# Load your dataset\n",
"dataset = load_dataset(\"your-dataset-name\")\n",
"\n",
"# Method 2: Using batched map (faster)\n",
"def format_conversations_batched(examples):\n",
"    \"\"\"Render every conversation in a batch; returns ``{\"text\": [str, ...]}``.\n",
"\n",
"    With ``batched=True`` the ``messages`` column arrives as a list of\n",
"    conversations. The module-level ``tools`` schema is passed so each rendered\n",
"    text includes the tool definitions.\n",
"    \"\"\"\n",
"    texts = [\n",
"        tokenizer.apply_chat_template(\n",
"            msgs, tools=tools, tokenize=False, add_generation_prompt=False\n",
"        )\n",
"        for msgs in examples[\"messages\"]\n",
"    ]\n",
"    return {\"text\": texts}\n",
"\n",
"formatted_dataset = dataset.map(format_conversations_batched, batched=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c878ad1e",
"metadata": {},
"outputs": [],
"source": [
"# ==============================================================================\n",
"# Splitting train/test from files\n",
"# ==============================================================================\n",
"from datasets import load_dataset\n",
"from transformers import AutoTokenizer\n",
"\n",
"model_name = \"Qwen/Qwen2.5-3B-Instruct\"\n",
"# trust_remote_code is not set, matching the other cells that load this checkpoint\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"\n",
"# Train/test split from separate files\n",
"dataset = load_dataset(\"json\", data_files={\n",
"    \"train\": \"train.json\",\n",
"    \"test\": \"test.json\"\n",
"})\n",
"\n",
"# Or carve an eval set out of a single split.\n",
"# `dataset` above is a DatasetDict keyed by split name, while train_test_split\n",
"# is a method of an individual Dataset, so select the split before calling it.\n",
"splits = dataset[\"train\"].train_test_split(test_size=0.1, seed=42)\n",
"train_ds = splits[\"train\"]\n",
"eval_ds = splits[\"test\"]\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "4eb4d9e0-5eeb-4b08-9f61-549998758ddf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MODEL | STATUS \n",
"-----------------------------------------------------------------\n",
"\n",
"[Llama-3] Success! Preview of Tool Call:\n",
"...\n",
"\n",
"[Qwen-2.5] Success! Preview of Tool Call:\n",
" {\"type\": \"string\"}}, \"required\": [\"owner\", \"repo\"]}}}\n",
"</tools>\n",
"\n",
"For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n",
"<tool_call>\n",
"{\"name\": <function-name>, \"arguments\": <args-json-object>}\n",
"</tool_call><|im_end|>\n",
"<|im_start|>assistant\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The tokenizer you are loading from 'mistralai/Mistral-Nemo-Instruct-2407' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"[Mistral] Error: After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\n",
"\n",
"[Command-R] Success! Preview of Tool Call:\n",
"the code.\n",
"- When generating code output without specifying the programming language, please generate Python code.\n",
"- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>\n"
]
}
],
"source": [
"# 3. TEST ACROSS ARCHITECTURES\n",
"# ---------------------------------------------------------\n",
"model_map = {\n",
"    \"Llama-3\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
"    \"Qwen-2.5\": \"Qwen/Qwen2.5-7B-Instruct\",\n",
"    \"Mistral\": \"mistralai/Mistral-Nemo-Instruct-2407\",\n",
"    \"Command-R\": \"estrogen/c4ai-command-r7b-12-2024\"\n",
"}\n",
"\n",
"# `messages` is a list of dataset rows; apply_chat_template needs the list of\n",
"# role/content turns stored under a row's \"messages\" key. Passing the rows\n",
"# themselves yields a prompt with no conversation turns in it.\n",
"conversation = messages[0][\"messages\"]\n",
"\n",
"# Text that introduces a tool call in the rendered prompt, keyed by a substring\n",
"# of the model_map name. Families not listed here get a tail preview instead.\n",
"tool_call_markers = {\n",
"    \"Mistral\": \"[TOOL_CALLS]\",\n",
"    \"Llama\": \"<|python_tag|>\",\n",
"}\n",
"\n",
"print(f\"{'MODEL':<15} | {'STATUS':<50}\")\n",
"print(\"-\" * 65)\n",
"\n",
"for name, model_id in model_map.items():\n",
"    try:\n",
"        tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"\n",
"        # NOTE: Mistral requires valid tool definitions passed to apply_chat_template\n",
"        # to trigger the tool-use logic correctly.\n",
"        prompt = tokenizer.apply_chat_template(\n",
"            conversation,\n",
"            tools=tools,\n",
"            tokenize=False,\n",
"            add_generation_prompt=True\n",
"        )\n",
"\n",
"        # Printing just the tool call section for brevity\n",
"        print(f\"\\n[{name}] Success! Preview of Tool Call:\")\n",
"\n",
"        marker = next(\n",
"            (text for family, text in tool_call_markers.items() if family in name),\n",
"            None,\n",
"        )\n",
"        start = prompt.find(marker) if marker is not None else -1\n",
"        if start >= 0:\n",
"            print(prompt[start:start+150] + \"...\")\n",
"        else:\n",
"            # No marker for this family, or the marker is absent from the prompt:\n",
"            # find() returned -1, which would slice to an empty preview.\n",
"            print(prompt[-300:])\n",
"\n",
"    except Exception as e:\n",
"        # Template errors are the result being demonstrated, so report and continue\n",
"        print(f\"\\n[{name}] Error: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "cee35917",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Validating standard against tokenizer: mistralai/Mistral-Nemo-Instruct-2407...\n",
"\n",
"[VALIDATION FAILED] Error: After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\n"
]
}
],
"source": [
"# Loading this tokenizer without `fix_mistral_regex=True` emits a warning (not an error): \"The tokenizer you are loading from 'mistralai/Mistral-Nemo-Instruct-2407' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.\"\n",
"\n",
"model_id = \"mistralai/Mistral-Nemo-Instruct-2407\"\n",
"\n",
"try:\n",
"    print(f\"Validating standard against tokenizer: {model_id}...\")\n",
"\n",
"    # fix_mistral_regex=True is the flag the warning quoted above asks for\n",
"    tokenizer = AutoTokenizer.from_pretrained(model_id, fix_mistral_regex=True)\n",
"\n",
"    # Render the prompt. `messages` is a list of dataset rows, so pass the\n",
"    # role/content turns stored under the first row's \"messages\" key.\n",
"    prompt = tokenizer.apply_chat_template(\n",
"        messages[0][\"messages\"],\n",
"        tools=tools,\n",
"        tokenize=False,\n",
"        add_generation_prompt=True\n",
"    )\n",
"\n",
"    print(\"\\n[VALIDATION SUCCESSFUL] Output Template Preview:\\n\")\n",
"    print(\"-\" * 60)\n",
"\n",
"    # Extract the span from the first tool call to the end of the last tool result\n",
"    calls_marker = \"[TOOL_CALLS]\"\n",
"    results_marker = \"[/TOOL_RESULTS]\"\n",
"    start_calls = prompt.find(calls_marker)\n",
"    last_result = prompt.rfind(results_marker)\n",
"\n",
"    if start_calls >= 0 and last_result >= start_calls:\n",
"        print(prompt[start_calls:last_result + len(results_marker)])\n",
"    else:\n",
"        # A marker is missing, so the slice would be meaningless; show everything\n",
"        print(prompt)\n",
"    print(\"-\" * 60)\n",
"\n",
"except Exception as e:\n",
"    print(f\"\\n[VALIDATION FAILED] Error: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fbf01bc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment