LLM calibration set for quantizing code-focused LLMs with AWQ
calibration_set:
  _templates:
    programming_languages: &programming_languages "Solve the following problem using {{ ['Zephyr', 'Prolog', 'Cobol', 'Apex', 'Crystal', 'Fortran', 'Nim', 'Delphi', 'Ada', 'Objective-C', 'VBA', 'Perl', 'Groovy', 'MATLAB', 'Solidity', 'Visual Basic', 'OCaml', 'Erlang', 'Julia', 'Lisp', 'F#', 'Clojure', 'GDScript', 'Scala', 'R', 'Haskell', 'Ruby', 'Elixir', 'Lua', 'Zig', 'Dart', 'Swift', 'Metal', 'PowerShell', 'PHP', 'Kotlin', 'C', 'Java', 'C++', 'C#', 'Bash/Shell', 'Go', 'Rust', 'TypeScript', 'HTML/CSS', 'SQL', 'JavaScript', 'Python', 'Lean', 'Coq', 'Pony', 'D', 'Racket', 'Haxe', 'x86-64 ASM', 'ARM-64 ASM', 'LLVM IR', 'GLSL', 'CUDA', 'Vulkan'][hash(row|string) % 60] }}\n***\n"
    spoken_languages: &spoken_languages "Answer in {{ ['Arabic', 'Chinese', 'French', 'German', 'Hebrew', 'Hindi', 'Japanese', 'Korean', 'Portuguese', 'Russian', 'Spanish', 'Turkish'][hash(row|string) % 12] }}\n***\n"
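  # How the anchors above resolve: each template is a Jinja-style expression
  # that hashes the row's content and indexes into the language list, so every
  # sampled row gets a deterministic language prefix and coverage is spread
  # evenly over the 60 programming (or 12 spoken) languages. A minimal Python
  # sketch of the selection logic, assuming the tooling's `hash`/`string`
  # filters behave like a stable content hash (illustrative only, not this
  # config's actual implementation):
  #
  #   import hashlib
  #   LANGS = ['Zephyr', 'Prolog', 'Cobol', ...]  # the 60 entries listed above
  #   def pick_language(row) -> str:
  #       # Stable hash of the stringified row, then modulo-index into the list.
  #       digest = hashlib.sha256(str(row).encode()).hexdigest()
  #       return LANGS[int(digest, 16) % len(LANGS)]
  #   prefix = f"Solve the following problem using {pick_language(row)}\n***\n"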
  max_seq_length: 8192
  shuffle: true
  seed: 42
  datasets:
    # Category Summary (Total: 602 samples)
    # =====================================================
    # General chat (24 samples - 3.99%)
    # Instruction and Reasoning tuning (14 samples - 2.33%)
    # Multilingual (36 samples - 5.98%)
    # Tool use (100 samples - 16.61%)
    # Code / Programming / Software Engineering / Devops (336 samples - 55.81%)
    # Math (12 samples - 1.99%)
    # Sciences (16 samples - 2.66%)
    # Medical (8 samples - 1.33%)
    # Finance (8 samples - 1.33%)
    # Business (16 samples - 2.66%)
    # Humanities and Philosophy (8 samples - 1.33%)
    # Creative Writing, Adventure, Roleplay (13 samples - 2.16%)
    # General Knowledge and Pop Culture (2 samples - 0.33%)
    # Specialized skills (8 samples - 1.33%)
    # Misc (1 sample - 0.17%)
    # =====================================================
    # Research
    # =====================================================
    # According to this presentation https://minjiazhang.github.io/courses/fall24-resource/slides/awq.pdf
    # AWQ needs only 64 samples to identify the salient weights that must be preserved.
    #
    # That research predates the boom of MoE (Mixture-of-Experts) models, however,
    # and it is safer to assume that 64 samples from a general-purpose dataset
    # cannot reliably activate, and thus identify, the salient weights of every expert.
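    # For context, a minimal sketch of how a calibration set like this one is
    # typically consumed once flattened into a list of strings, here with
    # AutoAWQ (whose `calib_data` parameter accepts a plain list of calibration
    # texts). The YAML loader below is a hypothetical helper, not part of
    # AutoAWQ:
    #
    #   from awq import AutoAWQForCausalLM
    #   from transformers import AutoTokenizer
    #
    #   model_path = "some/code-model"  # placeholder
    #   model = AutoAWQForCausalLM.from_pretrained(model_path)
    #   tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    #   calib_texts = load_calibration_yaml("calibration.yaml")  # hypothetical
    #   model.quantize(
    #       tokenizer,
    #       quant_config={"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"},
    #       calib_data=calib_texts,  # list[str], one entry per formatted sample
    #   )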
    # General chat (24 samples)
    # ---------------------------------------------------------------------------
    - dataset: HuggingFaceH4/ultrachat_200k
      columns: [messages]
      split: train_sft
      formatter: chat_completion
      num_samples: 8
    - dataset: databricks/databricks-dolly-15k
      split: train
      columns: [instruction, response]
      formatter: prompt_answer
      num_samples: 8
    - dataset: neuralmagic/calibration
      subset: LLM
      split: train
      columns: [messages]
      formatter: chat_completion
      num_samples: 8
    # Instruction and Reasoning tuning (14 samples)
    # ---------------------------------------------------------------------------
    - dataset: HuggingFaceH4/no_robots
      split: train
      columns: [messages]
      formatter: chat_completion
      num_samples: 2
    - dataset: nvidia/HelpSteer
      split: train
      columns: [prompt, response]
      formatter: prompt_answer
      num_samples: 2
    - dataset: garage-bAInd/Open-Platypus
      split: train
      columns: [instruction, output]
      formatter: prompt_answer
      num_samples: 2
    - dataset: PJMixers/grimulkan_physical-reasoning-ShareGPT
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 4
    - dataset: PJMixers/grimulkan_theory-of-mind-ShareGPT
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 4
    # Multilingual (36 samples)
    # ---------------------------------------------------------------------------
    - dataset: HuggingFaceH4/Multilingual-Thinking
      split: train
      columns: [user]
      formatter: raw_text
      num_samples: 32
      formatter_params:
        prefix: *spoken_languages
    - dataset: ServiceNow-AI/M2Lingual
      subset: full_data
      split: train
      columns: [conversation]
      formatter: chat_completion
      num_samples: 4
      streaming: true
    # Tool use (includes the commented-out ToolACE) (100 samples)
    # ---------------------------------------------------------------------------
    # Fails with MiniMax!
    # jinja2.exceptions.TemplateError: Message has tool role, but there was no previous assistant message with a tool call!
    # - dataset: Team-ACE/ToolACE
    #   split: train
    #   columns: [system, conversations]
    #   formatter: chat_completion_with_sysprompt
    #   num_samples: 100
    - dataset: interstellarninja/hermes_reasoning_tool_use
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 100
    # Code / Programming / Software Engineering / Devops (336 samples)
    # ---------------------------------------------------------------------------
    - dataset: deepmind/code_contests
      split: train
      columns: [name]
      formatter: deepmind_code_contests
      num_samples: 50
      streaming: true
    - dataset: dh02391735/stackoverflow-kubernetes-questions
      split: train
      columns: [instruction]
      formatter: raw_text
      num_samples: 8
    - dataset: diversoailab/humaneval-rust
      split: train
      columns: [prompt]
      formatter: raw_text
      num_samples: 100
      formatter_params: # The dataset doesn't actually hardcode the language
        prefix: *programming_languages
    - dataset: ammarnasr/the-stack-rust-clean
      split: train
      columns: [content]
      formatter: raw_text
      num_samples: 8
      streaming: true
      formatter_params:
        prefix: "Explain this code and comment it for a junior dev.\n***\n"
    - dataset: CSJianYang/CodeArena
      split: train
      columns: [messages]
      formatter: chat_completion
      num_samples: 8
    - dataset: nvidia/OpenCodeInstruct
      split: train
      columns: [input, output]
      formatter: prompt_answer
      num_samples: 8
      streaming: true
    - dataset: nvidia/Llama-Nemotron-Post-Training-Dataset
      split: code
      columns: [input]
      formatter: chat_completion
      num_samples: 8
      streaming: true
    - dataset: nvidia/Nemotron-Competitive-Programming-v1
      split: competitive_coding_cpp_part_00
      columns: [messages]
      formatter: chat_completion
      num_samples: 8
    - dataset: sr5434/CodegebraGPT_data
      subset: 100k-text
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 8
    - dataset: rombodawg/code_bagel_hermes-2.5
      split: train
      columns: [input, output]
      formatter: prompt_answer
      num_samples: 100
    - dataset: MathArena/project_euler
      split: train
      columns: [problem]
      formatter: raw_text
      num_samples: 30
      formatter_params:
        prefix: *programming_languages
    # Math (12 samples)
    # ---------------------------------------------------------------------------
    - dataset: nvidia/Llama-Nemotron-Post-Training-Dataset
      split: math
      columns: [input]
      formatter: chat_completion
      num_samples: 4
      streaming: true
    - dataset: nvidia/Nemotron-Math-Proofs-v1
      split: lean
      columns: [formal_statement]
      formatter: raw_text
      num_samples: 4
      formatter_params:
        prefix: "Can you improve, document and add comments to this Lean proof for a non-mathematician?\n***\n"
    - dataset: nvidia/OpenMathInstruct-2
      split: train
      columns: [problem, generated_solution]
      formatter: prompt_answer
      num_samples: 4
      streaming: true
    # Sciences (16 samples)
    # ---------------------------------------------------------------------------
    - dataset: nvidia/Llama-Nemotron-Post-Training-Dataset
      split: science
      columns: [input]
      formatter: chat_completion
      num_samples: 4
      streaming: true
    - dataset: nvidia/OpenScienceReasoning-2
      split: train
      columns: [input, output]
      formatter: prompt_answer
      num_samples: 8
      streaming: true
    - dataset: MegaScience/MegaScience
      split: train
      columns: [question, answer]
      formatter: prompt_answer
      num_samples: 4
      streaming: true
    # Medical (8 samples)
    # ---------------------------------------------------------------------------
    - dataset: OpenMed/Medical-Reasoning-SFT-GPT-OSS-120B
      split: train
      columns: [messages]
      formatter: chat_completion
      num_samples: 4
      streaming: true
    - dataset: ccdv/pubmed-summarization
      subset: section
      split: train
      columns: [article]
      formatter: raw_text
      num_samples: 4
      streaming: true
      formatter_params:
        prefix: "Summarize this:\n***\n"
    # Finance (8 samples)
    # ---------------------------------------------------------------------------
    - dataset: gbharti/finance-alpaca
      split: train
      columns: [instruction, output]
      formatter: prompt_answer
      num_samples: 4
    - dataset: vladlen32230/summarization-yahoo-stock-finance-article-text
      split: train
      columns: [text]
      formatter: raw_text
      num_samples: 4
      formatter_params:
        prefix: "Summarize this:\n***\n"
    # Business (16 samples)
    # ---------------------------------------------------------------------------
    - dataset: fka/awesome-chatgpt-prompts
      split: train
      columns: [prompt]
      formatter: raw_text
      num_samples: 8
    - dataset: theoldmandthesea/17k_business_book
      split: train
      columns: [question, answer]
      formatter: prompt_answer
      num_samples: 8
    # Humanities and Philosophy (8 samples)
    # ---------------------------------------------------------------------------
    - dataset: ruggsea/stanford-encyclopedia-of-philosophy_instruct
      split: train
      columns: [question, answer]
      formatter: prompt_answer
      num_samples: 2
    - dataset: mlfoundations-dev/stackexchange_philosophy
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 2
    - dataset: FreedomIntelligence/SocraticChat
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 4
    # Creative Writing, Adventure, Roleplay (13 samples)
    # ---------------------------------------------------------------------------
    - dataset: Gryphe/Opus-WritingPrompts
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 2
    - dataset: anthracite-org/nopm_claude_writing_fixed
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 2
    - dataset: zerofata/Roleplay-Anime-Characters
      split: train
      columns: [messages]
      formatter: chat_completion
      num_samples: 1
    - dataset: zerofata/Instruct-Anime
      split: train
      columns: [messages]
      formatter: chat_completion
      num_samples: 1
    - dataset: zerofata/Instruct-Anime-CreativeWriting
      split: train
      columns: [messages]
      formatter: chat_completion
      num_samples: 1
    - dataset: sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo
      split: train
      columns: [chosen]
      formatter: chat_completion
      num_samples: 2
    - dataset: PocketDoc/Dans-Prosemaxx-Adventure
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 2
    - dataset: anthracite-org/stheno-filtered-v1.1
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 2
    # General Knowledge and Pop Culture (2 samples)
    # ---------------------------------------------------------------------------
    - dataset: KaraKaraWitch/TvTroper-2025
      split: train
      columns: [article]
      formatter: raw_text
      num_samples: 2
      formatter_params:
        prefix: "Explain this trope like I'm your grandmother\n***\n"
    # Specialized skills (8 samples)
    # ---------------------------------------------------------------------------
    - dataset: AquaV/US-Army-Survival-Sharegpt
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 2
    - dataset: AquaV/Interrogation-Sharegpt
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 2
    - dataset: AquaV/Multi-Environment-Operations-Sharegpt
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 2
    - dataset: AquaV/Resistance-Sharegpt
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 2
    # Misc (1 sample)
    # ---------------------------------------------------------------------------
    - dataset: PocketDoc/Dans-Kinomaxx-VanillaBackrooms
      split: train
      columns: [conversations]
      formatter: sharegpt
      num_samples: 1
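    # End of calibration set. As a sanity check, any single entry above can be
    # reproduced by hand; a minimal Python sketch using the `datasets` library,
    # with a hand-written `prompt_answer` formatter standing in for the
    # tooling's own (its exact body is an assumption inferred from the column
    # mappings above):
    #
    #   from datasets import load_dataset
    #
    #   ds = load_dataset("nvidia/OpenMathInstruct-2", split="train", streaming=True)
    #   def prompt_answer(row, prompt_col, answer_col):
    #       # Concatenate the prompt and answer columns into one calibration string.
    #       return f"{row[prompt_col]}\n{row[answer_col]}"
    #   sample = next(iter(ds))
    #   text = prompt_answer(sample, "problem", "generated_solution")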