Skip to content

Instantly share code, notes, and snippets.

@shaltielshmid
Last active February 6, 2026 04:11
Show Gist options
  • Select an option

  • Save shaltielshmid/af27fc1ac24fcb85592bbf12dadcd10f to your computer and use it in GitHub Desktop.

Select an option

Save shaltielshmid/af27fc1ac24fcb85592bbf12dadcd10f to your computer and use it in GitHub Desktop.
NeMo-Framework training code from the paper "Learning to Reason: Training LLMs with GPT-OSS or DeepSeek R1 Reasoning Traces", for training Mistral-Nemo-Base-2407 or nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base on reasoning data, generated either via gpt-oss-120b or DeepSeek-R1-0528
# --- User configuration --------------------------------------------------
# Which base model to finetune; selects both the checkpoint import path and
# the finetuning recipe below.
MODEL_TO_USE = "NanoV2" # or "MistralNemo"
IMPORT_MODEL_AND_DATA = False # set to True to import the model from HF hub and dump the dataset to disk; only needs to be run once
REASONING_STYLE = "gpt_oss_120b" # or "DeepSeek_R1_0528" — picks which data/ subdirectory to train on
NUM_NODES = 2
GPUS_PER_NODE = 8
import os

import nemo_run as run
from nemo.collections import llm
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.llm.gpt.data.chat import ChatDataModule
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B
from nemo.collections.llm.gpt.model.ssm import MambaModel, NemotronNano12Bv2
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed
if __name__ == '__main__':
    # One-time setup path: convert the HF checkpoint to NeMo format, dump the
    # training data to disk, then exit. (Fixed: the original tested an
    # undefined name `IMPORT_MODEL`; the configured flag is
    # IMPORT_MODEL_AND_DATA.)
    if IMPORT_MODEL_AND_DATA:
        if MODEL_TO_USE == "NanoV2":
            llm.import_ckpt(model=MambaModel(config=NemotronNano12Bv2()), source="hf://nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base")
        elif MODEL_TO_USE == "MistralNemo":
            llm.import_ckpt(model=MistralModel(config=MistralNeMoConfig12B()), source="hf://mistralai/Mistral-Nemo-Base-2407")
        else:
            raise ValueError(f"Unknown MODEL_TO_USE {MODEL_TO_USE}")

        # Download the reasoning-trace dataset and write one directory per
        # reasoning style. The first 1000 rows of each split double as the
        # validation set (a subset of training — deliberate, per the gist).
        from datasets import load_dataset  # local import: only needed on this one-time path
        dataset = load_dataset("dicta-il/MathCOT-oss-vs-DeepSeek")
        os.makedirs('data/gpt_oss_120b', exist_ok=True)
        os.makedirs('data/DeepSeek_R1_0528', exist_ok=True)
        for split in dataset.keys():
            dataset[split].to_json(f"data/{split}/training.jsonl", lines=True, orient="records", force_ascii=False)
            dataset[split].select(range(1000)).to_json(f"data/{split}/validation.jsonl", lines=True, orient="records", force_ascii=False)
        exit(0)

    # Build the finetuning recipe for the selected base model.
    # NOTE(review): the original called `model.finetune_recipe(...)` with
    # `model` undefined. NeMo 2.0 exposes recipe factories as modules under
    # `llm.<recipe_name>` — confirm the exact module names against the
    # installed NeMo version (nemo.collections.llm.recipes).
    if MODEL_TO_USE == "NanoV2":
        recipe = llm.nemotron_nano_12b_v2.finetune_recipe(
            resume_path="nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base",
            name="math_cot_test",
            dir="checkpoints/math_cot_test",
            num_nodes=NUM_NODES,
            num_gpus_per_node=GPUS_PER_NODE,
            peft_scheme=None,  # full finetune, no LoRA/PEFT
        )
        tokenizer_name = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base"
        # NanoV2 recipe defaults to custom FP8 mixed precision for SSM, so we
        # don't need to set it here.
    elif MODEL_TO_USE == "MistralNemo":
        recipe = llm.mistral_nemo_12b.finetune_recipe(
            name="math_cot_test",
            dir="checkpoints/math_cot_test",
            num_nodes=NUM_NODES,
            num_gpus_per_node=GPUS_PER_NODE,
            peft_scheme=None,  # full finetune, no LoRA/PEFT
        )
        tokenizer_name = "mistralai/Mistral-Nemo-Base-2407"
        # Opt in to FP8 mixed precision for the Mistral recipe.
        # (Fixed: the original configured an undefined `finetune` object here
        # and below; the recipe was assigned to `recipe`.)
        recipe.trainer.plugins = bf16_with_fp8_mixed()
    else:
        raise ValueError(f"Unknown MODEL_TO_USE {MODEL_TO_USE}")

    # Override training hyper-parameters in the recipe.
    total_steps = 3_000
    recipe.trainer.max_steps = total_steps
    recipe.optim.config.lr = 5e-6
    recipe.optim.lr_scheduler.min_lr = 5e-7
    recipe.optim.lr_scheduler.warmup_steps = int(total_steps * 0.03)  # 3% warmup

    # For the long context, we set tensor_parallel_size to 8 and
    # pipeline_parallelism to 2: we run on 2 nodes of 8 GPUs, so we can shard
    # to 16. Context parallelism doesn't work with packing.
    recipe.trainer.strategy.tensor_model_parallel_size = 8
    recipe.trainer.strategy.pipeline_model_parallel_size = 2

    seq_length = 59_280
    recipe.data = run.Config(
        ChatDataModule,
        dataset_root=f"data/{REASONING_STYLE}",
        seq_length=seq_length,
        tokenizer=run.Config(AutoTokenizer, pretrained_model_name=tokenizer_name),
        micro_batch_size=1,  # has to be 1 when packing
        global_batch_size=64,
        num_workers=0,  # increase based on available memory
        packed_sequence_specs=PackedSequenceSpecs(packed_sequence_size=seq_length),
        use_hf_tokenizer_chat_template=True,
        dataset_kwargs=dict(
            add_eos=False,
            add_bos=False,
            pad_seq_length_to_mult=128,
        ),
    )

    # Replace with your information / SlurmExecutor.
    executor = run.LeptonExecutor(
        resource_shape="gpu.8xh200",
        node_group='nv-nodegroup-h200-01',
        container_image='nvcr.io/nvidia/nemo:25.09',
        nodes=NUM_NODES,
        gpus_per_node=GPUS_PER_NODE,
        nemo_run_dir="/path/to/.nemo_run/",
        mounts=[],  # fill in shared storage here
        env_vars={},  # fill in any env vars here
        packager=run.Packager(),
    )
    # alternatively:
    # executor = run.LocalExecutor(...) # see https://docs.nvidia.com/nemo/run/latest/api/nemo_run/index.html#nemo_run.LocalExecutor
    # executor = run.SlurmExecutor(...) # see https://docs.nvidia.com/nemo/run/latest/api/nemo_run/index.html#nemo_run.SlurmExecutor
    run.run(recipe, executor)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment