Skip to content

Instantly share code, notes, and snippets.

@shaltielshmid
Last active February 6, 2026 04:11
Show Gist options
  • Select an option

  • Save shaltielshmid/af27fc1ac24fcb85592bbf12dadcd10f to your computer and use it in GitHub Desktop.

Select an option

Save shaltielshmid/af27fc1ac24fcb85592bbf12dadcd10f to your computer and use it in GitHub Desktop.
NeMo-Framework training code from the paper "Learning to Reason: Training LLMs with GPT-OSS or DeepSeek R1 Reasoning Traces", for training Mistral-Nemo-Base-2407 or nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base on reasoning data, generated either via gpt-oss-120b or DeepSeek-R1-0528
# --- User configuration --------------------------------------------------
# Which base model to finetune; selects both the checkpoint import path and
# the finetuning recipe below.
MODEL_TO_USE = "NanoV2" # or "MistralNemo"
IMPORT_MODEL_AND_DATA = False # set to True to import the model from HF hub and dump the dataset to disk; only needs to be run once
REASONING_STYLE = "gpt_oss_120b" # or "DeepSeek_R1_0528" — picks which data/ subdirectory to train on
NUM_NODES = 2
GPUS_PER_NODE = 8
import os

import nemo_run as run
from nemo.collections import llm
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.llm.gpt.data.chat import ChatDataModule
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B
from nemo.collections.llm.gpt.model.ssm import MambaModel, NemotronNano12Bv2
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed
if __name__ == '__main__':
    # One-time setup path: convert the HF checkpoint to NeMo format, dump the
    # training data to disk, then exit. (Fixed: the original tested an
    # undefined name `IMPORT_MODEL`; the configured flag is
    # IMPORT_MODEL_AND_DATA.)
    if IMPORT_MODEL_AND_DATA:
        if MODEL_TO_USE == "NanoV2":
            llm.import_ckpt(model=MambaModel(config=NemotronNano12Bv2()), source="hf://nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base")
        elif MODEL_TO_USE == "MistralNemo":
            llm.import_ckpt(model=MistralModel(config=MistralNeMoConfig12B()), source="hf://mistralai/Mistral-Nemo-Base-2407")
        else:
            raise ValueError(f"Unknown MODEL_TO_USE {MODEL_TO_USE}")

        # Download the reasoning-trace dataset and write one directory per
        # reasoning style. The first 1000 rows of each split double as the
        # validation set (a subset of training — deliberate, per the gist).
        from datasets import load_dataset  # local import: only needed on this one-time path
        dataset = load_dataset("dicta-il/MathCOT-oss-vs-DeepSeek")
        os.makedirs('data/gpt_oss_120b', exist_ok=True)
        os.makedirs('data/DeepSeek_R1_0528', exist_ok=True)
        for split in dataset.keys():
            dataset[split].to_json(f"data/{split}/training.jsonl", lines=True, orient="records", force_ascii=False)
            dataset[split].select(range(1000)).to_json(f"data/{split}/validation.jsonl", lines=True, orient="records", force_ascii=False)
        exit(0)

    # Build the finetuning recipe for the selected base model.
    # NOTE(review): the original called `model.finetune_recipe(...)` with
    # `model` undefined. NeMo 2.0 exposes recipe factories as modules under
    # `llm.<recipe_name>` — confirm the exact module names against the
    # installed NeMo version (nemo.collections.llm.recipes).
    if MODEL_TO_USE == "NanoV2":
        recipe = llm.nemotron_nano_12b_v2.finetune_recipe(
            resume_path="nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base",
            name="math_cot_test",
            dir="checkpoints/math_cot_test",
            num_nodes=NUM_NODES,
            num_gpus_per_node=GPUS_PER_NODE,
            peft_scheme=None,  # full finetune, no LoRA/PEFT
        )
        tokenizer_name = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base"
        # NanoV2 recipe defaults to custom FP8 mixed precision for SSM, so we
        # don't need to set it here.
    elif MODEL_TO_USE == "MistralNemo":
        recipe = llm.mistral_nemo_12b.finetune_recipe(
            name="math_cot_test",
            dir="checkpoints/math_cot_test",
            num_nodes=NUM_NODES,
            num_gpus_per_node=GPUS_PER_NODE,
            peft_scheme=None,  # full finetune, no LoRA/PEFT
        )
        tokenizer_name = "mistralai/Mistral-Nemo-Base-2407"
        # Opt in to FP8 mixed precision for the Mistral recipe.
        # (Fixed: the original configured an undefined `finetune` object here
        # and below; the recipe was assigned to `recipe`.)
        recipe.trainer.plugins = bf16_with_fp8_mixed()
    else:
        raise ValueError(f"Unknown MODEL_TO_USE {MODEL_TO_USE}")

    # Override training hyper-parameters in the recipe.
    total_steps = 3_000
    recipe.trainer.max_steps = total_steps
    recipe.optim.config.lr = 5e-6
    recipe.optim.lr_scheduler.min_lr = 5e-7
    recipe.optim.lr_scheduler.warmup_steps = int(total_steps * 0.03)  # 3% warmup

    # For the long context, we set tensor_parallel_size to 8 and
    # pipeline_parallelism to 2: we run on 2 nodes of 8 GPUs, so we can shard
    # to 16. Context parallelism doesn't work with packing.
    recipe.trainer.strategy.tensor_model_parallel_size = 8
    recipe.trainer.strategy.pipeline_model_parallel_size = 2

    seq_length = 59_280
    recipe.data = run.Config(
        ChatDataModule,
        dataset_root=f"data/{REASONING_STYLE}",
        seq_length=seq_length,
        tokenizer=run.Config(AutoTokenizer, pretrained_model_name=tokenizer_name),
        micro_batch_size=1,  # has to be 1 when packing
        global_batch_size=64,
        num_workers=0,  # increase based on available memory
        packed_sequence_specs=PackedSequenceSpecs(packed_sequence_size=seq_length),
        use_hf_tokenizer_chat_template=True,
        dataset_kwargs=dict(
            add_eos=False,
            add_bos=False,
            pad_seq_length_to_mult=128,
        ),
    )

    # Replace with your information / SlurmExecutor.
    executor = run.LeptonExecutor(
        resource_shape="gpu.8xh200",
        node_group='nv-nodegroup-h200-01',
        container_image='nvcr.io/nvidia/nemo:25.09',
        nodes=NUM_NODES,
        gpus_per_node=GPUS_PER_NODE,
        nemo_run_dir="/path/to/.nemo_run/",
        mounts=[],  # fill in shared storage here
        env_vars={},  # fill in any env vars here
        packager=run.Packager(),
    )
    # alternatively:
    # executor = run.LocalExecutor(...) # see https://docs.nvidia.com/nemo/run/latest/api/nemo_run/index.html#nemo_run.LocalExecutor
    # executor = run.SlurmExecutor(...) # see https://docs.nvidia.com/nemo/run/latest/api/nemo_run/index.html#nemo_run.SlurmExecutor
    run.run(recipe, executor)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment