-
-
Save SALMANKHANPM/fe4c7aab4ad03760e0292ab1a268261d to your computer and use it in GitHub Desktop.
NeMo-Framework training code from the paper "Learning to Reason: Training LLMs with GPT-OSS or DeepSeek R1 Reasoning Traces". It fine-tunes either Mistral-Nemo-Base-2407 or nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base on reasoning data generated by gpt-oss-120b or by DeepSeek-R1-0528.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Which base model to fine-tune: "NanoV2" or "MistralNemo".
MODEL_TO_USE = "NanoV2"
# One-time switch: set to True to import the model from the HF hub and stage
# the training data; only needs to be run once.
IMPORT_MODEL_AND_DATA = False
# Which reasoning traces to train on: "gpt_oss_120b" or "DeepSeek_R1_0528".
REASONING_STYLE = "gpt_oss_120b"
# Cluster topology used for both the recipe and the executor.
NUM_NODES = 2
GPUS_PER_NODE = 8
import os

import nemo_run as run
from nemo.collections import llm
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.llm.gpt.data.chat import ChatDataModule
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B
from nemo.collections.llm.gpt.model.ssm import MambaModel, NemotronNano12Bv2
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed
if __name__ == '__main__':
    # --- One-time setup: import the base model into NeMo format and stage the data ---
    # fixed: the original tested `IMPORT_MODEL`, which is never defined; the
    # constant declared at the top of the file is IMPORT_MODEL_AND_DATA.
    if IMPORT_MODEL_AND_DATA:
        if MODEL_TO_USE == "NanoV2":
            llm.import_ckpt(
                model=MambaModel(config=NemotronNano12Bv2()),
                source="hf://nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base",
            )
        elif MODEL_TO_USE == "MistralNemo":
            llm.import_ckpt(
                model=MistralModel(config=MistralNeMoConfig12B()),
                source="hf://mistralai/Mistral-Nemo-Base-2407",
            )
        else:
            raise ValueError(f"Unknown MODEL_TO_USE {MODEL_TO_USE}")
        # Download the reasoning-trace dataset and write one train/validation
        # JSONL pair per split.
        from datasets import load_dataset  # local import: only needed on this one-time path
        dataset = load_dataset("dicta-il/MathCOT-oss-vs-DeepSeek")
        # Splits are presumably named after the reasoning style ("gpt_oss_120b",
        # "DeepSeek_R1_0528") so each lands in its pre-created directory — TODO confirm.
        os.makedirs('data/gpt_oss_120b', exist_ok=True)
        os.makedirs('data/DeepSeek_R1_0528', exist_ok=True)
        for split in dataset.keys():
            dataset[split].to_json(f"data/{split}/training.jsonl", lines=True, orient="records", force_ascii=False)
            # The first 1000 rows double as the validation set.
            dataset[split].select(range(1000)).to_json(f"data/{split}/validation.jsonl", lines=True, orient="records", force_ascii=False)
        exit(0)

    # --- Build the fine-tuning recipe for the selected base model ---
    # NOTE(review): `model` is not defined anywhere in this file; presumably it
    # should be a NeMo recipe module (one of nemo.collections.llm.recipes.*)
    # matching MODEL_TO_USE — confirm and bind it before running.
    if MODEL_TO_USE == "NanoV2":
        recipe = model.finetune_recipe(
            resume_path="nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base",
            name="math_cot_test",
            dir="checkpoints/math_cot_test",
            num_nodes=NUM_NODES,
            num_gpus_per_node=GPUS_PER_NODE,
            peft_scheme=None,  # full fine-tuning, no LoRA/PEFT
        )
        tokenizer_name = "nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base"
        # NanoV2 recipe defaults to custom FP8 mixed precision for SSM, so we don't need to set it here
    elif MODEL_TO_USE == "MistralNemo":
        recipe = model.finetune_recipe(
            name="math_cot_test",
            dir="checkpoints/math_cot_test",
            num_nodes=NUM_NODES,
            num_gpus_per_node=GPUS_PER_NODE,
            peft_scheme=None,  # full fine-tuning, no LoRA/PEFT
        )
        tokenizer_name = "mistralai/Mistral-Nemo-Base-2407"
        recipe.trainer.plugins = bf16_with_fp8_mixed()
    else:
        raise ValueError(f"Unknown MODEL_TO_USE {MODEL_TO_USE}")

    # --- Override configuration in the recipe ---
    # fixed: the original assigned the recipe to `recipe` but then configured
    # and launched an undefined `finetune`; everything below now consistently
    # uses `recipe`.
    recipe.trainer.max_steps = 3_000
    recipe.optim.config.lr = 5e-6
    recipe.optim.lr_scheduler.min_lr = 5e-7
    recipe.optim.lr_scheduler.warmup_steps = int(3_000 * 0.03)  # 3% of max_steps
    # For the long context, we're going to set tensor_parallel_size to 8 and
    # pipeline_parallelism to 2: we are running on 2 nodes of 8 GPUs, so we can
    # shard to 16. Context parallelism doesn't work with packing.
    recipe.trainer.strategy.tensor_model_parallel_size = 8
    recipe.trainer.strategy.pipeline_model_parallel_size = 2
    seq_length = 59_280
    recipe.data = run.Config(
        ChatDataModule,
        dataset_root=f"data/{REASONING_STYLE}",
        seq_length=seq_length,
        tokenizer=run.Config(AutoTokenizer, pretrained_model_name=tokenizer_name),
        micro_batch_size=1,  # has to be 1 when packing
        global_batch_size=64,
        num_workers=0,  # increase based on available memory
        packed_sequence_specs=PackedSequenceSpecs(packed_sequence_size=seq_length),
        use_hf_tokenizer_chat_template=True,
        dataset_kwargs=dict(
            add_eos=False,
            add_bos=False,
            pad_seq_length_to_mult=128,
        ),
    )

    # --- Executor: replace with your information / SlurmExecutor ---
    executor = run.LeptonExecutor(
        resource_shape="gpu.8xh200",
        node_group='nv-nodegroup-h200-01',
        container_image='nvcr.io/nvidia/nemo:25.09',
        nodes=NUM_NODES,
        gpus_per_node=GPUS_PER_NODE,
        nemo_run_dir="/path/to/.nemo_run/",
        mounts=[],  # fill in shared storage here
        env_vars={},  # fill in any env vars here
        packager=run.Packager(),
    )
    # alternatively:
    # executor = run.LocalExecutor(...) # see https://docs.nvidia.com/nemo/run/latest/api/nemo_run/index.html#nemo_run.LocalExecutor
    # executor = run.SlurmExecutor(...) # see https://docs.nvidia.com/nemo/run/latest/api/nemo_run/index.html#nemo_run.SlurmExecutor
    run.run(recipe, executor)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment