Fine-tuning an LLM with the SFT approach using Hugging Face infrastructure.
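The file below is a self-contained UV script: the inline dependency block at the top (the "# /// script" section) lets uv resolve trl, peft, transformers, accelerate, bitsandbytes, and trackio on the fly. A minimal local launch sketch, assuming the file is saved as train_sft.py (the filename is an assumption, not part of the gist) and a write-scoped Hugging Face token is available:

export HF_TOKEN=<your-write-token>   # needed for push_to_hub and the Trackio Space
uv run train_sft.py

The docstring also notes the script is written for HF Jobs (GPU flavor, timeout, HF_TOKEN passed as a secret); the exact job-submission command depends on the huggingface_hub CLI version, so only the local invocation is sketched here.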
#!/usr/bin/env python3
# /// script
# dependencies = [
#     "trl>=0.12.0",
#     "peft>=0.7.0",
#     "transformers>=4.36.0",
#     "accelerate>=0.24.0",
#     "bitsandbytes>=0.41.0",
#     "trackio",
# ]
# ///
| """ | |
| Fine-tune Qwen3-0.6B on CodeForces-CoTs dataset for instruction following. | |
| ENHANCED RUN: 1000 examples with train/eval metrics tracking. | |
| Model: Qwen/Qwen3-0.6B | |
| Dataset: open-r1/codeforces-cots (solutions_w_editorials_decontaminated) | |
| Sample Size: 1000 examples | |
| Epochs: 3 | |
| Evaluation: Enabled (every 50 steps) | |
| This script follows HF Jobs best practices: | |
| - UV script format with inline dependencies | |
| - Self-contained (no local file access needed) | |
| - Designed to run with appropriate timeout and GPU flavor | |
| - Uses secrets for HF_TOKEN to push to hub | |
| - Tracks both training and validation metrics | |
| """ | |

import os
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
import trackio

print("=" * 70)
print("Qwen3-0.6B Fine-tuning on CodeForces-CoTs")
print(" ENHANCED RUN: 1000 examples with evaluation metrics")
print("=" * 70)

# Configuration
MODEL_NAME = "Qwen/Qwen3-0.6B"
DATASET_NAME = "open-r1/codeforces-cots"
DATASET_CONFIG = "solutions_w_editorials_decontaminated"
OUTPUT_DIR = "qwen3-codeforces-with-eval"
HUB_MODEL_ID = "kneeraj/qwen3-codeforces-with-eval-1000"

# Sample run settings
SAMPLE_SIZE = 1000
NUM_EPOCHS = 3

# Initialize Trackio for training metrics visualization.
# This will sync training metrics to HF Spaces for online viewing.
TRACKIO_PROJECT = "qwen3-codeforces-training"
TRACKIO_SPACE = "kneeraj/qwen3-training-metrics"
| print(f"\nπ Initializing Trackio metrics tracking...") | |
| trackio.init( | |
| project=TRACKIO_PROJECT, | |
| space_id=TRACKIO_SPACE, | |
| config={ | |
| "model": MODEL_NAME, | |
| "dataset": DATASET_NAME, | |
| "sample_size": SAMPLE_SIZE, | |
| "epochs": NUM_EPOCHS, | |
| "learning_rate": 2e-4, | |
| "lora_r": 16, | |
| "lora_alpha": 32, | |
| } | |
| ) | |
| print(f" β Metrics will be synced to: https://huggingface.co/spaces/{TRACKIO_SPACE}") | |
| print(f"\nπ Configuration:") | |
| print(f" Model: {MODEL_NAME}") | |
| print(f" Dataset: {DATASET_NAME}") | |
| print(f" Config: {DATASET_CONFIG}") | |
| print(f" Sample Size: {SAMPLE_SIZE} examples") | |
| print(f" Epochs: {NUM_EPOCHS}") | |
| # Load dataset subset | |
| print(f"\nπ₯ Loading {SAMPLE_SIZE} examples...") | |
| dataset = load_dataset( | |
| DATASET_NAME, | |
| name=DATASET_CONFIG, | |
| split=f"train[:{SAMPLE_SIZE}]" | |
| ) | |
| print(f"β Loaded {len(dataset)} examples") | |
| # Remove 'prompt' field to force SFTTrainer to use 'messages' format | |
| # The dataset has BOTH 'prompt' and 'messages', causing SFTTrainer to | |
| # incorrectly detect it as prompt-completion format | |
| print("\nπ§Ή Removing conflicting fields to ensure 'messages' format detection...") | |
| dataset = dataset.remove_columns(['prompt']) | |
| print(" β Removed 'prompt' field - SFTTrainer will now use 'messages' format") | |

# Train/eval split (90/10)
print("\nCreating train/eval split...")
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]
print(f" Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# Verify message format
print(f"\nMessages per example: {len(train_dataset[0]['messages'])}")

# LoRA configuration for efficient fine-tuning
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
    bias="none",
)
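
# Note: with this config only the rank-16 adapter matrices injected into the
# attention and MLP projections listed above are trained; the Qwen3-0.6B base
# weights stay frozen, which keeps the trainable parameter count and memory
# footprint small.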

# Training configuration
training_args = SFTConfig(
    # Hub settings - will use HF_TOKEN from secrets
    output_dir=OUTPUT_DIR,
    push_to_hub=True,
    hub_model_id=HUB_MODEL_ID,
    hub_strategy="end",  # Push only at end for sample run
    # Training parameters
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Learning rate
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # Logging & checkpointing
    logging_steps=5,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    # Evaluation - ENABLED for train/eval metrics tracking
    eval_strategy="steps",
    eval_steps=50,  # Evaluate every 50 steps
    # Optimization
    bf16=True,
    max_length=1024,
    # dataset_text_field removed - auto-detect messages format
    packing=False,
    # Reporting with Trackio for training loss visualization
    report_to=["trackio"],
    run_name="qwen3-codeforces-with-eval-1000",
)

# Initialize trainer - SFTTrainer auto-detects the messages format
print("\nInitializing SFT trainer...")
print(" SFTTrainer auto-detects messages format")
trainer = SFTTrainer(
    model=MODEL_NAME,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # enables validation metrics
    peft_config=peft_config,
    args=training_args,
    # No formatting_func or dataset_text_field needed:
    # SFTTrainer automatically handles the messages format
)
print(" Trainer initialized successfully!")

# Train
print("\nStarting training...")
print(f" Sample run with {SAMPLE_SIZE} examples")
print(f" Estimated total steps: ~{len(train_dataset) // (2 * 4) * NUM_EPOCHS}")
| print(f" Batch size: {training_args.per_device_train_batch_size}") | |
| print(f" Gradient accumulation: {training_args.gradient_accumulation_steps}") | |
| print(f" Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}") | |
| print(f" Evaluation checkpoints: steps 50, 100, 150, 200") | |
print()
print("Training in progress...")
result = trainer.train()
print("\nTraining completed!")
print(f" Final loss: {result.training_loss:.4f}")

# Push to Hub
print("\nPushing final model to Hub...")
trainer.push_to_hub()
print(" Model pushed successfully!")

# Finalize Trackio tracking
print("\nFinalizing Trackio metrics...")
trackio.finish()
print(" Training metrics synced successfully!")

print("\n" + "=" * 70)
print("Training Complete with Evaluation Metrics!")
print("=" * 70)
print("\nResults:")
print(f" Model: https://huggingface.co/{HUB_MODEL_ID}")
print(f" Training Metrics: https://huggingface.co/spaces/{TRACKIO_SPACE}")
print(f" Trained on: {SAMPLE_SIZE} examples")
print(f" Train/Eval Split: {len(train_dataset)}/{len(eval_dataset)}")
print(f" Epochs: {NUM_EPOCHS}")
print("\nNext Steps:")
print(" 1. View training & evaluation loss graphs at Trackio Space")
print(" 2. Compare train_loss vs eval_loss (check for overfitting)")
print(" 3. Review model performance on HF Hub")
print(" 4. Test the model with inference")
print(" 5. If satisfied, scale to full dataset and more epochs")
print()
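
Step 4 in the closing next-steps list is to test the model with inference. A minimal sketch of how that could look, loading the pushed LoRA adapter on top of the base model with transformers and peft; the prompt is illustrative and the adapter repo name is simply the HUB_MODEL_ID used in the script:

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_ID = "Qwen/Qwen3-0.6B"
ADAPTER_ID = "kneeraj/qwen3-codeforces-with-eval-1000"  # HUB_MODEL_ID from the script above

tokenizer = AutoTokenizer.from_pretrained(BASE_ID)
model = AutoModelForCausalLM.from_pretrained(BASE_ID, torch_dtype="auto", device_map="auto")
model = PeftModel.from_pretrained(model, ADAPTER_ID)  # attach the trained LoRA adapter
model.eval()

# Illustrative prompt (not from the dataset); real CodeForces problems are much longer.
messages = [{"role": "user", "content": "Write a C++ program that reads n integers and prints their sum."}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=512)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))

If the adapter quality looks good, it could also be merged into the base weights with PeftModel's merge_and_unload() before deployment, so inference no longer needs peft at all.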