Skip to content

Instantly share code, notes, and snippets.

@kn-neeraj
Created December 31, 2025 07:56
Show Gist options
  • Select an option

  • Save kn-neeraj/140c2184233d922be0c2cb92d21dfa00 to your computer and use it in GitHub Desktop.

Select an option

Save kn-neeraj/140c2184233d922be0c2cb92d21dfa00 to your computer and use it in GitHub Desktop.
Fine-tuning an LLM with the SFT approach using Hugging Face infrastructure.
#!/usr/bin/env python3
# /// script
# dependencies = [
# "trl>=0.12.0",
# "peft>=0.7.0",
# "transformers>=4.36.0",
# "accelerate>=0.24.0",
# "bitsandbytes>=0.41.0",
# "trackio",
# ]
# ///
"""
Fine-tune Qwen3-0.6B on CodeForces-CoTs dataset for instruction following.
ENHANCED RUN: 1000 examples with train/eval metrics tracking.
Model: Qwen/Qwen3-0.6B
Dataset: open-r1/codeforces-cots (solutions_w_editorials_decontaminated)
Sample Size: 1000 examples
Epochs: 3
Evaluation: Enabled (every 50 steps)
This script follows HF Jobs best practices:
- UV script format with inline dependencies
- Self-contained (no local file access needed)
- Designed to run with appropriate timeout and GPU flavor
- Uses secrets for HF_TOKEN to push to hub
- Tracks both training and validation metrics
"""
import os
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
import trackio
# Startup banner: two title lines framed by 70-character rules.
for banner_line in (
    "=" * 70,
    "πŸš€ Qwen3-0.6B Fine-tuning on CodeForces-CoTs",
    " ENHANCED RUN: 1000 examples with evaluation metrics",
    "=" * 70,
):
    print(banner_line)

# --- Run configuration ---------------------------------------------------
# Size of this sample run: number of dataset rows and training epochs.
SAMPLE_SIZE = 1000
NUM_EPOCHS = 3

# Base checkpoint and the dataset (plus named config) used for fine-tuning.
MODEL_NAME = "Qwen/Qwen3-0.6B"
DATASET_NAME, DATASET_CONFIG = (
    "open-r1/codeforces-cots",
    "solutions_w_editorials_decontaminated",
)

# Local output directory and the Hub repository that receives the result.
OUTPUT_DIR = "qwen3-codeforces-with-eval"
HUB_MODEL_ID = "kneeraj/qwen3-codeforces-with-eval-1000"

# Trackio project name and the HF Space the metrics dashboard is synced to.
TRACKIO_PROJECT, TRACKIO_SPACE = (
    "qwen3-codeforces-training",
    "kneeraj/qwen3-training-metrics",
)
# Start the Trackio run that backs the online metrics dashboard.
print(f"\nπŸ“Š Initializing Trackio metrics tracking...")
# NOTE(review): the Trainer's Trackio callback (enabled through report_to
# further down) is understood to start its own run from the TRACKIO_PROJECT
# and TRACKIO_SPACE_ID environment variables rather than from the
# trackio.init() call below - confirm against the installed transformers
# version. Exporting the same values keeps both runs in one project/Space;
# setdefault leaves any value already provided by the job untouched.
os.environ.setdefault("TRACKIO_PROJECT", TRACKIO_PROJECT)
os.environ.setdefault("TRACKIO_SPACE_ID", TRACKIO_SPACE)
trackio.init(
    project=TRACKIO_PROJECT,
    space_id=TRACKIO_SPACE,
    # Hyperparameters recorded alongside the run; these literals mirror the
    # values passed to LoraConfig / SFTConfig later in the script.
    config={
        "model": MODEL_NAME,
        "dataset": DATASET_NAME,
        "sample_size": SAMPLE_SIZE,
        "epochs": NUM_EPOCHS,
        "learning_rate": 2e-4,
        "lora_r": 16,
        "lora_alpha": 32,
    },
)
print(f" βœ… Metrics will be synced to: https://huggingface.co/spaces/{TRACKIO_SPACE}")
# Echo the effective run settings so they show up in the job log.
print("\nπŸ“‹ Configuration:")
config_summary = (
    ("Model", MODEL_NAME),
    ("Dataset", DATASET_NAME),
    ("Config", DATASET_CONFIG),
    ("Sample Size", f"{SAMPLE_SIZE} examples"),
    ("Epochs", NUM_EPOCHS),
)
for label, value in config_summary:
    print(f" {label}: {value}")
# Fetch only a slice of the train split: the split string limits the load
# to the first SAMPLE_SIZE rows.
print(f"\nπŸ“₯ Loading {SAMPLE_SIZE} examples...")
train_slice = f"train[:{SAMPLE_SIZE}]"
dataset = load_dataset(DATASET_NAME, name=DATASET_CONFIG, split=train_slice)
print(f"βœ… Loaded {len(dataset)} examples")
# Drop the 'prompt' column before handing the data to SFTTrainer.
# Per the original author's note, this dataset carries both 'prompt' and
# 'messages', and leaving both in place makes SFTTrainer treat the rows as
# prompt-completion data instead of chat 'messages'.
print("\n🧹 Removing conflicting fields to ensure 'messages' format detection...")
conflicting_columns = ["prompt"]
dataset = dataset.remove_columns(conflicting_columns)
print(" βœ… Removed 'prompt' field - SFTTrainer will now use 'messages' format")
# Hold out 10% of the sampled rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
print("\nπŸ”€ Creating train/eval split...")
dataset_split = dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]
n_train, n_eval = len(train_dataset), len(eval_dataset)
print(f" Train: {n_train}, Eval: {n_eval}")

# Sanity check: report how many chat messages the first training row holds.
first_example = train_dataset[0]
print(f"\nπŸ” Messages per example: {len(first_example['messages'])}")
# LoRA adapter settings (rank 16, alpha 32, dropout 0.05, no bias terms).
# Every targeted module name follows the "<prefix>_proj" pattern, so the list
# is generated from the prefixes.
lora_target_modules = [
    f"{prefix}_proj" for prefix in ("q", "k", "v", "o", "gate", "up", "down")
]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)
# Training configuration, assembled from themed keyword groups.

# Hub upload: a single push at the end of the run (the original author notes
# that HF_TOKEN is expected to come from job secrets).
hub_settings = dict(
    output_dir=OUTPUT_DIR,
    push_to_hub=True,
    hub_model_id=HUB_MODEL_ID,
    hub_strategy="end",
)

# Batch of 2 per device with 4 accumulation steps (2 * 4 = 8 per optimizer
# step), cosine learning-rate schedule with 10% warmup.
optimisation_settings = dict(
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
)

# Log every 5 steps; save and evaluate every 50 steps, keeping at most two
# checkpoints. With the ~900-row train split (1000 samples, 10% held out) and
# 3 epochs this is roughly 340 optimizer steps on one device, so evaluation
# runs at steps 50, 100, ..., 300.
checkpoint_and_eval_settings = dict(
    logging_steps=5,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=50,
)

# Sequence handling: cap length at 1024 and do not pack examples. No
# dataset_text_field is given; the original author relies on SFTTrainer
# detecting the 'messages' column.
sequence_settings = dict(
    max_length=1024,
    packing=False,
)

# Metrics go to Trackio under a fixed run name.
reporting_settings = dict(
    report_to=["trackio"],
    run_name="qwen3-codeforces-with-eval-1000",
)

training_args = SFTConfig(
    **hub_settings,
    **optimisation_settings,
    **checkpoint_and_eval_settings,
    **sequence_settings,
    **reporting_settings,
)
# Build the SFT trainer. The model is given by name (a string), and neither a
# formatting_func nor a dataset_text_field is supplied: the original author
# relies on SFTTrainer handling the 'messages' column on its own.
print("\nπŸ‹οΈ Initializing SFT trainer...")
print(" SFTTrainer auto-detects messages format")
trainer_kwargs = {
    "model": MODEL_NAME,
    "args": training_args,
    "peft_config": peft_config,
    "train_dataset": train_dataset,
    # Passing an eval set is what enables validation metrics during training.
    "eval_dataset": eval_dataset,
}
trainer = SFTTrainer(**trainer_kwargs)
print(" βœ… Trainer initialized successfully!")
# Train, publish the result, and always close the Trackio run.
print("\n🎯 Starting training...")
print(f" Sample run with {SAMPLE_SIZE} examples")

# Derive the step estimate from the actual training arguments rather than
# repeating the literals. Ceil-division counts a trailing partial batch as a
# step, and the per-epoch figure is scaled by the number of epochs.
# NOTE(review): this is a single-device estimate; the Trainer's exact count
# can differ slightly (multi-GPU, rounding of the last accumulation group).
batch_size = training_args.per_device_train_batch_size
grad_accum = training_args.gradient_accumulation_steps
effective_batch_size = batch_size * grad_accum
steps_per_epoch = max(1, -(-len(train_dataset) // effective_batch_size))
estimated_steps = steps_per_epoch * NUM_EPOCHS

# List the evaluation steps implied by eval_steps instead of hard-coding them.
eval_every = training_args.eval_steps
if eval_every and eval_every >= 1:
    eval_marks = ", ".join(
        str(step)
        for step in range(int(eval_every), estimated_steps + 1, int(eval_every))
    ) or "none"
else:
    # eval_steps unset or given as a ratio: no fixed step list to show.
    eval_marks = "n/a"

print(f" Estimated steps: ~{estimated_steps}")
print(f" Batch size: {batch_size}")
print(f" Gradient accumulation: {grad_accum}")
print(f" Effective batch size: {effective_batch_size}")
print(f" Evaluation checkpoints: steps {eval_marks}")
print()
print("⏳ Training in progress...")
try:
    result = trainer.train()
    print(f"\nβœ… Training completed!")
    # Per the Transformers Trainer implementation, training_loss on the
    # returned TrainOutput is averaged over the whole run, so it is labelled
    # as an average rather than as the last-step loss.
    print(f" Average training loss: {result.training_loss:.4f}")

    # Push to Hub
    print("\nπŸ’Ύ Pushing final model to Hub...")
    trainer.push_to_hub()
    print(f" βœ… Model pushed successfully!")
finally:
    # Close the Trackio run even when training or the push raised, so the
    # metrics logged up to that point are not left in an unfinished run.
    print("\nπŸ“Š Finalizing Trackio metrics...")
    trackio.finish()
    print(f" βœ… Training metrics synced successfully!")
# Closing report: framed headline, result links and run facts, then the
# suggested follow-up steps. The trailing empty entry prints the final blank
# line.
rule = "=" * 70
closing_report = [
    "\n" + rule,
    "βœ… Training Complete with Evaluation Metrics!",
    rule,
    "\nπŸ“Š Results:",
    f" Model: https://huggingface.co/{HUB_MODEL_ID}",
    f" Training Metrics: https://huggingface.co/spaces/{TRACKIO_SPACE}",
    f" Trained on: {SAMPLE_SIZE} examples",
    f" Train/Eval Split: {len(train_dataset)}/{len(eval_dataset)}",
    f" Epochs: {NUM_EPOCHS}",
    "\nπŸ’‘ Next Steps:",
    " 1. View training & evaluation loss graphs at Trackio Space",
    " 2. Compare train_loss vs eval_loss (check for overfitting)",
    " 3. Review model performance on HF Hub",
    " 4. Test the model with inference",
    " 5. If satisfied, scale to full dataset and more epochs",
    "",
]
for report_line in closing_report:
    print(report_line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment