@KillTheAI
Created March 7, 2025 13:02
FP16 (half-precision) inference on the NVIDIA P100
import torch

def optimize_model_for_p100(model, use_amp=True):
    # Move model to GPU
    model = model.cuda()

    # Use FP16 precision (the P100 supports fast FP16, but has no Tensor Cores)
    if use_amp:
        model = model.half()

    # Set model to evaluation mode
    model = model.eval()

    # Disable gradient computation for inference
    torch.set_grad_enabled(False)

    # Optional: script the model with TorchScript for better performance
    try:
        scripted_model = torch.jit.script(model)
        return scripted_model
    except Exception as e:
        print(f"Model scripting failed: {e}")
        return model
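# Optional helper (not in the original gist): times a forward pass with CUDA
# events so the FP16 speedup on a given P100 can be measured directly against
# an FP32 baseline. Assumes `model` is already on the GPU and `batch` matches
# its precision.
def time_forward(model, batch, iters=10):
    # Warm-up pass so one-time kernel selection/caching doesn't skew the timing
    with torch.no_grad():
        model(batch)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    with torch.no_grad():
        for _ in range(iters):
            model(batch)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # mean milliseconds per forward pass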
# Example pipeline: batched inference with per-batch timing
def inference_pipeline(model, input_data, batch_size=32):
    # Optimize model for P100
    optimized_model = optimize_model_for_p100(model)

    # Process data in batches to manage memory
    results = []
    for i in range(0, len(input_data), batch_size):
        batch = input_data[i:i + batch_size].cuda()

        # Convert inputs to match the model's precision
        if next(optimized_model.parameters()).dtype == torch.float16:
            batch = batch.half()

        # Use CUDA events for accurate GPU timing
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        with torch.cuda.amp.autocast(enabled=True):  # Use AMP if available
            output = optimized_model(batch)

        end.record()
        torch.cuda.synchronize()
        print(f"Batch inference time: {start.elapsed_time(end):.2f} ms")

        results.append(output.cpu())  # Move results back to CPU

    return torch.cat(results, dim=0)
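A minimal usage sketch, assuming torchvision is available; the ResNet-18 model and the random 224x224 inputs are illustrative stand-ins, not part of the original gist:

from torchvision.models import resnet18

model = resnet18(weights=None)           # any nn.Module works here (hypothetical choice)
inputs = torch.randn(128, 3, 224, 224)   # 128 random RGB images (hypothetical shape)
outputs = inference_pipeline(model, inputs, batch_size=32)
print(outputs.shape)                     # torch.Size([128, 1000]) for ResNet-18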