This Mermaid diagram represents my own, personal understanding of the TRT debugging workflow.
Commit: 40e8b7d.
flowchart TD
is_real_input[Real input fixes accuracy issue?]
| pipx install uv | |
| # --seed to add pip to venv, to match the behavior of the real venv | |
| uv venv /tmp/venvs/haha --seed --python 3.13 | |
| source /tmp/venvs/haha/bin/activate | |
| # ... | |
| pip list | |
| deactivate |
This Mermaid diagram represents my own, personal understanding of the TRT debugging workflow.
Commit: 40e8b7d.
flowchart TD
is_real_input[Real input fixes accuracy issue?]
| # https://github.com/verdimrc/linuxcfg/blob/main/bin/ngc-apikey.py | |
| ngc-apikey.py <NGC_PROFILE> | docker login --username '$oauthtoken' --password-stdin nvcr.io |
nvidia-smi -q
nvidia-smi -q -d POWER
nvidia-smi -q -d CLOCK
sudo nvidia-smi -pm 1 # Set persistence mode on
sudo nvidia-smi -pm 1 -i 0 # Set persistence mode on for GPU index 0
nvidia-smi -q -d SUPPORTED_CLOCKS
nvidia-smi -i 0 --query-supported-clocks=mem,gr --format=csv
sudo nvidia-smi -lgc 2520
sudo nvidia-smi -rgc # Reset GPU clocks
$ cat /sys/devices/virtual/dmi/id/product_name
trn1.32xlarge
$ cat /sys/devices/virtual/dmi/id/board_asset_tag
i-0000000000example
| #!/bin/bash | |
| ################################################################################ | |
| # NOTE for Slurm users: when Slurm is configured to enable cgroup, upon job | |
| # completion Slurm will kill the mount-s3 process. Subsequent access then | |
| # fails with the error "transport not connected". | |
| # | |
| # [20240404] In practice, running this script under srun will: | |
| # - not work on pcluster-3.9.0 (ProctrackType=proctrack/cgroup) | |
| # - probably work on SageMaker HyperPod (ProctrackType=proctrack/linuxproc) |
# Install awsume https://awsu.me/
$ brew install awsume
# Edit ~/.aws/config, and create a profile:
$ vi ~/.aws/config
[hyperpod]
output = json
region = us-west-2
mfa_serial = arn:aws:iam:::mfa/
| # https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables | |
| export HF_HOME=/fsx/marcverd/hf_home | |
| export HF_HUB_DISABLE_TELEMETRY=1 |