This example demonstrates the following with Apptainer containers:
- Pulling a Docker container from an external registry (NGC) and letting Apptainer automatically convert it to an Apptainer (.sif) image
- Compiling software inside a container, using the toolchain shipped with that container
- Launching multi-node container workloads
- Configuring the network interface so a multi-node container workload reaches the expected network performance
# Allocate two H100 GPU nodes
salloc -p gpu_h100 -N 2 -t 02:00:00 --gpus-per-node=4
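# Optional sanity check (a quick sketch, not strictly required): confirm both
# allocated nodes respond before pulling or building anything
srun -N $SLURM_NNODES --ntasks-per-node=1 hostname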
# Edit this tag (format YY.MM) to pull a different container version
CONTAINER_VERSION=25.10
# Unset LD_PRELOAD so the host's XALT hooks are not carried into the container, where they would only produce warning spam
unset LD_PRELOAD
# Pull the container from the NGC container registry; apptainer automatically converts it to a .sif image
# Use RAM-backed /dev/shm for the cache so the multi-gigabyte conversion stays fast and off disk
export APPTAINER_CACHEDIR=/dev/shm
apptainer pull docker://nvcr.io/nvidia/pytorch:${CONTAINER_VERSION}-py3
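# Optional sanity check (a sketch): confirm the pull produced
# pytorch_${CONTAINER_VERSION}-py3.sif and that the GPUs are visible from
# inside the image before building anything against it
srun -N 1 --ntasks=1 --gpus-per-node=4 apptainer exec --nv \
pytorch_${CONTAINER_VERSION}-py3.sif nvidia-smi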
# Let's get our tests
git clone https://github.com/NVIDIA/nccl-tests
cd nccl-tests
# Compile NCCL tests inside the container
# apptainer exec - Execute command in container
# --nv - Bind GPU resources to container
# --env MPI=1 - Enable MPI for multi-node nccl-tests
# --env MPI_HOME=/usr/local/mpi - Set MPI_HOME to OpenMPI path inside container
# pytorch_${CONTAINER_VERSION}-py3.sif - Container image path (as produced by apptainer pull)
# make -j - Compile the tests with parallel jobs (we already cd'd into nccl-tests/)
apptainer exec \
--nv \
--env MPI=1 \
--env MPI_HOME=/usr/local/mpi \
pytorch_${CONTAINER_VERSION}-py3.sif \
make -j
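# The NCCL_SOCKET_IFNAME value used below is site-specific: on this system the
# relevant interfaces are named eno*. On a different cluster, list the
# interfaces on the compute nodes and pick the prefix of the high-speed ones
srun -N $SLURM_NNODES --ntasks-per-node=1 ip -o link show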
# Execute NCCL all-gather performance test
# srun - SLURM's parallel job launcher
# -N $SLURM_NNODES - Use all allocated SLURM nodes
# --ntasks-per-node=1 - Run 1 task (process) per node
# --gpus-per-node=4 - Allocate 4 GPUs per node
# env NCCL_SOCKET_IFNAME=eno - Restrict NCCL's bootstrap traffic to interfaces named eno* (site-specific; see the interface listing above)
# apptainer exec --nv - Launch container with GPU binding
# pytorch_${CONTAINER_VERSION}-py3.sif - Container image path
# nccl-tests/build/all_gather_perf - Executable test binary
# -b 1K - Starting message size: 1 KiB
# -e 8G - Ending message size: 8 GiB
# -f 2 - Increase message size in powers of two
# -g 4 - 4 GPUs per process (task)
srun \
-N $SLURM_NNODES \
--ntasks-per-node=1 \
--gpus-per-node=4 \
env NCCL_SOCKET_IFNAME=eno \
apptainer exec --nv \
pytorch_${CONTAINER_VERSION}-py3.sif \
nccl-tests/build/all_gather_perf \
-b 1K \
-e 8G \
-f 2 \
-g 4
#####################################################################################################################################
# Output: each H100 node has four 200 Gb/s NICs, so for large message sizes we expect a theoretical peak bus bandwidth of 4 x 200 Gb/s = 100 GB/s
#####################################################################################################################################
#
# nccl-tests version 2.17.4 nccl-headers=22707 nccl-library=22707
# Collective test starting: all_gather_perf
# nThread 1 nGpus 4 minBytes 1024 maxBytes 8589934592 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
# Rank 0 Group 0 Pid 3168491 on gcn111 device 0 [0000:06:00] NVIDIA H100
# Rank 1 Group 0 Pid 3168491 on gcn111 device 1 [0000:26:00] NVIDIA H100
# Rank 2 Group 0 Pid 3168491 on gcn111 device 2 [0000:a6:00] NVIDIA H100
# Rank 3 Group 0 Pid 3168491 on gcn111 device 3 [0000:c6:00] NVIDIA H100
# Rank 4 Group 0 Pid 1911780 on gcn114 device 0 [0000:06:00] NVIDIA H100
# Rank 5 Group 0 Pid 1911780 on gcn114 device 1 [0000:26:00] NVIDIA H100
# Rank 6 Group 0 Pid 1911780 on gcn114 device 2 [0000:a6:00] NVIDIA H100
# Rank 7 Group 0 Pid 1911780 on gcn114 device 3 [0000:c6:00] NVIDIA H100
#
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
1024 32 float none -1 23.07 0.04 0.04 0 21.67 0.05 0.04 0
2048 64 float none -1 21.45 0.10 0.08 0 21.61 0.09 0.08 0
4096 128 float none -1 21.51 0.19 0.17 0 21.45 0.19 0.17 0
8192 256 float none -1 21.75 0.38 0.33 0 21.23 0.39 0.34 0
16384 512 float none -1 21.95 0.75 0.65 0 21.62 0.76 0.66 0
32768 1024 float none -1 21.85 1.50 1.31 0 21.34 1.54 1.34 0
65536 2048 float none -1 26.41 2.48 2.17 0 25.91 2.53 2.21 0
131072 4096 float none -1 26.46 4.95 4.33 0 26.86 4.88 4.27 0
262144 8192 float none -1 35.77 7.33 6.41 0 36.20 7.24 6.34 0
524288 16384 float none -1 41.36 12.68 11.09 0 40.43 12.97 11.35 0
1048576 32768 float none -1 43.72 23.98 20.99 0 43.29 24.22 21.20 0
2097152 65536 float none -1 341.41 6.14 5.37 0 52.07 40.28 35.24 0
4194304 131072 float none -1 73.66 56.94 49.83 0 72.82 57.60 50.40 0
8388608 262144 float none -1 114.19 73.46 64.28 0 113.84 73.69 64.48 0
16777216 524288 float none -1 197.10 85.12 74.48 0 208.19 80.59 70.51 0
33554432 1048576 float none -1 377.90 88.79 77.69 0 362.76 92.50 80.94 0
67108864 2097152 float none -1 674.62 99.48 87.04 0 666.21 100.73 88.14 0
134217728 4194304 float none -1 1300.02 103.24 90.34 0 1290.35 104.02 91.01 0
268435456 8388608 float none -1 2499.64 107.39 93.97 0 2489.48 107.83 94.35 0
536870912 16777216 float none -1 4899.79 109.57 95.87 0 4890.40 109.78 96.06 0
1073741824 33554432 float none -1 9701.02 110.68 96.85 0 9692.83 110.78 96.93 0
2147483648 67108864 float none -1 19306.0 111.23 97.33 0 19300.9 111.26 97.36 0
4294967296 134217728 float none -1 38522.4 111.49 97.56 0 38506.8 111.54 97.60 0
8589934592 268435456 float none -1 76975.7 111.59 97.64 0 76978.5 111.59 97.64 0
# Out of bounds values : 0 OK
# Avg bus bandwidth : 45.51
#
# Collective test concluded: all_gather_perf
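# A back-of-the-envelope check on the numbers above, assuming NCCL-tests'
# standard definitions: for all_gather with n ranks, busbw = algbw * (n-1)/n.
# Here n = 8 (2 nodes x 4 GPUs), so the 111.59 GB/s algbw at the 8 GiB message
# size gives 111.59 * 7/8 = 97.64 GB/s busbw, within a few percent of the
# 100 GB/s per-node NIC limit - i.e. the inter-node network is performing as
# expected.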