This example showcases the following actions with apptainer containers:

  1. Downloading a docker container from an external source (NGC), and automatically converting it to an apptainer (.sif) container
  2. Compiling software inside a container using tools included in said container
  3. Launching multi-node container workloads
  4. Correct network performance in a multi-node container workload


# Allocate two H100 GPU nodes (interactive allocation; the following commands
# run inside this allocation)
salloc -p gpu_h100 -N 2 -t 02:00:00 --gpus-per-node=4

# Edit this version tag (NGC PyTorch release, YY.MM) to change the pulled container
CONTAINER_VERSION=25.10

# Prevent xalt warning spam
unset LD_PRELOAD

# Pull the container from the NGC container repository, after which apptainer
# will automatically convert it to a .sif image in the current directory.
# NOTE(review): /dev/shm is RAM-backed and this image's cache is several GB —
# make sure the node has enough free memory, or point the cache at scratch.
export APPTAINER_CACHEDIR=/dev/shm
# The tag must be expanded by the shell: ${CONTAINER_VERSION}, not the
# literal string CONTAINER_VERSION.
apptainer pull "docker://nvcr.io/nvidia/pytorch:${CONTAINER_VERSION}-py3"

# Let's get our tests
git clone https://github.com/NVIDIA/nccl-tests
cd nccl-tests || exit 1

# Compile NCCL tests inside the container. We are inside nccl-tests/ after the
# clone above, so the Makefile is in the current directory and the pulled .sif
# sits one level up.
# apptainer exec                 - Execute command in container
# --nv                           - Bind GPU resources to container
# --env MPI=1                    - Enable MPI for multi-node nccl-tests
# --env MPI_HOME=/usr/local/mpi  - Set MPI_HOME to OpenMPI path inside container
# ../pytorch_${CONTAINER_VERSION}-py3.sif - Container image path (pulled in parent dir)
# make -j                        - Compile tests with parallel jobs
#                                  ('nccl-tests/' is not a make target; the
#                                  Makefile is in the current directory)

apptainer exec \
  --nv \
  --env MPI=1 \
  --env MPI_HOME=/usr/local/mpi \
  "../pytorch_${CONTAINER_VERSION}-py3.sif" \
  make -j


# Execute NCCL all-gather performance test.
# The working directory is still nccl-tests/ (from the cd above), so the test
# binaries live under build/ and the .sif image is one level up.
# srun                           - SLURM's parallel job launcher
# -N $SLURM_NNODES               - Use all allocated SLURM nodes
# --ntasks-per-node=1            - Run 1 task (process) per node
# --gpus-per-node=4              - Allocate 4 GPUs per node
# env NCCL_SOCKET_IFNAME=eno     - Interface name prefix NCCL uses for its
#                                  bootstrap/initialization traffic
# apptainer exec --nv            - Launch container with GPU binding
# ../pytorch_${CONTAINER_VERSION}-py3.sif - Container image path
# build/all_gather_perf          - Executable test binary
# -b 1K                          - Starting message size: 1 KiB
# -e 8G                          - Ending message size: 8 GiB
# -f 2                           - Increase message size in powers of two
# -g 4                           - 4 GPUs per process (task)

srun \
  -N "$SLURM_NNODES" \
  --ntasks-per-node=1 \
  --gpus-per-node=4 \
  env NCCL_SOCKET_IFNAME=eno \
  apptainer exec --nv \
    "../pytorch_${CONTAINER_VERSION}-py3.sif" \
    build/all_gather_perf \
      -b 1K \
      -e 8G \
      -f 2 \
      -g 4



#####################################################################################################################################
# Output: we have 4 x 200 Gb/s NICs per H100 node, so for large message sizes we expect a theoretical bandwidth of 100 GB/s
#####################################################################################################################################
#
# nccl-tests version 2.17.4 nccl-headers=22707 nccl-library=22707
# Collective test starting: all_gather_perf
# nThread 1 nGpus 4 minBytes 1024 maxBytes 8589934592 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0
#
# Using devices
#  Rank  0 Group  0 Pid 3168491 on     gcn111 device  0 [0000:06:00] NVIDIA H100
#  Rank  1 Group  0 Pid 3168491 on     gcn111 device  1 [0000:26:00] NVIDIA H100
#  Rank  2 Group  0 Pid 3168491 on     gcn111 device  2 [0000:a6:00] NVIDIA H100
#  Rank  3 Group  0 Pid 3168491 on     gcn111 device  3 [0000:c6:00] NVIDIA H100
#  Rank  4 Group  0 Pid 1911780 on     gcn114 device  0 [0000:06:00] NVIDIA H100
#  Rank  5 Group  0 Pid 1911780 on     gcn114 device  1 [0000:26:00] NVIDIA H100
#  Rank  6 Group  0 Pid 1911780 on     gcn114 device  2 [0000:a6:00] NVIDIA H100
#  Rank  7 Group  0 Pid 1911780 on     gcn114 device  3 [0000:c6:00] NVIDIA H100
#
#                                                              out-of-place                       in-place
#       size         count      type   redop    root     time   algbw   busbw  #wrong     time   algbw   busbw  #wrong
#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
        1024            32     float    none      -1    23.07    0.04    0.04       0    21.67    0.05    0.04       0
        2048            64     float    none      -1    21.45    0.10    0.08       0    21.61    0.09    0.08       0
        4096           128     float    none      -1    21.51    0.19    0.17       0    21.45    0.19    0.17       0
        8192           256     float    none      -1    21.75    0.38    0.33       0    21.23    0.39    0.34       0
       16384           512     float    none      -1    21.95    0.75    0.65       0    21.62    0.76    0.66       0
       32768          1024     float    none      -1    21.85    1.50    1.31       0    21.34    1.54    1.34       0
       65536          2048     float    none      -1    26.41    2.48    2.17       0    25.91    2.53    2.21       0
      131072          4096     float    none      -1    26.46    4.95    4.33       0    26.86    4.88    4.27       0
      262144          8192     float    none      -1    35.77    7.33    6.41       0    36.20    7.24    6.34       0
      524288         16384     float    none      -1    41.36   12.68   11.09       0    40.43   12.97   11.35       0
     1048576         32768     float    none      -1    43.72   23.98   20.99       0    43.29   24.22   21.20       0
     2097152         65536     float    none      -1   341.41    6.14    5.37       0    52.07   40.28   35.24       0
     4194304        131072     float    none      -1    73.66   56.94   49.83       0    72.82   57.60   50.40       0
     8388608        262144     float    none      -1   114.19   73.46   64.28       0   113.84   73.69   64.48       0
    16777216        524288     float    none      -1   197.10   85.12   74.48       0   208.19   80.59   70.51       0
    33554432       1048576     float    none      -1   377.90   88.79   77.69       0   362.76   92.50   80.94       0
    67108864       2097152     float    none      -1   674.62   99.48   87.04       0   666.21  100.73   88.14       0
   134217728       4194304     float    none      -1  1300.02  103.24   90.34       0  1290.35  104.02   91.01       0
   268435456       8388608     float    none      -1  2499.64  107.39   93.97       0  2489.48  107.83   94.35       0
   536870912      16777216     float    none      -1  4899.79  109.57   95.87       0  4890.40  109.78   96.06       0
  1073741824      33554432     float    none      -1  9701.02  110.68   96.85       0  9692.83  110.78   96.93       0
  2147483648      67108864     float    none      -1  19306.0  111.23   97.33       0  19300.9  111.26   97.36       0
  4294967296     134217728     float    none      -1  38522.4  111.49   97.56       0  38506.8  111.54   97.60       0
  8589934592     268435456     float    none      -1  76975.7  111.59   97.64       0  76978.5  111.59   97.64       0
# Out of bounds values : 0 OK
# Avg bus bandwidth    : 45.51
#
# Collective test concluded: all_gather_perf