Short MPI Tests using Pytorch example code

[antony@gpu5 pytorch-nccl-test]$ cat run_dp_mpi.sh
#!/bin/bash

# mpirun --mca pml ucx -np 8 -H gpu0:4,gpu5:4 ./run_dp_mpi.sh

source ~/dp-build/torch-rocm61/bin/activate

export MASTER_ADDR=172.16.16.42
export MASTER_PORT=25252
export NCCL_SOCKET_IFNAME=enp37s0np0,enp12s0np0,enp180s0np0,enp139s0np0
#export NCCL_DEBUG=INFO
export NCCL_IB_GID_INDEX=3
export NCCL_NET_GDR_LEVEL=1

python3.11 /home/antony/dp-build/pytorch-nccl-test/dp_pytorch_mpi.py

Then the dp_pytorch_mpi.py file

[antony@gpu5 pytorch-nccl-test]$ cat /home/antony/dp-build/pytorch-nccl-test/dp_pytorch_mpi.py
import os
import argparse

import torch
import torch.distributed as dist

# Environment variables set by mpirun
LOCAL_RANK = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
WORLD_SIZE = int(os.environ['OMPI_COMM_WORLD_SIZE'])
WORLD_RANK = int(os.environ['OMPI_COMM_WORLD_RANK'])

def run(backend):
    tensor = torch.zeros(1)

    # Need to put tensor on a GPU device for nccl backend
    if backend == 'nccl':
        device = torch.device("cuda:{}".format(LOCAL_RANK))
        tensor = tensor.to(device)

    if WORLD_RANK == 0:
        for rank_recv in range(1, WORLD_SIZE):
            dist.send(tensor=tensor, dst=rank_recv)
            print('worker_{} sent data to Rank {}\n'.format(0, rank_recv))
    else:
        dist.recv(tensor=tensor, src=0)
        print('worker_{} has received data from rank {}\n'.format(WORLD_RANK, 0))

def init_processes(backend):
    dist.init_process_group(backend, rank=WORLD_RANK, world_size=WORLD_SIZE)
    run(backend)

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int, help="Local rank. Necessary for using the torch.distributed.launch utility.")
    parser.add_argument("--backend", type=str, default="nccl", choices=['nccl', 'gloo'])
    args = parser.parse_args()

    init_processes(backend=args.backend)
    dist.barrier()
    dist.destroy_process_group()
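
The script above reads the OMPI_COMM_WORLD_* variables, so it only runs under mpirun. torchrun (used in the example further down) exports RANK, WORLD_SIZE and LOCAL_RANK instead, so the rank bookkeeping can be made launcher-agnostic with a small helper. A minimal sketch, not part of the file above:

import os

def get_dist_env():
    """Return (world_rank, world_size, local_rank) under either mpirun or torchrun."""
    if 'OMPI_COMM_WORLD_RANK' in os.environ:
        # Launched by Open MPI's mpirun
        return (int(os.environ['OMPI_COMM_WORLD_RANK']),
                int(os.environ['OMPI_COMM_WORLD_SIZE']),
                int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']))
    # Launched by torchrun, which sets these itself
    return (int(os.environ['RANK']),
            int(os.environ['WORLD_SIZE']),
            int(os.environ['LOCAL_RANK']))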


Another Example

Run the job using

(torch-rocm61) [antony@gpu0 pytorch-nccl-test]$ /home/shared/apps/openmpi/5.0.3/bin/mpirun -x SCRATCH=55 -x NCCL_DEBUG=INFO -np 16 -H gpu0:8,gpu3:8 run_dp.sh

Wrapper script

[antony@lighton-login pytorch-nccl-test]$ cat run_dp.sh
#!/bin/bash

source ~/dp-build/torch-rocm61/bin/activate
python3.11 /home/antony/dp-build/pytorch-nccl-test/test_nccl_dp.py

With the following file

# cat test_nccl_dp.py
import os

import torch
import torch.distributed as dist

def init_workers_nccl_file():
    rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
    n_ranks = int(os.environ['OMPI_COMM_WORLD_SIZE'])
    sync_file_dir = '%s/tmp' % os.environ['SCRATCH']
    os.makedirs(sync_file_dir, exist_ok=True)
    sync_file = 'file://%s/pytorch_sync_%s' % (
        sync_file_dir, os.environ['OMPI_COMM_WORLD_SIZE'])
    dist.init_process_group(backend='nccl', world_size=n_ranks, rank=rank,
                            init_method=sync_file)
    return rank, n_ranks

# Print pytorch version
print('Pytorch version', torch.__version__)

# Configuration
ranks_per_node = 8
shape = 2**17
dtype = torch.float32

# Initialize MPI
rank, n_ranks = init_workers_nccl_file()
local_rank = rank % ranks_per_node

# Allocate a small tensor on every gpu from every rank.
# This is an attempt to force creation of all device contexts.
for i in range(ranks_per_node):
    _ = torch.randn(1).to(torch.device('cuda', i))

# Select our gpu
device = torch.device('cuda', local_rank)
print('Rank', rank, 'size', n_ranks, 'device', device, 'count', torch.cuda.device_count())

print('Torch NCCL version', torch.cuda.nccl.version())

# Allocate a tensor on the gpu
x = torch.randn(shape, dtype=dtype).to(device)
print('local result:', x.sum())

# Do a broadcast from rank 0
dist.broadcast(x, 0)
print('broadcast result:', x.sum())

# Do an all-reduce
dist.all_reduce(x)
print('allreduce result:', x.sum())

# close out
dist.barrier()
dist.destroy_process_group()
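
The file:// init method above needs a path (under SCRATCH) that every rank can reach. Without a shared filesystem, the same process group can be formed over TCP instead, as run_dp_mpi.sh does via MASTER_ADDR/MASTER_PORT. A minimal sketch, assuming those two variables are exported by the wrapper script:

import os

import torch.distributed as dist

def init_workers_env():
    """Initialise the NCCL process group via MASTER_ADDR/MASTER_PORT instead of a sync file."""
    rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
    n_ranks = int(os.environ['OMPI_COMM_WORLD_SIZE'])
    # 'env://' reads MASTER_ADDR and MASTER_PORT, e.g. 172.16.16.42:25252 as in run_dp_mpi.sh
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=n_ranks, rank=rank)
    return rank, n_ranks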

torchrun example

Notes from Victor at AMD

# Environment (PyTorch installed from a pip wheel):
conda create -n torchrun python=3.9
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0
# Log into GPU0 and run:
export NCCL_SOCKET_IFNAME="enp12s0np0,enp139s0np0,enp180s0np0,enp37s0np0"
export NCCL_IB_GID_INDEX=3
export NCCL_NET_GDR_LEVEL=1
conda activate torchrun
torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_port=29600 --master_addr=172.16.16.38 /home/shared/scripts/multinode.py 50 10
# Log into GPU5 and run:
export NCCL_SOCKET_IFNAME="enp12s0np0,enp139s0np0,enp180s0np0,enp37s0np0"
export NCCL_IB_GID_INDEX=3
export NCCL_NET_GDR_LEVEL=1
conda activate torchrun
torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_port=29600 --master_addr=172.16.16.38 /home/shared/scripts/multinode.py 50 10
You're probably already familiar with the NCCL env vars, so I won't elaborate on those. But to get PyTorch to work for me I had to specify the IB GID index as 3 and force GPU Direct RDMA to be used only when the GPU and NIC are on the same PCI switch (by setting the NCCL_NET_GDR_LEVEL env var to 1).
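
If exporting the variables in the shell is awkward, the same NCCL settings can be pinned from inside the Python script instead, as long as this happens before init_process_group() creates the communicator. A sketch using the values from the commands above:

import os

# NCCL/RCCL reads these when the communicator is created, so set them before init_process_group()
os.environ.setdefault("NCCL_SOCKET_IFNAME", "enp12s0np0,enp139s0np0,enp180s0np0,enp37s0np0")
os.environ.setdefault("NCCL_IB_GID_INDEX", "3")
os.environ.setdefault("NCCL_NET_GDR_LEVEL", "1")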

The multinode.py file (referenced by the torchrun commands above)

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os
from torch.distributed.elastic.multiprocessing.errors import record


class MyTrainDataset(Dataset):
    """Custom Dataset for training data."""

    def __init__(self, size):
        """
        Initialize the dataset with random data.

        Args:
            size (int): The size of the dataset.
        """
        self.size = size
        self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]

    def __len__(self):
        """Return the size of the dataset."""
        return self.size

    def __getitem__(self, index):
        """
        Get an item from the dataset at a given index.

        Args:
            index (int): The index of the item.

        Returns:
            tuple: A tuple containing the input data and target.
        """
        return self.data[index]


def ddp_setup():
    """Set up the distributed data parallel (DDP) environment."""
    init_process_group(backend="nccl")
    print("LOCAL RANK", int(os.environ["LOCAL_RANK"]))
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))


class Trainer:
    """Trainer class to handle training loop, snapshots, and DDP."""

    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        optimizer: torch.optim.Optimizer,
        save_every: int,
        snapshot_path: str,
    ) -> None:
        """
        Initialize the Trainer.

        Args:
            model (torch.nn.Module): The model to train.
            train_data (DataLoader): The DataLoader for training data.
            optimizer (torch.optim.Optimizer): The optimizer for training.
            save_every (int): How often to save snapshots.
            snapshot_path (str): Path to save the snapshots.
        """
        self.local_rank = int(os.environ["LOCAL_RANK"])
        self.global_rank = int(os.environ["RANK"])
        self.model = model.to(self.local_rank)
        self.train_data = train_data
        self.optimizer = optimizer
        self.save_every = save_every
        self.epochs_run = 0
        self.snapshot_path = snapshot_path

        if os.path.exists(snapshot_path):
            print("Loading snapshot")
            self._load_snapshot(snapshot_path)

        self.model = DDP(self.model, device_ids=[self.local_rank])

    def _load_snapshot(self, snapshot_path):
        """
        Load a training snapshot to resume training.

        Args:
            snapshot_path (str): Path to the snapshot file.
        """
        loc = f"cuda:{self.local_rank}"
        snapshot = torch.load(snapshot_path, map_location=loc)
        self.model.load_state_dict(snapshot["MODEL_STATE"])
        self.epochs_run = snapshot["EPOCHS_RUN"]
        print(f"Resuming training from snapshot at Epoch {self.epochs_run}")

    def _run_batch(self, source, targets):
        """
        Run a single batch through the model.

        Args:
            source (torch.Tensor): Input data.
            targets (torch.Tensor): Target data.
        """
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, targets)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        """
        Run a single epoch of training.

        Args:
            epoch (int): The current epoch number.
        """
        b_sz = len(next(iter(self.train_data))[0])
        print(
            f"[GPU{self.global_rank}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}"
        )
        self.train_data.sampler.set_epoch(epoch)
        for source, targets in self.train_data:
            source = source.to(self.local_rank)
            targets = targets.to(self.local_rank)
            self._run_batch(source, targets)

    def _save_snapshot(self, epoch):
        """
        Save a snapshot of the model and training state.

        Args:
            epoch (int): The current epoch number.
        """
        snapshot = {
            "MODEL_STATE": self.model.module.state_dict(),
            "EPOCHS_RUN": epoch,
        }
        torch.save(snapshot, self.snapshot_path)
        print(f"Epoch {epoch} | Training snapshot saved at {self.snapshot_path}")

    def train(self, max_epochs: int):
        """
        Run the training loop for a given number of epochs.

        Args:
            max_epochs (int): The total number of epochs to train.
        """
        for epoch in range(self.epochs_run, max_epochs):
            self._run_epoch(epoch)
            if self.local_rank == 0 and epoch % self.save_every == 0:
                self._save_snapshot(epoch)


def load_train_objs():
    """
    Load the training objects: dataset, model, and optimizer.

    Returns:
        tuple: A tuple containing the dataset, model, and optimizer.
    """
    train_set = MyTrainDataset(2048)  # load your dataset
    model = torch.nn.Linear(20, 1)  # load your model
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    return train_set, model, optimizer


def prepare_dataloader(dataset: Dataset, batch_size: int):
    """
    Prepare the DataLoader for the dataset.

    Args:
        dataset (Dataset): The dataset to load.
        batch_size (int): The batch size for the DataLoader.

    Returns:
        DataLoader: The prepared DataLoader.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False,
        sampler=DistributedSampler(dataset),
    )


@record
def main(
    save_every: int,
    total_epochs: int,
    batch_size: int,
    snapshot_path: str = "snapshot.pt",
):
    """
    Main function to set up DDP, load data, and start training.

    Args:
        save_every (int): How often to save snapshots.
        total_epochs (int): The total number of epochs to train.
        batch_size (int): The batch size for training.
        snapshot_path (str, optional): Path to save snapshots. Defaults to "snapshot.pt".
    """
    ddp_setup()
    dataset, model, optimizer = load_train_objs()
    train_data = prepare_dataloader(dataset, batch_size)
    trainer = Trainer(model, train_data, optimizer, save_every, snapshot_path)
    trainer.train(total_epochs)
    destroy_process_group()


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Simple distributed training job")
    parser.add_argument(
        "total_epochs", type=int, help="Total epochs to train the model"
    )
    parser.add_argument("save_every", type=int, help="How often to save a snapshot")
    parser.add_argument(
        "--batch_size",
        default=32,
        type=int,
        help="Input batch size on each device (default: 32)",
    )
    args = parser.parse_args()

    main(args.save_every, args.total_epochs, args.batch_size)
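
After a run, the snapshot.pt written by _save_snapshot can be inspected offline to confirm what was saved. A minimal sketch, assuming snapshot.pt is in the current directory:

import torch

# Load the snapshot written by Trainer._save_snapshot onto the CPU and look inside
snapshot = torch.load("snapshot.pt", map_location="cpu")
print("epochs run:", snapshot["EPOCHS_RUN"])
print("parameter shapes:", {k: tuple(v.shape) for k, v in snapshot["MODEL_STATE"].items()})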

Full output from a two-rank run of the NCCL test script (test_nccl_dp.py) above, here named test-nccl.py and launched inside a Singularity container. Each rank first prints its own local sum, the broadcast leaves every rank with rank 0's tensor, and the allreduce sum is then world_size times the broadcast sum (2 × 618.5157 ≈ 1237.0315 in the log).

[hpc@nscale-perf-cluster-compute-top500-15 nccl-python-testing]$ mpirun -x SCRATCH=55 -x NCCL_DEBUG=INFO -np 2 singularity run /tmp/dp-singularity/pytorch-nightly_latest.sif python ./test-nccl.py
Pytorch version 2.6.0a0+gitd2207c5
Pytorch version 2.6.0a0+gitd2207c5
Rank 0 size 2 device cuda:0 count 8
Torch NCCL version (2, 20, 5)
local result: tensor(618.5157, device='cuda:0')
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO Bootstrap : Using enp37s0np0:172.16.18.118<0>
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO NET/Plugin : dlerror=librccl-net.so: cannot open shared object file: No such file or directory No plugin found (librccl-net.so), using internal implementation

nscale-perf-cluster-compute-top500-15:142876:142876 [0] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/init.cc:115 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO Kernel version: 4.18.0-513.24.1.el8_9.x86_64
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO ROCr version 1.14
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO Dmabuf feature disabled without NCCL_DMABUF_ENABLE=1
RCCL version 2.20.5+hip6.2 HEAD:45b618a+
Rank 1 size 2 device cuda:1 count 8
Torch NCCL version (2, 20, 5)
local result: tensor(-332.1474, device='cuda:1')
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO ROCr version 1.14
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO Dmabuf feature disabled without NCCL_DMABUF_ENABLE=1
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO Bootstrap : Using enp37s0np0:172.16.18.118<0>
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO NET/Plugin : dlerror=librccl-net.so: cannot open shared object file: No such file or directory No plugin found (librccl-net.so), using internal implementation

nscale-perf-cluster-compute-top500-15:142875:142875 [1] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/init.cc:115 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO Kernel version: 4.18.0-513.24.1.el8_9.x86_64
libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO NET/IB : No device found.
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO NET/Socket : Using [0]enp37s0np0:172.16.18.118<0> [1]enp12s0np0:172.16.18.120<0> [2]enp180s0np0:172.16.18.116<0> [3]enp139s0np0:172.16.18.114<0> [4]bond0:192.168.7.139<0>
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Using non-device net plugin version 0
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Using network Socket
libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO NET/IB : No device found.
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO NET/Socket : Using [0]enp37s0np0:172.16.18.118<0> [1]enp12s0np0:172.16.18.120<0> [2]enp180s0np0:172.16.18.116<0> [3]enp139s0np0:172.16.18.114<0> [4]bond0:192.168.7.139<0>
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Using non-device net plugin version 0
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Using network Socket
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO comm 0xbf44640 rank 1 nranks 2 cudaDev 1 busId 34000 commId 0xfe20596e81efe06c - Init START
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO comm 0xa966300 rank 0 nranks 2 cudaDev 0 busId 31000 commId 0xfe20596e81efe06c - Init START
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 8; gpu_id = 14571; unique_id = 1707942351238735562; location_id = 12544; bdf = 12544; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 10; gpu_id = 57586; unique_id = 1879117630712565286; location_id = 4352; bdf = 4352; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 12; gpu_id = 41154; unique_id = 12764043078489004147; location_id = 44544; bdf = 44544; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 15; gpu_id = 8466; unique_id = 14583621907896286631; location_id = 37632; bdf = 37632; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 9; gpu_id = 27432; unique_id = 16692116773638566847; location_id = 13312; bdf = 13312; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 13; gpu_id = 63755; unique_id = 16987650277263352869; location_id = 45824; bdf = 45824; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 14; gpu_id = 30939; unique_id = 17411672822274508524; location_id = 36352; bdf = 36352; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 11; gpu_id = 45873; unique_id = 17784246647738402691; location_id = 5120; bdf = 5120; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 8; gpu_id = 14571; unique_id = 1707942351238735562; location_id = 12544; bdf = 12544; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 10; gpu_id = 57586; unique_id = 1879117630712565286; location_id = 4352; bdf = 4352; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 12; gpu_id = 41154; unique_id = 12764043078489004147; location_id = 44544; bdf = 44544; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 15; gpu_id = 8466; unique_id = 14583621907896286631; location_id = 37632; bdf = 37632; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 9; gpu_id = 27432; unique_id = 16692116773638566847; location_id = 13312; bdf = 13312; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 13; gpu_id = 63755; unique_id = 16987650277263352869; location_id = 45824; bdf = 45824; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 14; gpu_id = 30939; unique_id = 17411672822274508524; location_id = 36352; bdf = 36352; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 11; gpu_id = 45873; unique_id = 17784246647738402691; location_id = 5120; bdf = 5120; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO initialized internal alternative rsmi functionality
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO initialized internal alternative rsmi functionality
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO comm 0xbf44640 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] -1/-1/-1->1->0 [3] 0/-1/-1->1->-1 [4] 0/-1/-1->1->-1 [5] 0/-1/-1->1->-1 [6] -1/-1/-1->1->0 [7] -1/-1/-1->1->0 [8] -1/-1/-1->1->0 [9] 0/-1/-1->1->-1 [10] 0/-1/-1->1->-1 [11] 0/-1/-1->1->-1 [12] -1/-1/-1->1->0 [13] -1/-1/-1->1->0 [14] -1/-1/-1->1->0 [15] 0/-1/-1->1->-1 [16] 0/-1/-1->1->-1 [17] 0/-1/-1->1->-1 [18] -1/-1/-1->1->0 [19] -1/-1/-1->1->0 [20] -1/-1/-1->1->0 [21] 0/-1/-1->1->-1 [22] 0/-1/-1->1->-1 [23] 0/-1/-1->1->-1 comm 0xbf44640 nRanks 02 busId 34000
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO P2P Chunksize set to 524288
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO comm 0xa966300 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 00/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 01/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 02/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 03/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 04/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 05/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 06/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 07/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 08/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 09/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 10/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 11/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 12/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 13/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 14/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 15/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 16/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 17/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 18/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 19/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 20/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 21/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 22/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 23/24 :    0   1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] -1/-1/-1->0->1 [4] -1/-1/-1->0->1 [5] -1/-1/-1->0->1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] -1/-1/-1->0->1 [10] -1/-1/-1->0->1 [11] -1/-1/-1->0->1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] -1/-1/-1->0->1 [16] -1/-1/-1->0->1 [17] -1/-1/-1->0->1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] -1/-1/-1->0->1 [22] -1/-1/-1->0->1 [23] -1/-1/-1->0->1 comm 0xa966300 nRanks 02 busId 31000
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO P2P Chunksize set to 524288
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 00/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 01/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 00/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 02/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 03/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 01/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 04/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 02/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 05/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 03/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 06/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 04/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 07/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 08/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 05/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 09/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 06/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 07/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 10/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 08/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 11/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 09/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 12/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 10/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 13/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 11/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 14/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 12/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 15/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 13/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 16/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 14/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 17/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 15/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 18/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 16/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 19/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 17/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 20/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 18/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 21/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 19/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 22/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 23/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 20/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 21/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 22/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 23/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Connected all rings comm 0xbf44640 nRanks 02 busId 34000
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Connected all trees
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 256 | 256
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 8 p2p channels per peer
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Connected all rings comm 0xa966300 nRanks 02 busId 31000
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Connected all trees
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 256 | 256
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 8 p2p channels per peer
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO comm 0xbf44640 rank 1 nranks 2 cudaDev 1 busId 34000 commId 0xfe20596e81efe06c localSize 296 used 469861984 bytes - Init COMPLETE
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO comm 0xa966300 rank 0 nranks 2 cudaDev 0 busId 31000 commId 0xfe20596e81efe06c localSize 296 used 469861984 bytes - Init COMPLETE
broadcast result: tensor(618.5157, device='cuda:0')
broadcast result: tensor(618.5157, device='cuda:1')
allreduce result: tensor(1237.0315, device='cuda:0')
allreduce result: tensor(1237.0315, device='cuda:1')
nscale-perf-cluster-compute-top500-15:142875:142932 [1] NCCL INFO [Service thread] Connection closed by localRank 1
nscale-perf-cluster-compute-top500-15:142876:142931 [0] NCCL INFO [Service thread] Connection closed by localRank 0
nscale-perf-cluster-compute-top500-15:142875:142937 [1] NCCL INFO comm 0xbf44640 rank 1 nranks 2 cudaDev 1 busId 34000 - Abort COMPLETE
nscale-perf-cluster-compute-top500-15:142876:142938 [0] NCCL INFO comm 0xa966300 rank 0 nranks 2 cudaDev 0 busId 31000 - Abort COMPLETE