Short MPI Tests using Pytorch example code
The wrapper script run_dp_mpi.sh (the mpirun invocation used to launch it is kept in the comment):
<pre>
[antony@gpu5 pytorch-nccl-test]$ cat run_dp_mpi.sh
#!/bin/bash
# mpirun --mca pml ucx -np 8 -H gpu0:4,gpu5:4 ./run_dp_mpi.sh
source ~/dp-build/torch-rocm61/bin/activate

export MASTER_ADDR=172.16.16.42
export MASTER_PORT=25252
export NCCL_SOCKET_IFNAME=enp37s0np0,enp12s0np0,enp180s0np0,enp139s0np0
#export NCCL_DEBUG=INFO
export NCCL_IB_GID_INDEX=3
export NCCL_NET_GDR_LEVEL=1

python3.11 /home/antony/dp-build/pytorch-nccl-test/dp_pytorch_mpi.py
</pre>
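For reference, dp_pytorch_mpi.py (below) passes rank and world_size to init_process_group() explicitly, so PyTorch falls back to its default env:// rendezvous, which only needs MASTER_ADDR and MASTER_PORT; that is why the wrapper exports them. A minimal sketch of that mechanism, with placeholder address/port and a single rank:
<pre>
# Minimal sketch of the env:// rendezvous (placeholder values, not from the wrapper).
# With rank and world_size given explicitly, the default init_method ("env://")
# reads only MASTER_ADDR and MASTER_PORT from the environment.
import os
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "25252")

dist.init_process_group(backend="gloo", rank=0, world_size=1)
dist.destroy_process_group()
</pre>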
Then the dp_pytorch_mpi.py file:
<pre>
[antony@gpu5 pytorch-nccl-test]$ cat /home/antony/dp-build/pytorch-nccl-test/dp_pytorch_mpi.py
import os
import argparse
import torch
import torch.distributed as dist

# Environment variables set by mpirun
LOCAL_RANK = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
WORLD_SIZE = int(os.environ['OMPI_COMM_WORLD_SIZE'])
WORLD_RANK = int(os.environ['OMPI_COMM_WORLD_RANK'])


def run(backend):
    tensor = torch.zeros(1)
    # Need to put tensor on a GPU device for nccl backend
    if backend == 'nccl':
        device = torch.device("cuda:{}".format(LOCAL_RANK))
        tensor = tensor.to(device)

    if WORLD_RANK == 0:
        for rank_recv in range(1, WORLD_SIZE):
            dist.send(tensor=tensor, dst=rank_recv)
            print('worker_{} sent data to Rank {}\n'.format(0, rank_recv))
    else:
        dist.recv(tensor=tensor, src=0)
        print('worker_{} has received data from rank {}\n'.format(WORLD_RANK, 0))


def init_processes(backend):
    dist.init_process_group(backend, rank=WORLD_RANK, world_size=WORLD_SIZE)
    run(backend)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int,
                        help="Local rank. Necessary for using the torch.distributed.launch utility.")
    parser.add_argument("--backend", type=str, default="nccl", choices=['nccl', 'gloo'])
    args = parser.parse_args()

    init_processes(backend=args.backend)
    dist.barrier()
    dist.destroy_process_group()
</pre>
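Since the script exposes --backend gloo, the MPI plumbing can be smoke-tested on CPU before involving the GPUs. A hypothetical invocation, reusing the host list from the wrapper's comment (MASTER_ADDR and MASTER_PORT still need to be exported, e.g. by the same wrapper):
<pre>
# Hypothetical CPU-only smoke test: same script, gloo backend, no GPUs needed
mpirun --mca pml ucx -np 8 -H gpu0:4,gpu5:4 \
    python3.11 /home/antony/dp-build/pytorch-nccl-test/dp_pytorch_mpi.py --backend gloo
</pre>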
== Another Example ==
Run the job using:
<pre>
(torch-rocm61) [antony@gpu0 pytorch-nccl-test]$ /home/shared/apps/openmpi/5.0.3/bin/mpirun -x SCRATCH=55 -x NCCL_DEBUG=INFO -np 16 -H gpu0:8,gpu3:8 run_dp.sh
</pre>
Wrapper script:
<pre>
[antony@lighton-login pytorch-nccl-test]$ cat run_dp.sh
#!/bin/bash
source ~/dp-build/torch-rocm61/bin/activate
python3.11 /home/antony/dp-build/pytorch-nccl-test/test_nccl_dp.py
</pre>
With the following file:
<pre>
# cat test_nccl_dp.py
import os
import torch
import torch.distributed as dist


def init_workers_nccl_file():
    rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
    n_ranks = int(os.environ['OMPI_COMM_WORLD_SIZE'])
    sync_file_dir = '%s/tmp' % os.environ['SCRATCH']
    os.makedirs(sync_file_dir, exist_ok=True)
    # Note: must be OMPI_COMM_WORLD_SIZE; OMPI_WORLD_SIZE is not set by mpirun
    sync_file = 'file://%s/pytorch_sync_%s' % (
        sync_file_dir, os.environ['OMPI_COMM_WORLD_SIZE'])
    dist.init_process_group(backend='nccl', world_size=n_ranks, rank=rank,
                            init_method=sync_file)
    return rank, n_ranks


# Print pytorch version
print('Pytorch version', torch.__version__)

# Configuration
ranks_per_node = 8
shape = 2**17
dtype = torch.float32

# Initialize MPI
rank, n_ranks = init_workers_nccl_file()
local_rank = rank % ranks_per_node

# Allocate a small tensor on every gpu from every rank.
# This is an attempt to force creation of all device contexts.
for i in range(ranks_per_node):
    _ = torch.randn(1).to(torch.device('cuda', i))

# Select our gpu
device = torch.device('cuda', local_rank)
print('Rank', rank, 'size', n_ranks, 'device', device, 'count', torch.cuda.device_count())
print('Torch NCCL version', torch.cuda.nccl.version())

# Allocate a tensor on the gpu
x = torch.randn(shape, dtype=dtype).to(device)
print('local result:', x.sum())

# Do a broadcast from rank 0
dist.broadcast(x, 0)
print('broadcast result:', x.sum())

# Do an all-reduce
dist.all_reduce(x)
print('allreduce result:', x.sum())

# close out
dist.barrier()
dist.destroy_process_group()
</pre>
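To get a rough bandwidth number out of the same test, the all-reduce can be timed over a few iterations. A minimal sketch that could be appended before the close-out above (the iteration count is arbitrary, and the bus-bandwidth formula is the standard ring all-reduce estimate, not something from the original script):
<pre>
import time

def time_allreduce(x, iters=50):
    # Assumes dist.init_process_group() has already run and x is on this rank's GPU.
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        dist.all_reduce(x)
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
    # A ring all-reduce moves roughly 2*(n-1)/n of the tensor bytes over the bus.
    n = dist.get_world_size()
    bus_bytes = 2 * (n - 1) / n * x.element_size() * x.numel()
    print('avg all_reduce: %.3f ms, ~%.2f GB/s bus bandwidth'
          % (1e3 * elapsed / iters, bus_bytes * iters / elapsed / 1e9))

time_allreduce(x)
</pre>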
== torchrun example ==
Notes from Victor at AMD
<pre>
# Environment:
conda create -n torchrun python=3.9
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0

# Log into GPU0 and run:
export NCCL_SOCKET_IFNAME="enp12s0np0,enp139s0np0,enp180s0np0,enp37s0np0"
export NCCL_IB_GID_INDEX=3
export NCCL_NET_GDR_LEVEL=1
conda activate torchrun
torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_port=29600 --master_addr=172.16.16.38 /home/shared/scripts/multinode.py 50 10

# Log into GPU5 and run:
export NCCL_SOCKET_IFNAME="enp12s0np0,enp139s0np0,enp180s0np0,enp37s0np0"
export NCCL_IB_GID_INDEX=3
export NCCL_NET_GDR_LEVEL=1
conda activate torchrun
torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_port=29600 --master_addr=172.16.16.38 /home/shared/scripts/multinode.py 50 10
</pre>
"You're probably already familiar with the NCCL env vars so I won't elaborate on those. But to get Pytorch to work for me I had to specify the IB GID index as 3 and force GPU Direct RDMA to be used only when the GPU and NIC are on the same PCI switch (with the NCCL_NET_GDR_LEVEL env var set to 1)."
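Before launching the full training script it can be worth validating the rendezvous and the NCCL path with a minimal all-reduce. The following is a hypothetical check script (call it check_nccl.py; it is not part of Victor's notes) that can be launched with the same torchrun arguments as above, substituting the script path:
<pre>
# check_nccl.py -- hypothetical minimal torchrun/NCCL sanity check
import os
import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")   # torchrun supplies rank/world size via env vars
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

x = torch.ones(1, device=f"cuda:{local_rank}")
dist.all_reduce(x)
# If traffic flows across both nodes, every rank prints the world size (16 here).
print(f"rank {dist.get_rank()}: all_reduce -> {x.item()}")
dist.destroy_process_group()
</pre>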
The multinode.py file:
<pre>
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os
from torch.distributed.elastic.multiprocessing.errors import record


class MyTrainDataset(Dataset):
    """Custom Dataset for training data."""

    def __init__(self, size):
        """
        Initialize the dataset with random data.

        Args:
            size (int): The size of the dataset.
        """
        self.size = size
        self.data = [(torch.rand(20), torch.rand(1)) for _ in range(size)]

    def __len__(self):
        """Return the size of the dataset."""
        return self.size

    def __getitem__(self, index):
        """
        Get an item from the dataset at a given index.

        Args:
            index (int): The index of the item.

        Returns:
            tuple: A tuple containing the input data and target.
        """
        return self.data[index]


def ddp_setup():
    """Set up the distributed data parallel (DDP) environment."""
    init_process_group(backend="nccl")
    print("LOCAL RANK", int(os.environ["LOCAL_RANK"]))
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))


class Trainer:
    """Trainer class to handle training loop, snapshots, and DDP."""

    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        optimizer: torch.optim.Optimizer,
        save_every: int,
        snapshot_path: str,
    ) -> None:
        """
        Initialize the Trainer.

        Args:
            model (torch.nn.Module): The model to train.
            train_data (DataLoader): The DataLoader for training data.
            optimizer (torch.optim.Optimizer): The optimizer for training.
            save_every (int): How often to save snapshots.
            snapshot_path (str): Path to save the snapshots.
        """
        self.local_rank = int(os.environ["LOCAL_RANK"])
        self.global_rank = int(os.environ["RANK"])
        self.model = model.to(self.local_rank)
        self.train_data = train_data
        self.optimizer = optimizer
        self.save_every = save_every
        self.epochs_run = 0
        self.snapshot_path = snapshot_path

        if os.path.exists(snapshot_path):
            print("Loading snapshot")
            self._load_snapshot(snapshot_path)

        self.model = DDP(self.model, device_ids=[self.local_rank])

    def _load_snapshot(self, snapshot_path):
        """
        Load a training snapshot to resume training.

        Args:
            snapshot_path (str): Path to the snapshot file.
        """
        loc = f"cuda:{self.local_rank}"
        snapshot = torch.load(snapshot_path, map_location=loc)
        self.model.load_state_dict(snapshot["MODEL_STATE"])
        self.epochs_run = snapshot["EPOCHS_RUN"]
        print(f"Resuming training from snapshot at Epoch {self.epochs_run}")

    def _run_batch(self, source, targets):
        """
        Run a single batch through the model.

        Args:
            source (torch.Tensor): Input data.
            targets (torch.Tensor): Target data.
        """
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = F.cross_entropy(output, targets)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        """
        Run a single epoch of training.

        Args:
            epoch (int): The current epoch number.
        """
        b_sz = len(next(iter(self.train_data))[0])
        print(
            f"[GPU{self.global_rank}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}"
        )
        self.train_data.sampler.set_epoch(epoch)
        for source, targets in self.train_data:
            source = source.to(self.local_rank)
            targets = targets.to(self.local_rank)
            self._run_batch(source, targets)

    def _save_snapshot(self, epoch):
        """
        Save a snapshot of the model and training state.

        Args:
            epoch (int): The current epoch number.
        """
        snapshot = {
            "MODEL_STATE": self.model.module.state_dict(),
            "EPOCHS_RUN": epoch,
        }
        torch.save(snapshot, self.snapshot_path)
        print(f"Epoch {epoch} | Training snapshot saved at {self.snapshot_path}")

    def train(self, max_epochs: int):
        """
        Run the training loop for a given number of epochs.

        Args:
            max_epochs (int): The total number of epochs to train.
        """
        for epoch in range(self.epochs_run, max_epochs):
            self._run_epoch(epoch)
            if self.local_rank == 0 and epoch % self.save_every == 0:
                self._save_snapshot(epoch)


def load_train_objs():
    """
    Load the training objects: dataset, model, and optimizer.

    Returns:
        tuple: A tuple containing the dataset, model, and optimizer.
    """
    train_set = MyTrainDataset(2048)  # load your dataset
    model = torch.nn.Linear(20, 1)  # load your model
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    return train_set, model, optimizer


def prepare_dataloader(dataset: Dataset, batch_size: int):
    """
    Prepare the DataLoader for the dataset.

    Args:
        dataset (Dataset): The dataset to load.
        batch_size (int): The batch size for the DataLoader.

    Returns:
        DataLoader: The prepared DataLoader.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False,
        sampler=DistributedSampler(dataset),
    )


@record
def main(
    save_every: int,
    total_epochs: int,
    batch_size: int,
    snapshot_path: str = "snapshot.pt",
):
    """
    Main function to set up DDP, load data, and start training.

    Args:
        save_every (int): How often to save snapshots.
        total_epochs (int): The total number of epochs to train.
        batch_size (int): The batch size for training.
        snapshot_path (str, optional): Path to save snapshots. Defaults to "snapshot.pt".
    """
    ddp_setup()
    dataset, model, optimizer = load_train_objs()
    train_data = prepare_dataloader(dataset, batch_size)
    trainer = Trainer(model, train_data, optimizer, save_every, snapshot_path)
    trainer.train(total_epochs)
    destroy_process_group()


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Simple distributed training job")
    parser.add_argument(
        "total_epochs", type=int, help="Total epochs to train the model"
    )
    parser.add_argument("save_every", type=int, help="How often to save a snapshot")
    parser.add_argument(
        "--batch_size",
        default=32,
        type=int,
        help="Input batch size on each device (default: 32)",
    )
    args = parser.parse_args()

    main(args.save_every, args.total_epochs, args.batch_size)
</pre>
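For a quick single-node smoke test before going multi-node, torchrun's standalone mode avoids the master address/port arguments entirely; a possible invocation (the positional arguments mirror the two-node commands above):
<pre>
# Hypothetical single-node smoke test on one 8-GPU machine
torchrun --standalone --nproc_per_node=8 /home/shared/scripts/multinode.py 50 10
</pre>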
Full output
<pre>
[hpc@nscale-perf-cluster-compute-top500-15 nccl-python-testing]$ mpirun -x SCRATCH=55 -x NCCL_DEBUG=INFO -np 2 singularity run /tmp/dp-singularity/pytorch-nightly_latest.sif python ./test-nccl.py
Pytorch version 2.6.0a0+gitd2207c5
Pytorch version 2.6.0a0+gitd2207c5
Rank 0 size 2 device cuda:0 count 8
Torch NCCL version (2, 20, 5)
local result: tensor(618.5157, device='cuda:0')
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO Bootstrap : Using enp37s0np0:172.16.18.118<0>
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO NET/Plugin : dlerror=librccl-net.so: cannot open shared object file: No such file or directory No plugin found (librccl-net.so), using internal implementation
nscale-perf-cluster-compute-top500-15:142876:142876 [0] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/init.cc:115 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO Kernel version: 4.18.0-513.24.1.el8_9.x86_64
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO ROCr version 1.14
nscale-perf-cluster-compute-top500-15:142876:142876 [0] NCCL INFO Dmabuf feature disabled without NCCL_DMABUF_ENABLE=1
RCCL version 2.20.5+hip6.2 HEAD:45b618a+
Rank 1 size 2 device cuda:1 count 8
Torch NCCL version (2, 20, 5)
local result: tensor(-332.1474, device='cuda:1')
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO ROCr version 1.14
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO Dmabuf feature disabled without NCCL_DMABUF_ENABLE=1
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO Bootstrap : Using enp37s0np0:172.16.18.118<0>
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO NET/Plugin : dlerror=librccl-net.so: cannot open shared object file: No such file or directory No plugin found (librccl-net.so), using internal implementation
nscale-perf-cluster-compute-top500-15:142875:142875 [1] /long_pathname_so_that_rpms_can_package_the_debug_info/src/extlibs/rccl/build/hipify/src/init.cc:115 NCCL WARN NUMA auto balancing enabled which can lead to variability in the RCCL performance! Disable by "sudo sysctl kernel.numa_balancing=0"
nscale-perf-cluster-compute-top500-15:142875:142875 [1] NCCL INFO Kernel version: 4.18.0-513.24.1.el8_9.x86_64
libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO NET/IB : No device found.
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO NET/Socket : Using [0]enp37s0np0:172.16.18.118<0> [1]enp12s0np0:172.16.18.120<0> [2]enp180s0np0:172.16.18.116<0> [3]enp139s0np0:172.16.18.114<0> [4]bond0:192.168.7.139<0>
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Using non-device net plugin version 0
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Using network Socket
libibverbs: Warning: couldn't open config directory '/etc/libibverbs.d'.
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO NET/IB : No device found.
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO NET/Socket : Using [0]enp37s0np0:172.16.18.118<0> [1]enp12s0np0:172.16.18.120<0> [2]enp180s0np0:172.16.18.116<0> [3]enp139s0np0:172.16.18.114<0> [4]bond0:192.168.7.139<0>
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Using non-device net plugin version 0
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Using network Socket
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO comm 0xbf44640 rank 1 nranks 2 cudaDev 1 busId 34000 commId 0xfe20596e81efe06c - Init START
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO comm 0xa966300 rank 0 nranks 2 cudaDev 0 busId 31000 commId 0xfe20596e81efe06c - Init START
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 8; gpu_id = 14571; unique_id = 1707942351238735562; location_id = 12544; bdf = 12544; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 10; gpu_id = 57586; unique_id = 1879117630712565286; location_id = 4352; bdf = 4352; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 12; gpu_id = 41154; unique_id = 12764043078489004147; location_id = 44544; bdf = 44544; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 15; gpu_id = 8466; unique_id = 14583621907896286631; location_id = 37632; bdf = 37632; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 9; gpu_id = 27432; unique_id = 16692116773638566847; location_id = 13312; bdf = 13312; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 13; gpu_id = 63755; unique_id = 16987650277263352869; location_id = 45824; bdf = 45824; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 14; gpu_id = 30939; unique_id = 17411672822274508524; location_id = 36352; bdf = 36352; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO [node_id = 11; gpu_id = 45873; unique_id = 17784246647738402691; location_id = 5120; bdf = 5120; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 8; gpu_id = 14571; unique_id = 1707942351238735562; location_id = 12544; bdf = 12544; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 10; gpu_id = 57586; unique_id = 1879117630712565286; location_id = 4352; bdf = 4352; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 12; gpu_id = 41154; unique_id = 12764043078489004147; location_id = 44544; bdf = 44544; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 15; gpu_id = 8466; unique_id = 14583621907896286631; location_id = 37632; bdf = 37632; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 9; gpu_id = 27432; unique_id = 16692116773638566847; location_id = 13312; bdf = 13312; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 13; gpu_id = 63755; unique_id = 16987650277263352869; location_id = 45824; bdf = 45824; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 14; gpu_id = 30939; unique_id = 17411672822274508524; location_id = 36352; bdf = 36352; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO [node_id = 11; gpu_id = 45873; unique_id = 17784246647738402691; location_id = 5120; bdf = 5120; domain = 0; partition = 0],
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO initialized internal alternative rsmi functionality
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO initialized internal alternative rsmi functionality
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO comm 0xbf44640 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] -1/-1/-1->1->0 [3] 0/-1/-1->1->-1 [4] 0/-1/-1->1->-1 [5] 0/-1/-1->1->-1 [6] -1/-1/-1->1->0 [7] -1/-1/-1->1->0 [8] -1/-1/-1->1->0 [9] 0/-1/-1->1->-1 [10] 0/-1/-1->1->-1 [11] 0/-1/-1->1->-1 [12] -1/-1/-1->1->0 [13] -1/-1/-1->1->0 [14] -1/-1/-1->1->0 [15] 0/-1/-1->1->-1 [16] 0/-1/-1->1->-1 [17] 0/-1/-1->1->-1 [18] -1/-1/-1->1->0 [19] -1/-1/-1->1->0 [20] -1/-1/-1->1->0 [21] 0/-1/-1->1->-1 [22] 0/-1/-1->1->-1 [23] 0/-1/-1->1->-1 comm 0xbf44640 nRanks 02 busId 34000
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO P2P Chunksize set to 524288
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO comm 0xa966300 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 00/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 01/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 02/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 03/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 04/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 05/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 06/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 07/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 08/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 09/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 10/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 11/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 12/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 13/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 14/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 15/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 16/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 17/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 18/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 19/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 20/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 21/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 22/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 23/24 : 0 1
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] -1/-1/-1->0->1 [4] -1/-1/-1->0->1 [5] -1/-1/-1->0->1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] -1/-1/-1->0->1 [10] -1/-1/-1->0->1 [11] -1/-1/-1->0->1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] -1/-1/-1->0->1 [16] -1/-1/-1->0->1 [17] -1/-1/-1->0->1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] -1/-1/-1->0->1 [22] -1/-1/-1->0->1 [23] -1/-1/-1->0->1 comm 0xa966300 nRanks 02 busId 31000
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO P2P Chunksize set to 524288
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 00/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 01/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 00/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 02/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 03/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 01/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 04/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 02/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 05/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 03/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 06/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 04/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 07/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 08/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 05/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 09/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 06/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 07/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 10/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 08/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 11/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 09/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 12/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 10/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 13/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 11/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 14/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 12/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 15/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 13/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 16/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 14/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 17/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 15/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 18/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 16/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 19/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 17/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 20/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 18/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 21/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 19/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 22/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Channel 23/0 : 1[34000] -> 0[31000] via P2P/IPC comm 0xbf44640 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 20/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 21/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 22/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Channel 23/0 : 0[31000] -> 1[34000] via P2P/IPC comm 0xa966300 nRanks 02
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Connected all rings comm 0xbf44640 nRanks 02 busId 34000
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO Connected all trees
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 256 | 256
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 8 p2p channels per peer
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Connected all rings comm 0xa966300 nRanks 02 busId 31000
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO Connected all trees
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 256 | 256
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 8 p2p channels per peer
nscale-perf-cluster-compute-top500-15:142875:142930 [1] NCCL INFO comm 0xbf44640 rank 1 nranks 2 cudaDev 1 busId 34000 commId 0xfe20596e81efe06c localSize 296 used 469861984 bytes - Init COMPLETE
nscale-perf-cluster-compute-top500-15:142876:142929 [0] NCCL INFO comm 0xa966300 rank 0 nranks 2 cudaDev 0 busId 31000 commId 0xfe20596e81efe06c localSize 296 used 469861984 bytes - Init COMPLETE
broadcast result: tensor(618.5157, device='cuda:0')
broadcast result: tensor(618.5157, device='cuda:1')
allreduce result: tensor(1237.0315, device='cuda:0')
allreduce result: tensor(1237.0315, device='cuda:1')
nscale-perf-cluster-compute-top500-15:142875:142932 [1] NCCL INFO [Service thread] Connection closed by localRank 1
nscale-perf-cluster-compute-top500-15:142876:142931 [0] NCCL INFO [Service thread] Connection closed by localRank 0
nscale-perf-cluster-compute-top500-15:142875:142937 [1] NCCL INFO comm 0xbf44640 rank 1 nranks 2 cudaDev 1 busId 34000 - Abort COMPLETE
nscale-perf-cluster-compute-top500-15:142876:142938 [0] NCCL INFO comm 0xa966300 rank 0 nranks 2 cudaDev 0 busId 31000 - Abort COMPLETE
</pre>