Short MPI Tests using PyTorch example code
[antony@gpu5 pytorch-nccl-test]$ cat run_dp_mpi.sh
#!/bin/bash
# mpirun --mca pml ucx -np 8 -H gpu0:4,gpu5:4 ./run_dp_mpi.sh

source ~/dp-build/torch-rocm61/bin/activate

export MASTER_ADDR=172.16.16.42
export MASTER_PORT=25252
export NCCL_SOCKET_IFNAME=enp37s0np0,enp12s0np0,enp180s0np0,enp139s0np0
#export NCCL_DEBUG=INFO
export NCCL_IB_GID_INDEX=3
export NCCL_NET_GDR_LEVEL=1

python3.11 /home/antony/dp-build/pytorch-nccl-test/dp_pytorch_mpi.py
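The commented-out mpirun line at the top of the script shows how it is meant to be launched: 8 ranks in total, 4 on each of gpu0 and gpu5, over the UCX PML. MASTER_ADDR and MASTER_PORT are consumed by PyTorch's default env:// rendezvous in init_process_group, while each rank's identity comes from the OMPI_COMM_WORLD_* variables that mpirun itself exports (read at the top of the Python file below).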
Then the dp_pytorch_mpi.py file itself:
[antony@gpu5 pytorch-nccl-test]$ cat /home/antony/dp-build/pytorch-nccl-test/dp_pytorch_mpi.py
import os
import argparse

import torch
import torch.distributed as dist

# Environment variables set by mpirun
LOCAL_RANK = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
WORLD_SIZE = int(os.environ['OMPI_COMM_WORLD_SIZE'])
WORLD_RANK = int(os.environ['OMPI_COMM_WORLD_RANK'])


def run(backend):
    tensor = torch.zeros(1)
    # Need to put tensor on a GPU device for nccl backend
    if backend == 'nccl':
        device = torch.device("cuda:{}".format(LOCAL_RANK))
        tensor = tensor.to(device)

    if WORLD_RANK == 0:
        for rank_recv in range(1, WORLD_SIZE):
            dist.send(tensor=tensor, dst=rank_recv)
            print('worker_{} sent data to Rank {}\n'.format(0, rank_recv))
    else:
        dist.recv(tensor=tensor, src=0)
        print('worker_{} has received data from rank {}\n'.format(WORLD_RANK, 0))


def init_processes(backend):
    dist.init_process_group(backend, rank=WORLD_RANK, world_size=WORLD_SIZE)
    run(backend)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int,
                        help="Local rank. Necessary for using the torch.distributed.launch utility.")
    parser.add_argument("--backend", type=str, default="nccl", choices=['nccl', 'gloo'])
    args = parser.parse_args()

    init_processes(backend=args.backend)
    dist.barrier()
    dist.destroy_process_group()
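A natural follow-on check (not part of the run above) is a quick all_reduce, which exercises the collective path rather than point-to-point send/recv. Below is a minimal sketch under the same assumptions as the test above (launched with mpirun so the OMPI_COMM_WORLD_* variables are set, and MASTER_ADDR/MASTER_PORT exported as in run_dp_mpi.sh); the file name dp_pytorch_allreduce.py is only a placeholder.

import os

import torch
import torch.distributed as dist

# Rank information from the same OMPI variables that mpirun exports
LOCAL_RANK = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
WORLD_SIZE = int(os.environ['OMPI_COMM_WORLD_SIZE'])
WORLD_RANK = int(os.environ['OMPI_COMM_WORLD_RANK'])

if __name__ == "__main__":
    dist.init_process_group("nccl", rank=WORLD_RANK, world_size=WORLD_SIZE)
    torch.cuda.set_device(LOCAL_RANK)  # make this rank's GPU the default device

    # Each rank contributes its own rank number; after the all_reduce every
    # rank should hold the sum 0 + 1 + ... + (WORLD_SIZE - 1).
    tensor = torch.tensor([float(WORLD_RANK)], device="cuda:{}".format(LOCAL_RANK))
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

    expected = WORLD_SIZE * (WORLD_SIZE - 1) / 2
    print('worker_{} all_reduce result {} (expected {})'.format(WORLD_RANK, tensor.item(), expected))

    dist.barrier()
    dist.destroy_process_group()

With the 8-rank launch used above, every worker should report 28.0 (the sum of ranks 0 through 7).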