Short MPI Tests using Pytorch example code
<pre>
[antony@gpu5 pytorch-nccl-test]$ cat run_dp_mpi.sh
#!/bin/bash
# mpirun --mca pml ucx -np 8 -H gpu0:4,gpu5:4 ./run_dp_mpi.sh

source ~/dp-build/torch-rocm61/bin/activate

export MASTER_ADDR=172.16.16.42
export MASTER_PORT=25252
export NCCL_SOCKET_IFNAME=enp37s0np0,enp12s0np0,enp180s0np0,enp139s0np0
#export NCCL_DEBUG=INFO
export NCCL_IB_GID_INDEX=3
export NCCL_NET_GDR_LEVEL=1

python3.11 /home/antony/dp-build/pytorch-nccl-test/dp_pytorch_mpi.py
</pre>
Then the dp_pytorch_mpi.py file
<pre>
[antony@gpu5 pytorch-nccl-test]$ cat /home/antony/dp-build/pytorch-nccl-test/dp_pytorch_mpi.py
import os
import argparse

import torch
import torch.distributed as dist

# Environment variables set by mpirun (Open MPI)
LOCAL_RANK = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
WORLD_SIZE = int(os.environ['OMPI_COMM_WORLD_SIZE'])
WORLD_RANK = int(os.environ['OMPI_COMM_WORLD_RANK'])

def run(backend):
    tensor = torch.zeros(1)
    # Need to put the tensor on a GPU device for the nccl backend
    if backend == 'nccl':
        device = torch.device("cuda:{}".format(LOCAL_RANK))
        tensor = tensor.to(device)

    if WORLD_RANK == 0:
        # Rank 0 sends the tensor to every other rank
        for rank_recv in range(1, WORLD_SIZE):
            dist.send(tensor=tensor, dst=rank_recv)
            print('worker_{} sent data to Rank {}\n'.format(0, rank_recv))
    else:
        dist.recv(tensor=tensor, src=0)
        print('worker_{} has received data from rank {}\n'.format(WORLD_RANK, 0))

def init_processes(backend):
    # With no init_method given, the default env:// rendezvous is used,
    # reading MASTER_ADDR and MASTER_PORT exported by run_dp_mpi.sh
    dist.init_process_group(backend, rank=WORLD_RANK, world_size=WORLD_SIZE)
    run(backend)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int,
                        help="Local rank. Necessary for using the torch.distributed.launch utility.")
    parser.add_argument("--backend", type=str, default="nccl", choices=['nccl', 'gloo'])
    args = parser.parse_args()

    init_processes(backend=args.backend)

    dist.barrier()
    dist.destroy_process_group()
</pre>
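
The point-to-point test above only exercises send/recv. As a quick follow-on check of the same launch setup, the sketch below (a minimal illustration, not one of the original test files; the name allreduce_check.py is hypothetical) reuses the MASTER_ADDR/MASTER_PORT rendezvous exported by run_dp_mpi.sh and the OMPI_COMM_WORLD_* variables set by mpirun to run a single deterministic all-reduce.
<pre>
# allreduce_check.py -- hypothetical sketch, launched the same way as dp_pytorch_mpi.py
import os

import torch
import torch.distributed as dist

# Environment variables set by mpirun (Open MPI)
WORLD_RANK = int(os.environ['OMPI_COMM_WORLD_RANK'])
WORLD_SIZE = int(os.environ['OMPI_COMM_WORLD_SIZE'])
LOCAL_RANK = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])

# env:// rendezvous reads MASTER_ADDR and MASTER_PORT from run_dp_mpi.sh
dist.init_process_group(backend='nccl', init_method='env://',
                        rank=WORLD_RANK, world_size=WORLD_SIZE)

torch.cuda.set_device(LOCAL_RANK)
device = torch.device('cuda', LOCAL_RANK)

# Every rank contributes ones; after the sum all-reduce each element equals WORLD_SIZE
x = torch.ones(1024, device=device)
dist.all_reduce(x)
assert torch.allclose(x, torch.full_like(x, float(WORLD_SIZE)))
print('rank', WORLD_RANK, 'all_reduce OK, value', x[0].item())

dist.barrier()
dist.destroy_process_group()
</pre>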
== Another Example ==
Run the job using
<pre>
(torch-rocm61) [antony@gpu0 pytorch-nccl-test]$ /home/shared/apps/openmpi/5.0.3/bin/mpirun -x SCRATCH=55 -x NCCL_DEBUG=INFO -np 16 -H gpu0:8,gpu3:8 run_dp.sh
</pre>
Wrapper script
<pre>
[antony@lighton-login pytorch-nccl-test]$ cat run_dp.sh
#!/bin/bash

source ~/dp-build/torch-rocm61/bin/activate
python3.11 /home/antony/dp-build/pytorch-nccl-test/test_nccl_dp.py
</pre>
With the following file
<pre>
# cat test_nccl_dp.py
import os

import torch
import torch.distributed as dist

def init_workers_nccl_file():
    rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
    n_ranks = int(os.environ['OMPI_COMM_WORLD_SIZE'])
    sync_file_dir = '%s/tmp' % os.environ['SCRATCH']
    os.makedirs(sync_file_dir, exist_ok=True)
    # Shared file used for the file:// rendezvous; the world size is part of the name
    sync_file = 'file://%s/pytorch_sync_%s' % (
        sync_file_dir, os.environ['OMPI_COMM_WORLD_SIZE'])
    dist.init_process_group(backend='nccl', world_size=n_ranks, rank=rank,
                            init_method=sync_file)
    return rank, n_ranks

# Print pytorch version
print('Pytorch version', torch.__version__)

# Configuration
ranks_per_node = 8
shape = 2**17
dtype = torch.float32

# Initialize the process group
rank, n_ranks = init_workers_nccl_file()
local_rank = rank % ranks_per_node

# Allocate a small tensor on every gpu from every rank.
# This is an attempt to force creation of all device contexts.
for i in range(ranks_per_node):
    _ = torch.randn(1).to(torch.device('cuda', i))

# Select our gpu
device = torch.device('cuda', local_rank)
print('Rank', rank, 'size', n_ranks, 'device', device, 'count', torch.cuda.device_count())

print('Torch NCCL version', torch.cuda.nccl.version())

# Allocate a tensor on the gpu
x = torch.randn(shape, dtype=dtype).to(device)
print('local result:', x.sum())

# Do a broadcast from rank 0
dist.broadcast(x, 0)
print('broadcast result:', x.sum())

# Do an all-reduce
dist.all_reduce(x)
print('allreduce result:', x.sum())

# close out
dist.barrier()
dist.destroy_process_group()
</pre>
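
The sums printed by test_nccl_dp.py come from random data, so the values change from run to run and only show that the collectives completed. As an optional extension (a sketch, not part of the original file), the block below could be pasted into test_nccl_dp.py just before the close-out section to time the all-reduce; it assumes dist is already initialized and that device, shape, dtype and n_ranks are defined as in the script above.
<pre>
# Optional timing sketch -- assumes it runs inside test_nccl_dp.py after
# init_workers_nccl_file(), so dist, device, shape, dtype and n_ranks exist.
import time

x = torch.randn(shape, dtype=dtype, device=device)

# Warm-up iterations so NCCL communicator setup is not included in the timing
for _ in range(5):
    dist.all_reduce(x)
torch.cuda.synchronize(device)

n_iters = 20
start = time.time()
for _ in range(n_iters):
    dist.all_reduce(x)
torch.cuda.synchronize(device)
elapsed = (time.time() - start) / n_iters

# Ring all-reduce estimate: roughly 2*(n-1)/n of the buffer crosses each rank's links
bytes_per_rank = 2 * (n_ranks - 1) / n_ranks * x.numel() * x.element_size()
print('allreduce avg %.6f s, ~%.2f GB/s per rank' % (elapsed, bytes_per_rank / elapsed / 1e9))
</pre>
The 2*(n-1)/n factor follows the usual bus-bandwidth convention used by the NCCL performance tests, so the reported number can be compared loosely against nccl-tests output on the same nodes.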