# Gist by @sparticlesteve, last active January 7, 2020.
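# Minimal test of torch.distributed collectives (broadcast, allreduce) over the
# MPI backend, one process per GPU.
#
# Usage sketch, assuming an MPI-enabled PyTorch build, a CUDA-aware MPI
# implementation (needed for GPU tensors over the MPI backend), 8 GPUs per
# node, and an arbitrary file name test_dist.py:
#
#   mpirun -n 8 python test_dist.py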
import torch
import torch.distributed as dist
# Configuration
ranks_per_node = 8
shape = 2**16 # fails if 2**17
dtype = torch.float32
# Initialize the process group with the MPI backend and map this rank to a GPU
dist.init_process_group(backend='mpi')
rank, n_ranks = dist.get_rank(), dist.get_world_size()
local_rank = rank % ranks_per_node
device = torch.device('cuda', local_rank)
print('MPI rank', rank, 'size', n_ranks, 'device', device)
# Allocate a random tensor on this rank's GPU
x = torch.randn(shape, dtype=dtype).to(device)
print('local result:', x.sum())
# Do a broadcast from rank 0
dist.broadcast(x, 0)
print('broadcast result:', x.sum())
# Do an allreduce (the default reduction op is SUM)
dist.all_reduce(x)
print('allreduce result:', x.sum())
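# Expected relationship between the printed values, assuming the default SUM op:
# after the broadcast every rank holds rank 0's tensor, so on every rank the
# allreduce result should be n_ranks times the broadcast result.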