Pytorch distributed

(https://pytorch.org/tutorials/intermediate/dist_tuto.html)

Pytorch has several supports for distributed version which is similar to MPI. Main code would be:

Main code

"""run.py:"""

#!/usr/bin/env python

import os

import torch

import torch.distributed as dist

import torch.multiprocessing as mp

def run(rank, size):

""" Distributed function to be implemented later. """

pass

def init_process(rank, size, fn, backend='gloo'):

""" Initialize the distributed environment. """

os.environ['MASTER_ADDR'] = '127.0.0.1'

os.environ['MASTER_PORT'] = '29500'

dist.init_process_group(backend, rank=rank, world_size=size)

fn(rank, size)

if __name__ == "__main__":

size = 2

processes = []

mp.set_start_method("spawn")

for rank in range(size):

p = mp.Process(target=init_process, args=(rank, size, run))

p.start()

processes.append(p)

for p in processes:

p.join()

-point 2 point send/recv

"""Blocking point-to-point communication."""

def run(rank, size):

tensor = torch.zeros(1)

if rank == 0:

tensor += 1

# Send the tensor to process 1

dist.send(tensor=tensor, dst=1)

else:

# Receive tensor from process 0

dist.recv(tensor=tensor, src=0)

print('Rank ', rank, ' has data ', tensor[0])

-nonblocking send/recv

-Non-blocking point-to-point communication

"""Non-blocking point-to-point communication."""

def run(rank, size):

tensor = torch.zeros(1)

req = None

if rank == 0:

tensor += 1

# Send the tensor to process 1

req = dist.isend(tensor=tensor, dst=1)

print('Rank 0 started sending')

else:

# Receive tensor from process 0

req = dist.irecv(tensor=tensor, src=0)

print('Rank 1 started receiving')

req.wait()

print('Rank ', rank, ' has data ', tensor[0])

-collective communication: gather/scatter/allgather/reduce/allreduce

""" All-Reduce example."""

def run(rank, size):

""" Simple collective communication. """

group = dist.new_group([0, 1])

tensor = torch.ones(1)

dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)

print('Rank ', rank, ' has data ', tensor[0])

all reduce operations are

-dist.reduce_op.SUM,

-dist.reduce_op.PRODUCT,

-dist.reduce_op.MAX,

-dist.reduce_op.MIN.

all collective operators are

dist.broadcast(tensor, src, group): Copies tensor from src to all other processes.
dist.reduce(tensor, dst, op, group): Applies op to all tensor and stores the result in dst.
dist.all_reduce(tensor, op, group): Same as reduce, but the result is stored in all processes.
dist.scatter(tensor, src, scatter_list, group): Copies the ith tensor scatter_list[i] to the ith process.
dist.gather(tensor, dst, gather_list, group): Copies tensor from all processes in dst.
dist.all_gather(tensor_list, tensor, group): Copies tensor from all processes to tensor_list, on all processes.
dist.barrier(group): block all processes in group until each one has entered this function.

Backend can be:

-TCP

-Gloo

-MPI

Follow the tutorials on distributed training

https://github.com/seba-1511/dist_tuto.pth/

Key idea:

-partition data set:

usage: train_set, batchsz = partition_dataset()

""" Partitioning MNIST """

def partition_dataset():

dataset = datasets.MNIST('./data', train=True, download=True,

transform=transforms.Compose([

transforms.ToTensor(),

transforms.Normalize((0.1307,), (0.3081,))

]))

size = dist.get_world_size()

bsz = 128 / float(size)

partition_sizes = [1.0 / size for _ in range(size)]

partition = DataPartitioner(dataset, partition_sizes)

partition = partition.use(dist.get_rank())

train_set = torch.utils.data.DataLoader(partition,

batch_size=bsz,

shuffle=True)

return train_set, bsz

""" Dataset partitioning helper """

class Partition(object):

def __init__(self, data, index):

self.data = data

self.index = index

def __len__(self):

return len(self.index)

def __getitem__(self, index):

data_idx = self.index[index]

return self.data[data_idx]

class DataPartitioner(object):

def __init__(self, data, sizes=[0.7, 0.2, 0.1], seed=1234):

self.data = data

self.partitions = []

rng = Random()

rng.seed(seed)

data_len = len(data)

indexes = [x for x in range(0, data_len)]

rng.shuffle(indexes)

for frac in sizes:

part_len = int(frac * data_len)

self.partitions.append(indexes[0:part_len])

indexes = indexes[part_len:]

def use(self, partition):

return Partition(self.data, self.partitions[partition])

-run method is also distributed:

""" Distributed Synchronous SGD Example """

def run(rank, size):

torch.manual_seed(1234)

train_set, bsz = partition_dataset()

model = Net()

optimizer = optim.SGD(model.parameters(),

lr=0.01, momentum=0.5)

num_batches = ceil(len(train_set.dataset) / float(bsz))

for epoch in range(10):

epoch_loss = 0.0

for data, target in train_set:

optimizer.zero_grad()

output = model(data)

loss = F.nll_loss(output, target)

epoch_loss += loss.item()

loss.backward()

average_gradients(model)

optimizer.step()

print('Rank ', dist.get_rank(), ', epoch ',

epoch, ': ', epoch_loss / num_batches)

-Reduce the error using allreduce

""" Gradient averaging. """

def average_gradients(model):

size = float(dist.get_world_size())

for param in model.parameters():

dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)

param.grad.data /= size

and implement your own reduce: (here is ring)

""" Implementation of a ring-reduce with addition. """

def allreduce(send, recv):

rank = dist.get_rank()

size = dist.get_world_size()

send_buff = send.clone()

recv_buff = send.clone()

accum = send.clone()

left = ((rank - 1) + size) % size

right = (rank + 1) % size

for i in range(size - 1):

if i % 2 == 0:

# Send send_buff

send_req = dist.isend(send_buff, right)

dist.recv(recv_buff, left)

accum[:] += recv_buff[:]

else:

# Send recv_buff

send_req = dist.isend(recv_buff, right)

dist.recv(send_buff, left)

accum[:] += send_buff[:]

send_req.wait()

recv[:] = accum[:]

-Try the following distributed pytorch example.

Also the use of tensorboard along with pytorch training.

-Also consider distributed training.

dist_training_sample.ipynb shows the use of reduced gradient. However, it does not yet partition data and distributed training on different portion of data. Please consider the example in dist_training.ipynb showing the distributed data and modify dist_training_sample.ipynb so that for each process training on its own data and use average_gradient for each epoch.