My code works fine with distributed_backend='dp', but it fails with the following error when I use distributed_backend='ddp':
Traceback (most recent call last):
File "/scratch/nvarshn2/explore/test_ddp.py", line 89, in <module>
trainer.fit(model, train_data, val_data)
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 510, in fit
results = self.accelerator_backend.train()
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/pytorch_lightning/accelerators/ddp_accelerator.py", line 158, in train
results = self.ddp_train(process_idx=self.task_idx, model=model)
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/pytorch_lightning/accelerators/ddp_accelerator.py", line 307, in ddp_train
results = self.train_or_test()
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
results = self.trainer.train()
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in train
self.train_loop.run_training_epoch()
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py", line 541, in run_training_epoch
for batch_idx, (batch, is_last_batch) in train_dataloader:
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/pytorch_lightning/profiler/profilers.py", line 85, in profile_iterable
value = next(iterator)
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 45, in _with_is_last
it = iter(iterable)
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 352, in __iter__
return self._get_iterator()
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 294, in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 801, in __init__
w.start()
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/multiprocessing/process.py", line 105, in start
self._popen = self._Popen(self)
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/multiprocessing/context.py", line 223, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/home/nvarshn2/.conda/envs/pytorch_lightning_with_deepseed_env/lib/python3.6/multiprocessing/popen_fork.py", line 66, in _launch
self.pid = os.fork()
OSError: [Errno 12] Cannot allocate memory
Code:
import os
import torch
from torch.utils.data import Dataset
from pytorch_lightning import LightningModule, Trainer


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def loss(self, batch, prediction):
        # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls
        return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction))

    def step(self, x):
        x = self.layer(x)
        out = torch.nn.functional.mse_loss(x, torch.ones_like(x))
        return out

    def training_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        return {"loss": loss}

    def training_step_end(self, training_step_outputs):
        return training_step_outputs

    def training_epoch_end(self, outputs) -> None:
        torch.stack([x["loss"] for x in outputs]).mean()

    def validation_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        return {"x": loss}

    def validation_epoch_end(self, outputs) -> None:
        torch.stack([x["x"] for x in outputs]).mean()

    def test_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        return {"y": loss}

    def test_epoch_end(self, outputs) -> None:
        torch.stack([x["y"] for x in outputs]).mean()

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [lr_scheduler]


if __name__ == '__main__':
    train_data = torch.utils.data.DataLoader(RandomDataset(32, 64), num_workers=8)
    val_data = torch.utils.data.DataLoader(RandomDataset(32, 64), num_workers=8)

    model = BoringModel()

    trainer = Trainer(
        limit_train_batches=1,
        limit_val_batches=1,
        max_epochs=1,
        gpus=-1,
        distributed_backend="ddp",
    )

    trainer.fit(model, train_data, val_data)
Note: I am using 4 GPUs on a single machine.
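For comparison, the only thing I change between the run that works and the run that fails is the backend flag passed to the Trainer. A minimal sketch of the 'dp' variant (everything else in the script above stays the same):

from pytorch_lightning import Trainer

# Same Trainer settings as in the script above; only the backend flag differs.
trainer = Trainer(
    limit_train_batches=1,
    limit_val_batches=1,
    max_epochs=1,
    gpus=-1,
    distributed_backend="dp",  # with 'dp' the same dataloaders (num_workers=8) start up fine
)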
What could be the reason behind this?