I am using torch version 2.0.1+cu117 and lightning version 2.0.7, with device cuda:0 and 8 available GPUs.
I am attempting to learn multi-GPU training on a single machine using DDP and PyTorch Lightning. To be precise, I am implementing the SimCLR paper.
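For reference, this is roughly how I'm checking the environment (a minimal sketch, nothing unusual):

import torch
import lightning

print(torch.__version__)            # 2.0.1+cu117
print(lightning.__version__)        # 2.0.7
print(torch.cuda.device_count())    # 8
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")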
The code is:

import numpy as np
import torch
from torch import nn, optim, utils
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision import transforms
from torchvision.models.resnet import ResNet, Bottleneck
import lightning.pytorch as pl


def training_transformations(
    jitter_strength = 1.0, inp_height = 96,
    gaussian_blur = False
):
    # Train data transformations-
    # brightness, contrast, saturation and hue-
    color_jitter = transforms.ColorJitter(
        brightness = 0.8 * jitter_strength,
        contrast = 0.8 * jitter_strength,
        saturation = 0.8 * jitter_strength,
        hue = 0.2 * jitter_strength
    )
    data_transforms = [
        transforms.ToPILImage(),
        transforms.RandomResizedCrop(size = inp_height),
        transforms.RandomHorizontalFlip(p = 0.5),
        transforms.RandomApply([color_jitter], p = 0.8),
        transforms.RandomGrayscale(p = 0.2)
    ]
    if gaussian_blur:
        # int(0.1 * 96) = 9, an odd kernel size, as GaussianBlur requires-
        data_transforms.append(transforms.GaussianBlur(kernel_size = int(0.1 * inp_height), sigma = (0.1, 2.0)))
    data_transforms.append(transforms.ToTensor())
    train_transform = transforms.Compose(data_transforms)
    return train_transform
class STL10UnlabeledDataset(Dataset):
    '''
    Dataset for the unlabeled split of STL-10.
    '''
    def __init__(self, path_to_data, transform = None):
        super().__init__()
        self.transform = transform
        # Read unlabeled data from the .bin file-
        self.data = np.fromfile(path_to_data + "unlabeled_X.bin", dtype = np.uint8)
        # Reshape images as (C, H, W)-
        self.data = np.reshape(self.data, (-1, 3, 96, 96))
        # Rotate each image by 90 degrees three times (i.e. 270 degrees)
        # in the (H, W) plane-
        self.data = np.rot90(m = self.data, k = 3, axes = (2, 3))
        # Move to (H, W, C) so that ToPILImage() accepts it-
        self.data = np.transpose(a = self.data, axes = (0, 2, 3, 1))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image = self.data[idx].copy()
        if self.transform is not None:
            # Two independently augmented views of the same image-
            img1 = self.transform(image)
            img2 = self.transform(image)
            return img1, img2
        return image, image
train_transform = training_transformations(
    jitter_strength = 1.0, inp_height = 96,
    gaussian_blur = True
)

# Create training data-
unlabeled_data = STL10UnlabeledDataset(
    path_to_data = path_to_stl10, transform = train_transform
)
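# Quick sanity check of the two augmented views (illustrative; shapes follow
# from RandomResizedCrop(size = 96) and ToTensor() above)-
img1, img2 = unlabeled_data[0]
print(img1.shape, img2.shape)    # torch.Size([3, 96, 96]) each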
batch_size = 1024

# Create train loader-
train_loader = utils.data.DataLoader(
    dataset = unlabeled_data, batch_size = batch_size,
    shuffle = False, num_workers = 1,
    pin_memory = True
)
def nt_xent_loss(x, temperature = 0.05):
    assert len(x.size()) == 2
    # Pairwise cosine similarity-
    xcs = F.cosine_similarity(x[None, :, :], x[:, None, :], dim = -1)
    # Mask out self-similarity on the diagonal (keep the mask on x's device)-
    xcs[torch.eye(x.size(0), dtype = torch.bool, device = x.device)] = float("-inf")
    # Ground-truth labels: rows are interleaved so that rows (2i, 2i + 1)
    # form a positive pair-
    target = torch.arange(x.size(0), device = x.device)
    target[0::2] += 1
    target[1::2] -= 1
    # Standard cross-entropy loss-
    return F.cross_entropy(xcs / temperature, target, reduction = "mean")
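
# Toy check of the interleaved-pair convention the loss assumes (illustrative)-
z = torch.randn(8, 128)
print(nt_xent_loss(z, temperature = 0.05))    # scalar tensor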
# Initialize ResNet-50 (Bottleneck) CNN encoder-
backbone_enc = ResNet(Bottleneck, [3, 4, 6, 3])
# Drop the 1000-way classification head so the encoder returns the 2048-d
# pooled features that the projection head below expects-
backbone_enc.fc = nn.Identity()
# Define LightningModule-
class SimCLR_ResNet50(pl.LightningModule):
    def __init__(
        self, backbone_enc,
        temperature = 0.05
    ):
        super().__init__()
        self.backbone_enc = backbone_enc
        self.projection_head = nn.Linear(in_features = 2048, out_features = 128, bias = True)
        self.temperature = temperature

    '''
    def validation_step(self, batch, batch_idx):
        # Validation loop.
        x_t, y_t = batch
        x_t = x_t.view(x_t.size(0), -1)
        z_t = self.encoder(x_t)
        pred_t = self.decoder(z_t)
        loss_t = nn.functional.mse_loss(pred_t, x_t)
        self.log('test_loss', loss_t, sync_dist = True)
        return loss_t
    '''

    def training_step(self, batch, batch_idx):
        # training_step() defines the training loop.
        # It's independent of forward().
        x1, x2 = batch
        h1 = self.backbone_enc(x1)
        h2 = self.backbone_enc(x2)
        z1 = F.relu(self.projection_head(h1))
        z2 = F.relu(self.projection_head(h2))
        # Interleave the two views so that rows (2i, 2i + 1) are positive
        # pairs, matching the convention in nt_xent_loss()-
        x = torch.stack((z1, z2), dim = 1).flatten(0, 1)
        loss = nt_xent_loss(x, temperature = self.temperature)
        # Log to TensorBoard (if installed) by default-
        self.log('train_ntxent_loss', loss, sync_dist = True)
        return loss

    def configure_optimizers(self):
        # optimizer = optim.Adam(params = self.parameters(), lr = 1e-3)
        # Specify parameters to optimize (use the module's own attributes,
        # not the globals)-
        optimizer = optim.Adam(
            params = list(self.backbone_enc.parameters()) + list(self.projection_head.parameters()),
            lr = 1e-3
        )
        return optimizer
# Initialize SimCLR model-
model = SimCLR_ResNet50(backbone_enc = backbone_enc, temperature = 0.05)
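# Optional one-step smoke test on CPU before launching DDP (illustrative;
# with no Trainer attached, self.log() should only emit a warning)-
with torch.no_grad():
    dummy = torch.randn(4, 3, 96, 96)
    print(model.training_step((dummy, dummy.clone()), 0))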
# Train the model.
# The Lightning Trainer "mixes" any LightningModule with any dataset and
# abstracts away all the engineering complexity needed for scale.
trainer = pl.Trainer(
    limit_train_batches = 1.0,
    # limit_val_batches = 1.0,
    max_epochs = 20,
    accelerator = 'gpu',
    # devices = 2,
    # devices = [0, 1, 2, 3, 4, 5, 6, 7]
    devices = "auto"
)

trainer.fit(
    model = model, train_dataloaders = train_loader,
    # val_dataloaders = test_loader
)
which gives the error:
ProcessRaisedException                    Traceback (most recent call last)
Cell In[14], line 1
----> 1 trainer.fit(
      2     model = model, train_dataloaders = train_loader,
      3     # val_dataloaders = test_loader
      4 )

File ~/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:532, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    530 self.strategy._lightning_module = model
    531 _verify_strategy_supports_compile(model, self.strategy)
--> 532 call._call_and_handle_interrupt(
    533     self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    534 )

File ~/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:42, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     40 try:
     41     if trainer.strategy.launcher is not None:
---> 42         return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
     43     return trainer_fn(*args, **kwargs)
     45 except _TunerExitException:

File ~/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py:126, in _MultiProcessingLauncher.launch(self, function, trainer, *args, **kwargs)
    118 process_context = mp.start_processes(
    119     self._wrapping_function,
    120     args=process_args,
        (...)
    123     join=False,  # we will join ourselves to get the process references
    124 )
    125 self.procs = process_context.processes
--> 126 while not process_context.join():
    127     pass
    129 worker_output = return_queue.get()

File ~/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/multiprocessing/spawn.py:160, in ProcessContext.join(self, timeout)
    158 msg = "\n\n-- Process %d terminated with the following error:\n" % error_index
    159 msg += original_trace
--> 160 raise ProcessRaisedException(msg, error_index, failed_process.pid)

ProcessRaisedException:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py", line 149, in _wrapping_function
    results = function(*args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 571, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 938, in _run
    self.strategy.setup_environment()
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py", line 144, in setup_environment
    super().setup_environment()
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py", line 129, in setup_environment
    self.accelerator.setup_device(self.root_device)
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/accelerators/cuda.py", line 44, in setup_device
    _check_cuda_matmul_precision(device)
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/fabric/accelerators/cuda.py", line 349, in _check_cuda_matmul_precision
    major, _ = torch.cuda.get_device_capability(device)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/cuda/__init__.py", line 381, in get_device_capability
    prop = get_device_properties(device)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/cuda/__init__.py", line 395, in get_device_properties
    _lazy_init()  # will define _get_device_properties
    ^^^^^^^^^^^^
  File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/cuda/__init__.py", line 235, in _lazy_init
    raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
What am I doing wrong?