CUDA multiprocessing asks to use the "spawn" start method

I am using torch version 2.0.1+cu117 and lightning version 2.0.7, with device cuda:0 and 8 available GPUs.

I am attempting to learn multi-GPU training on a single machine using DDP and Lightning. To be precise, I am implementing the SimCLR paper.

The code is:

import numpy as np
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
# Assuming torchvision's ResNet building blocks for the backbone-
from torchvision.models.resnet import ResNet, Bottleneck
import lightning.pytorch as pl


def training_transformations(
    jitter_strength = 1.0, inp_height = 96,
    gaussian_blur = False
):
    # Train data transformations-

    # brightness, contrast, saturation and hue-
    color_jitter = transforms.ColorJitter(
        brightness = 0.8 * jitter_strength,
        contrast = 0.8 * jitter_strength,
        saturation = 0.8 * jitter_strength,
        hue = 0.2 * jitter_strength
    )

    data_transforms = [
        transforms.ToPILImage(),
        transforms.RandomResizedCrop(size = inp_height),
        transforms.RandomHorizontalFlip(p = 0.5),
        transforms.RandomApply([color_jitter], p = 0.8),
        transforms.RandomGrayscale(p = 0.2)
    ]

    if gaussian_blur:
        data_transforms.append(transforms.GaussianBlur(kernel_size = int(0.1 * inp_height), sigma = (0.1, 2.0)))

    data_transforms.append(transforms.ToTensor())
    train_transform = transforms.Compose(data_transforms)

    return train_transform


class STL10UnlabeledDataset(Dataset):
    '''
    STL-10 unlabeled split wrapped as a PyTorch Dataset.
    '''
    def __init__(self, path_to_data, transform = None):
        super().__init__()
        
        self.transform = transform
        
        # Read unlabeled data as .bin file-
        self.data = np.fromfile(path_to_data + "unlabeled_X.bin", dtype = np.uint8)
        
        # Reshape images as (C, H, W)-
        self.data = np.reshape(self.data, (-1, 3, 96, 96))
        
        # Rotate array by 90 degrees in the plane specified by axes-
        self.data = np.rot90(m = self.data, k = 3, axes = (2, 3))
        self.data = np.transpose(a = self.data, axes = (0, 2, 3, 1))
        
        
    def __len__(self):
        return len(self.data)
    
    
    def __getitem__(self, idx):
        image = self.data[idx].copy()

        if self.transform is not None:
            # Two independently augmented views of the same image-
            img1 = self.transform(image)
            img2 = self.transform(image)

            return img1, img2

        # Fall back to returning the raw image twice if no transform is given-
        return image, image

train_transform = training_transformations(
    jitter_strength = 1.0, inp_height = 96,
    gaussian_blur = True
)
    
# Create training data-
unlabeled_data = STL10UnlabeledDataset(
    path_to_data = path_to_stl10, transform = train_transform
)

batch_size = 1024
# Create train loader-
train_loader = DataLoader(
    dataset = unlabeled_data, batch_size = batch_size,
    shuffle = False, num_workers = 1,
    pin_memory = True
)


def nt_xent_loss(x, temperature = 0.05):
    assert len(x.size()) == 2

    # Pairwise cosine similarity between all projections-
    xcs = F.cosine_similarity(x[None, :, :], x[:, None, :], dim = -1)
    # Mask out self-similarity on the diagonal (build the mask on the same device as x)-
    xcs[torch.eye(x.size(0), dtype = torch.bool, device = x.device)] = float("-inf")

    # Ground truth labels: each sample's positive is its neighbouring augmented view-
    target = torch.arange(x.size(0), device = x.device)
    target[0::2] += 1
    target[1::2] -= 1

    # Standard cross-entropy loss
    return F.cross_entropy(xcs / temperature, target, reduction = "mean")


# Initialize ResNet-50 bottleneck CNN-
backbone_enc = ResNet(Bottleneck, [3, 4, 6, 3])
# Drop the classification head so the backbone returns the 2048-d pooled
# features expected by the projection head (assumes torchvision's ResNet)-
backbone_enc.fc = nn.Identity()

# Define LightningModule-
class SimCLR_ResNet50(pl.LightningModule):
    def __init__(
        self, backbone_enc,
        temperature = 0.05
    ):
        super().__init__()
        
        self.backbone_enc = backbone_enc
        self.projection_head = nn.Linear(in_features = 2048, out_features = 128, bias = True)
        self.temperature = temperature

    '''
    def validation_step(self, batch, batch_idx):
        # Validation loop.
        x_t, y_t = batch
        x_t = x_t.view(x_t.size(0), -1)
        z_t = self.encoder(x_t)
        pred_t = self.decoder(z_t)
        loss_t = nn.functional.mse_loss(pred_t, x_t)

        self.log('test_loss', loss_t, sync_dist=True)
        return loss_t
        '''
    
    def training_step(self, batch, batch_idx):
        # training_step() defines the training loop.
        # It's independent of forward().
        x1, x2 = batch
        
        h1 = self.backbone_enc(x1)
        h2 = self.backbone_enc(x2)
        z1 = F.relu(self.projection_head(h1))
        z2 = F.relu(self.projection_head(h2))
        # Interleave the two views so positives are adjacent, matching the
        # (0, 1), (2, 3), ... pairing assumed by nt_xent_loss()-
        x = torch.stack((z1, z2), dim = 1).reshape(-1, z1.size(1))
        
        loss = nt_xent_loss(x, temperature = self.temperature)

        # log to Tensorboard (if installed) by default-
        self.log('train_ntxent_loss', loss, sync_dist = True)
        return loss


    def configure_optimizers(self):
        # optimizer = optim.Adam(params = self.parameters(), lr = 1e-3)
 
        # Specify parameters to optimize (use this module's own sub-modules,
        # not module-level globals, so each DDP process optimizes its own copy)-
        optimizer = optim.Adam(
            params = list(self.backbone_enc.parameters()) + list(self.projection_head.parameters()),
            lr = 1e-3
        )
        
        return optimizer

# Initialize SimCLR model-
model = SimCLR_ResNet50(backbone_enc = backbone_enc, temperature = 0.05)

# Train the model
# The Lightning Trainer "mixes" any LightningModule with any dataset and
# abstracts away all the engineering complexity needed for scale.
trainer = pl.Trainer(
    limit_train_batches = 1.0,
    # limit_val_batches = 1.0,
    max_epochs = 20,
    accelerator = 'gpu',
    # devices = 2,
    # devices = [0, 1, 2, 3, 4, 5, 6, 7]
    devices = "auto"
)

trainer.fit(
    model = model, train_dataloaders = train_loader,
    # val_dataloaders = test_loader
)

which gives the error:


ProcessRaisedException Traceback (most recent call last)
Cell In[14], line 1
----> 1 trainer.fit(
2 model = model, train_dataloaders = train_loader,
3 # val_dataloaders = test_loader
4 )

File ~/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py:532, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
530 self.strategy._lightning_module = model
531 _verify_strategy_supports_compile(model, self.strategy)
---> 532 call._call_and_handle_interrupt(
533 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
534 )

File ~/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:42, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
40 try:
41 if trainer.strategy.launcher is not None:
---> 42 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
43 return trainer_fn(*args, **kwargs)
45 except _TunerExitException:

File ~/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py:126, in _MultiProcessingLauncher.launch(self, function, trainer, *args, **kwargs)
118 process_context = mp.start_processes(
119 self._wrapping_function,
120 args=process_args,
(…)
123 join=False, # we will join ourselves to get the process references
124 )
125 self.procs = process_context.processes
---> 126 while not process_context.join():
127 pass
129 worker_output = return_queue.get()

File ~/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/multiprocessing/spawn.py:160, in ProcessContext.join(self, timeout)
158 msg = "\n\n-- Process %d terminated with the following error:\n" % error_index
159 msg += original_trace
---> 160 raise ProcessRaisedException(msg, error_index, failed_process.pid)

ProcessRaisedException:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/strategies/launchers/multiprocessing.py", line 149, in _wrapping_function
results = function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 571, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/trainer/trainer.py", line 938, in _run
self.strategy.setup_environment()
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/strategies/ddp.py", line 144, in setup_environment
super().setup_environment()
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/strategies/strategy.py", line 129, in setup_environment
self.accelerator.setup_device(self.root_device)
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/pytorch/accelerators/cuda.py", line 44, in setup_device
_check_cuda_matmul_precision(device)
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/lightning/fabric/accelerators/cuda.py", line 349, in _check_cuda_matmul_precision
major, _ = torch.cuda.get_device_capability(device)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/cuda/__init__.py", line 381, in get_device_capability
prop = get_device_properties(device)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/cuda/__init__.py", line 395, in get_device_properties
_lazy_init() # will define _get_device_properties
^^^^^^^^^^^^
File "/home/majumdar/anaconda3/envs/lightning_cuda/lib/python3.11/site-packages/torch/cuda/__init__.py", line 235, in _lazy_init
raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method

what am I doing wrong?

My guess is that torchvision probably initializes CUDA somewhere before the trainer launches its worker processes, which then leads to this error, and that you are running in a Jupyter notebook. There are serious limitations for multi-GPU training in notebooks, so I suggest just running your code as a script if you can. The limitations and how to work around them are described here, but it is not always possible to get around them.
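
For example, here is a minimal sketch of how the same training could be launched as a plain script. The file name, the main() wrapper and the explicit strategy="ddp" are my assumptions for illustration; the dataset, model and loss definitions are the ones you already have.

# simclr_train.py -- hypothetical script layout (assumes the definitions from
# the post above live in the same file)
import lightning.pytorch as pl
from torch.utils.data import DataLoader


def main():
    # Build everything here and avoid touching CUDA (no .cuda() calls, no
    # torch.cuda.* queries) before trainer.fit(), so that each DDP worker
    # process initializes CUDA on its own.
    train_transform = training_transformations(
        jitter_strength = 1.0, inp_height = 96, gaussian_blur = True
    )
    unlabeled_data = STL10UnlabeledDataset(
        path_to_data = path_to_stl10, transform = train_transform
    )
    train_loader = DataLoader(
        dataset = unlabeled_data, batch_size = 1024,
        shuffle = False, num_workers = 1, pin_memory = True
    )

    model = SimCLR_ResNet50(backbone_enc = backbone_enc, temperature = 0.05)

    trainer = pl.Trainer(
        max_epochs = 20,
        accelerator = 'gpu',
        devices = "auto",
        strategy = "ddp"  # explicit DDP; in a script each GPU gets its own process
    )
    trainer.fit(model = model, train_dataloaders = train_loader)


if __name__ == "__main__":
    main()

Launched with python simclr_train.py, Lightning starts one process per GPU and each process initializes CUDA itself, so the "Cannot re-initialize CUDA in forked subprocess" error should not occur.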
