Hi, I am trying to use PyTorch Lightning for multi-GPU training, but I get the following error:
Traceback (most recent call last):
  File "segnet.py", line 423, in <module>
    trainer.fit(model,train_dataloader,test_dataloader)
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 737, in fit
    self._call_and_handle_interrupt(
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 682, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 772, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1195, in _run
    self._dispatch()
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1275, in _dispatch
    self.training_type_plugin.start_training(self)
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 173, in start_training
    self.spawn(self.new_process, trainer, self.mp_queue, return_result=False)
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 201, in spawn
    mp.spawn(self._wrapped_function, args=(function, args, kwargs, return_queue), nprocs=self.num_processes)
  File "/opt/conda/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/opt/conda/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
    while not context.join():
  File "/opt/conda/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 150, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 208, in _wrapped_function
    result = function(*args, **kwargs)
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 232, in new_process
    self.configure_ddp()
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 291, in configure_ddp
    self._model = self._setup_model(LightningDistributedModule(self.model))
  File "/uge_mnt/home/b_kgh/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 159, in _setup_model
    return DistributedDataParallel(module=model, device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 578, in __init__
    dist._verify_model_across_ranks(self.process_group, parameters)
RuntimeError: NCCL error in: /opt/pytorch/pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, unhandled cuda error, NCCL version 21.1.4
ncclUnhandledCudaError: Call to CUDA function failed.
Versions:
PyTorch 1.10
NCCL 2.11.4
CUDA 11.4
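
For reference, the Trainer is set up roughly as sketched below. This is a minimal stand-in, not my actual segnet.py: the LitSegModel module, the random-tensor dataloaders, gpus=2, and max_epochs are placeholders; the part that matters is the multi-GPU ddp_spawn configuration that produces the error.

import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset


class LitSegModel(pl.LightningModule):
    # Placeholder stand-in for my real SegNet LightningModule.
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self(x), y)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        self.log("val_loss", torch.nn.functional.cross_entropy(self(x), y))

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)


def make_loader():
    # Random tensors just to keep the sketch self-contained;
    # the real script loads image data.
    ds = TensorDataset(torch.randn(64, 32), torch.randint(0, 2, (64,)))
    return DataLoader(ds, batch_size=8)


if __name__ == "__main__":
    train_dataloader = make_loader()
    test_dataloader = make_loader()

    model = LitSegModel()
    trainer = pl.Trainer(
        gpus=2,                # assumed GPU count; any value > 1 starts the spawn
        strategy="ddp_spawn",  # matches the DDPSpawnPlugin frames in the traceback
        max_epochs=1,          # placeholder
    )
    trainer.fit(model, train_dataloader, test_dataloader)  # fails with the NCCL error above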