Hello everyone, I’ve got a really weird problem.
If I set num_workers > 0, training crashes during (or possibly even before) the initial sanity check. Task Manager shows high disk usage and the RAM fills up almost completely. I suspect that num_workers > 0 somehow causes the numpy memmaps (~7 GB each) to be loaded fully into memory, even though that should normally not happen. I know the Dataset class and everything else works with num_workers=0 (I did a fast_dev_run and some other tests where the RAM usage stayed low the entire time). A simplified sketch of my DataLoader setup is included below, after the version info.
OS: Windows 10 Pro
pytorch: 2.0.1
pytorch-lightning: 2.0.2
pytorch-cuda: 11.8
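
For reference, a simplified sketch of how the DataLoader is created in my LightningDataModule (batch size and worker count here are representative placeholders, not my exact values):

import lightning.pytorch as pl
from torch.utils.data import DataLoader

class MyDataModule(pl.LightningDataModule):
    def train_dataloader(self):
        # The crash only appears when num_workers > 0
        return DataLoader(self.train_dataset, batch_size=32, shuffle=True,
                          num_workers=4, pin_memory=True)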
Dataset class:
import numpy as np
import torch
import torchvision
from torch.utils.data import Dataset

# ps and hpf are project-specific helper modules (their imports are omitted here).
class MyDataset(Dataset):
    def __init__(self, parameter_settings: ps.Parameter, transform: torchvision.transforms = None):
        self.targets = torch.LongTensor(parameter_settings.get_labels())
        # One read-only memmap per file; the shape is read from disk via a project helper.
        self.list_of_arrays = [
            np.memmap(memmap_path + ".dat", dtype='float32', mode='r',
                      shape=hpf.read_memmap_shape(memmap_path))
            for memmap_path in parameter_settings.get_data_path()
        ]
        self.transform = transform

    def __getitem__(self, index):
        # Stack the index-th slice of every memmap into a single tensor.
        x = torch.from_numpy(np.stack([item[index] for item in self.list_of_arrays]))
        y = self.targets[index]
        if self.transform is not None:
            return self.transform(x), y
        return x, y

    def __len__(self):
        return len(self.targets)
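
This is roughly how I verified the Dataset itself with num_workers=0 (the parameter object is a placeholder for my real settings); RAM usage stays low the whole time:

from torch.utils.data import DataLoader

dataset = MyDataset(parameter_settings=parameter)           # placeholder settings object
loader = DataLoader(dataset, batch_size=32, num_workers=0)  # single-process loading works fine
x, y = next(iter(loader))
print(x.shape, y.shape)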
Output that is printed to the console:
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
-------------------------------------------------
0 | model | ResNet | 11.2 M
1 | train_acc | MulticlassAccuracy | 0
2 | valid_acc | MulticlassAccuracy | 0
3 | f1 | MulticlassF1Score | 0
-------------------------------------------------
11.2 M Trainable params
0 Non-trainable params
11.2 M Total params
44.733 Total estimated model params size (MB)
Traceback (most recent call last):
File "C:\Users\x\PycharmProjects\cnn\main.py", line 16, in <module>
nt.training_loop(parameter)
File "C:\Users\x\PycharmProjects\cnn\network_training.py", line 94, in training_loop
trainer.fit(model, data_module)
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\lightning\pytorch\trainer\trainer.py", line 520, in fit
call._call_and_handle_interrupt(
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\lightning\pytorch\trainer\call.py", line 44, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\lightning\pytorch\trainer\trainer.py", line 559, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\lightning\pytorch\trainer\trainer.py", line 935, in _run
results = self._run_stage()
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\lightning\pytorch\trainer\trainer.py", line 978, in _run_stage
self.fit_loop.run()
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\lightning\pytorch\loops\fit_loop.py", line 193, in run
self.setup_data()
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\lightning\pytorch\loops\fit_loop.py", line 235, in setup_data
_check_dataloader_iterable(dl, source, trainer_fn)
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py", line 383, in _check_dataloader_iterable
iter(dataloader) # type: ignore[call-overload]
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\torch\utils\data\dataloader.py", line 441, in __iter__
return self._get_iterator()
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\torch\utils\data\dataloader.py", line 388, in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File "C:\Users\x\miniconda3\envs\cnn\lib\site-packages\torch\utils\data\dataloader.py", line 1042, in __init__
w.start()
File "C:\Users\x\miniconda3\envs\cnn\lib\multiprocessing\process.py", line 121, in start
self._popen = self._Popen(self)
File "C:\Users\x\miniconda3\envs\cnn\lib\multiprocessing\context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "C:\Users\x\miniconda3\envs\cnn\lib\multiprocessing\context.py", line 336, in _Popen
return Popen(process_obj)
File "C:\Users\x\miniconda3\envs\cnn\lib\multiprocessing\popen_spawn_win32.py", line 93, in __init__
reduction.dump(process_obj, to_child)
File "C:\Users\x\miniconda3\envs\cnn\lib\multiprocessing\reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
OSError: [Errno 22] Invalid argument
Process finished with exit code 1