The following seems to work on the CPU but does not work on the GPU. This is my first time using Lightning, so I might also be doing something incorrectly.
from typing import Dict, Tuple
import fire
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pl_bolts.models.detection.faster_rcnn import FasterRCNN
from pytorch_lightning.loggers import TensorBoardLogger
from torch import Tensor
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import random_split
from <redacted> import Dataset
def collate(batch):
    return tuple(zip(*batch))


class DataModule(pl.LightningDataModule):
    """TorchLightning DataModule from Dataset"""

    def __init__(self, batch_size: int = 2):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = 1
        dataset = Dataset()
        ltrain = len(dataset)
        ltest = int(ltrain * 0.2)
        ltrain -= ltest
        lval = int(ltrain * 0.2)
        ltrain -= lval
        lengths = ltrain, lval, ltest
        print(f"train: {ltrain} val: {lval} test: {ltest}")
        self.train, self.val, self.test = random_split(dataset, lengths)

    def val_dataloader(self) -> DataLoader[Tuple[Tensor, Dict[str, Tensor]]]:
        val_loader: DataLoader[Tuple[Tensor, Dict[str, Tensor]]] = DataLoader(
            self.val,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=collate,
        )
        return val_loader

    def train_dataloader(self) -> DataLoader[Tuple[Tensor, Dict[str, Tensor]]]:
        train_loader: DataLoader[Tuple[Tensor, Dict[str, Tensor]]] = DataLoader(
            self.train,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=collate,
        )
        return train_loader

    def test_dataloader(self) -> DataLoader[Tuple[Tensor, Dict[str, Tensor]]]:
        test_loader: DataLoader[Tuple[Tensor, Dict[str, Tensor]]] = DataLoader(
            self.test,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=collate,
        )
        return test_loader


class Train:
    @staticmethod
    def start():
        example = Dataset()[0]
        print(f"""
\tsrc: {example[0].shape}
\tboxes: {example[1]['boxes'].shape}
\tlabels: {example[1]['labels'].shape}
""")
        logger = TensorBoardLogger(save_dir="logs", version=1, name="training_logs")
        trainer = Trainer(
            logger=logger,
            # accelerator="ddp",
            gpus=1,
            # amp_backend="apex",
        )
        model = FasterRCNN()
        trainer.fit(model, datamodule=DataModule())


if __name__ == "__main__":
    fire.Fire(Train())
The printed dataset example is:
src: torch.Size([3, 1028, 1232])
boxes: torch.Size([26, 4])
labels: torch.Size([26])
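For context, the imported Dataset is redacted here, but each item follows the (image, target) format that the torchvision detection models behind pl_bolts' FasterRCNN expect. A minimal hypothetical stand-in (class name, sizes, and box values are made up for illustration, not my actual dataset) would look like this:

import torch
from torch.utils.data import Dataset as TorchDataset


class ToyDetectionDataset(TorchDataset):
    """Hypothetical stand-in for the redacted Dataset, returning (image, target)."""

    def __len__(self):
        return 8

    def __getitem__(self, idx):
        # CHW float image, matching the printed src shape torch.Size([3, 1028, 1232])
        image = torch.rand(3, 1028, 1232)
        target = {
            # [N, 4] boxes as (x1, y1, x2, y2) pixel coordinates with x2 > x1 and y2 > y1
            "boxes": torch.tensor([[10.0, 20.0, 100.0, 200.0]]),
            # [N] int64 class labels; 0 is reserved for background in torchvision detection
            "labels": torch.tensor([1], dtype=torch.int64),
        }
        return image, target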
But on the GPU it fails with:
UserWarning: The dataloader, train dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 20 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
warnings.warn(*args, **kwargs)
Epoch 0: 0%| | 0/56 [00:00<?, ?it/s]/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [32,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [33,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [34,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [35,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [36,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
<continues for a while with the same assertion; only the thread number changes>
Traceback (most recent call last):
File "/usr/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "<my_path>/.vscode-server/extensions/ms-python.python-2021.6.944021595/pythonFiles/lib/python/debugpy/__main__.py", line 45, in <module>
cli.main()
File "<my_path>/.vscode-server/extensions/ms-python.python-2021.6.944021595/pythonFiles/lib/python/debugpy/../debugpy/server/cli.py", line 444, in main
run()
File "<my_path>/.vscode-server/extensions/ms-python.python-2021.6.944021595/pythonFiles/lib/python/debugpy/../debugpy/server/cli.py", line 285, in run_file
runpy.run_path(target_as_str, run_name=compat.force_str("__main__"))
File "/usr/lib/python3.9/runpy.py", line 268, in run_path
return _run_module_code(code, init_globals, run_name,
File "/usr/lib/python3.9/runpy.py", line 97, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "<my_path>/train.py", line 96, in <module>
fire.Fire(Train())
File "<my_path>/ENV/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "<my_path>/ENV/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "<my_path>/ENV/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "<my_path>/train.py", line 92, in mgg
trainer.fit(model, datamodule=DataModule())
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 458, in fit
self._run(model)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 756, in _run
self.dispatch()
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 797, in dispatch
self.accelerator.start_training(self)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
self.training_type_plugin.start_training(trainer)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
self._results = trainer.run_stage()
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 807, in run_stage
return self.run_train()
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 869, in run_train
self.train_loop.run_training_epoch()
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 499, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 738, in run_training_batch
self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 434, in optimizer_step
model_ref.optimizer_step(
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1403, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 214, in step
self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 134, in __optimizer_step
trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 329, in optimizer_step
self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 336, in run_optimizer_step
self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 193, in optimizer_step
optimizer.step(closure=lambda_closure, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/torch/optim/optimizer.py", line 89, in wrapper
return func(*args, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/torch/optim/sgd.py", line 87, in step
loss = closure()
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 732, in train_step_and_backward_closure
result = self.training_step_and_backward(
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 823, in training_step_and_backward
result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 290, in training_step
training_step_output = self.trainer.accelerator.training_step(args)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 204, in training_step
return self.training_type_plugin.training_step(*args)
File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 155, in training_step
return self.lightning_module.training_step(*args, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/pl_bolts/models/detection/faster_rcnn/faster_rcnn_module.py", line 112, in training_step
loss_dict = self.model(images, targets)
File "<my_path>/ENV/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/torchvision/models/detection/generalized_rcnn.py", line 97, in forward
proposals, proposal_losses = self.rpn(images, features, targets)
File "<my_path>/ENV/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "<my_path>/ENV/lib/python3.9/site-packages/torchvision/models/detection/rpn.py", line 364, in forward
loss_objectness, loss_rpn_box_reg = self.compute_loss(
File "<my_path>/ENV/lib/python3.9/site-packages/torchvision/models/detection/rpn.py", line 296, in compute_loss
sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
File "<my_path>/ENV/lib/python3.9/site-packages/torchvision/models/detection/_utils.py", line 45, in __call__
positive = torch.where(matched_idxs_per_image >= 1)[0]
RuntimeError: CUDA error: device-side assert triggered
Exception ignored in: <function tqdm.__del__ at 0x7f01c474d550>
Traceback (most recent call last):
File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1145, in __del__
File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1299, in close
File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1492, in display
File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1148, in __str__
File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1450, in format_dict
TypeError: cannot unpack non-iterable NoneType object
I have no clue what this index assertion means. Do I have an error in my dataset (unlikely, since it seems to work on the CPU), or is this some other error?
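In case it is relevant, here is a sanity check that could be run on the CPU to rule out out-of-range labels or degenerate boxes. This is only a sketch: it assumes the pl_bolts FasterRCNN default of num_classes=91, so that value would need to match however the model is actually configured.

# Hypothetical CPU-side sanity check over the same Dataset imported above.
# Assumes num_classes=91 (the COCO default used by pl_bolts' FasterRCNN); adjust if different.
num_classes = 91
dataset = Dataset()

for i in range(len(dataset)):
    _, target = dataset[i]
    boxes, labels = target["boxes"], target["labels"]
    if labels.numel() == 0:
        continue

    # Labels must lie in [0, num_classes); anything outside indexes out of bounds on the GPU.
    assert labels.min() >= 0 and labels.max() < num_classes, f"bad labels at {i}: {labels}"

    # Boxes must have positive width and height (x2 > x1, y2 > y1).
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    assert (widths > 0).all() and (heights > 0).all(), f"degenerate box at {i}: {boxes}"

Running the script with CUDA_LAUNCH_BLOCKING=1 should also make the stack trace point at the kernel that actually fails rather than at a later synchronization point, if that helps narrow it down.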