The automatic-optimization SGD version works well, but the manual-optimization version does not save any checkpoint, even with `save_last` or `save_top_k=1` enabled.
The manual-optimization version differs from the automatic-optimization SGD version as follows:
def training_step(
    self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int
) -> torch.Tensor:
    """Perform a single SAM training step (two forward-backward passes) under
    manual optimization.

    :param batch: A batch of data (a tuple) containing the input tensor of images and target
        labels.
    :param batch_idx: The index of the current batch.
    :return: The loss from the first (unperturbed) forward pass, kept for logging.
    """
    optimizer = self.optimizers()

    # Start every step with clean gradients: SAM's internal second_step does not
    # necessarily zero them, and stale gradients would accumulate across batches.
    optimizer.zero_grad()

    # First forward-backward pass: gradients at the current weights, which SAM
    # uses to compute the adversarial weight perturbation.
    loss_1, preds_1, targets_1 = self.model_step(batch)
    self.manual_backward(loss_1)

    # Update and log metrics from the first (unperturbed) pass.
    self.train_loss(loss_1)
    self.train_acc(preds_1, targets_1)
    self.log("train/loss", self.train_loss, on_step=False, on_epoch=True, prog_bar=True)
    self.log("train/acc", self.train_acc, on_step=False, on_epoch=True, prog_bar=True)
    self.log("learning rate", self.get_lr(optimizer), on_step=True, on_epoch=False, prog_bar=False)

    def second_pass() -> torch.Tensor:
        # Re-evaluate loss and gradients at the SAM-perturbed weights.
        loss_2, _, _ = self.model_step(batch)
        self.manual_backward(loss_2)
        return loss_2

    # BUG FIX (why no checkpoints were saved): the original code called
    # `optimizer.first_step(...)` / `optimizer.second_step(...)` directly. Those
    # attribute accesses fall through the LightningOptimizer wrapper returned by
    # `self.optimizers()` to the raw SAM optimizer, so Lightning never observes
    # an `optimizer.step()` call and `trainer.global_step` stays at 0 forever.
    # ModelCheckpoint skips a save whenever global_step has not advanced since
    # the last save, so no checkpoint (not even save_last / save_top_k) was ever
    # written. Routing the update through the wrapped `step(closure)` makes
    # Lightning count the step and restores checkpointing. SAM's `step(closure)`
    # performs first_step -> closure -> second_step internally (assumes a
    # davda54/sam-style implementation — confirm your SAM class supports it).
    optimizer.step(closure=second_pass)

    # Return the first-pass loss so callers/loggers can inspect it.
    return loss_1