This is my code (the relevant parts of my LitNet LightningModule and the training entry point):
class LitNet(pl.LightningModule):
    # ... model definition, forward, training_step, and dataloaders omitted ...

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        labels_hat = torch.argmax(logits, dim=1)
        val_acc = accuracy(labels_hat, y)
        result = pl.EvalResult()
        result.log('val_acc', val_acc, prog_bar=True, reduce_fx=torch.mean)
        return result

    def validation_epoch_end(self, results):
        all_val_acc = results.val_acc
        val_acc = torch.mean(all_val_acc)
        result = pl.EvalResult()
        result.log('val_acc', val_acc)
        return result
if __name__ == '__main__':
    seed_everything(42)
    model = LitNet()
    checkpoint_callback = ModelCheckpoint(
        filepath='lightning_logs/main_checkpoints/mnist_v10_best.ckpt',
        monitor='val_acc')
    trainer = Trainer(max_epochs=10, gpus=1, checkpoint_callback=checkpoint_callback)
    trainer.fit(model)
    print(checkpoint_callback.best_model_path)
The issue I am facing is that checkpoint_callback.best_model_path returns an empty string and no checkpoint gets saved. I also got this UserWarning when running it:
/opt/conda/lib/python3.6/site-packages/pytorch_lightning/utilities/distributed.py:37: UserWarning:
When using EvalResult(early_stop_on=X) or TrainResult(early_stop_on=X) the
'monitor' key of ModelCheckpoint has no effect.
Remove ModelCheckpoint(monitor='val_acc) to fix')
I also tried the other way of doing this, i.e. using pl.EvalResult(checkpoint_on=val_acc) in both places (in validation_step and in validation_epoch_end). With that, a checkpoint was saved, but not the best one: the epoch=0.ckpt checkpoint was saved, even though I could visually confirm better val_acc in later epochs.
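For reference, the checkpoint_on variant looked roughly like this (a sketch of my change; val_acc is the same tensor computed in the step, and everything else is unchanged from the code above):

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        labels_hat = torch.argmax(logits, dim=1)
        val_acc = accuracy(labels_hat, y)
        # checkpoint_on is given the metric tensor that checkpointing should track
        result = pl.EvalResult(checkpoint_on=val_acc)
        result.log('val_acc', val_acc, prog_bar=True)
        return result

    def validation_epoch_end(self, results):
        val_acc = torch.mean(results.val_acc)
        result = pl.EvalResult(checkpoint_on=val_acc)
        result.log('val_acc', val_acc)
        return result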
Any help is much appreciated.