I have written a custom callback to log the metrics of my interest using the “callback_metrics” dictionary. However, the dictionary contains multiple values for training/validation loss/mae. I would like to know what’s the reason behind that difference and if I made a mistake somewhere. The quantities that I need to log are training loss, training mae, validation loss, and validation mae. My code is as follows,
def training_step(self, train_batch, batch_idx):
x, y = train_batch
logits = self.forward(x)
loss = self.mae_loss(logits, y)
mae = self.mean_absolute_error(logits, y)
self.log('train_loss', loss, sync_dist=True, on_epoch=True, on_step=True)
self.log('train_mae', mae, sync_dist=True, on_epoch=True, on_step=True)
self.log('epoch_num', self.current_epoch, sync_dist=True, on_epoch=True, on_step=True)
return {'loss': loss, 'mae': mae}
def training_epoch_end(self, outputs):
avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
avg_mae = torch.stack([x['mae'] for x in outputs]).mean()
self.log('avg_train_loss', avg_loss, on_epoch=True, sync_dist=True)
self.log('avg_train_mae', avg_mae, on_epoch=True, sync_dist=True)
def validation_step(self, val_batch, batch_idx):
x, y = val_batch
logits = self.forward(x)
loss = self.mae_loss(logits, y)
mae = self.mean_absolute_error(logits, y)
self.log('val_loss', loss, on_step=True, on_epoch=True, sync_dist=True)
self.log('val_mae', mae, on_step=True, on_epoch=True, sync_dist=True)
return {'rval_loss': loss, 'rval_mae': mae}
def validation_epoch_end(self, outputs):
avg_loss = torch.stack([x['rval_loss'] for x in outputs]).mean()
avg_mae = torch.stack([x['rval_mae'] for x in outputs]).mean()
self.log('avg_val_loss', avg_loss, on_epoch=True, sync_dist=True)
self.log('avg_val_mae', avg_mae, on_epoch=True, sync_dist=True)
The callback_metrics dictionary I got (for one of the epochs) from this is as follows,
{
"avg_train_loss": 0.7574038505554199,
"avg_train_mae": 0.7574038505554199,
"avg_val_loss": 1.288301944732666,
"avg_val_mae": 1.288301944732666,
"epoch_num": 10.0,
"epoch_num_epoch": 10.0,
"epoch_num_step": 10.0,
"train_loss": 0.7605399489402771,
"train_loss_epoch": 0.7293527722358704,
"train_loss_step": 0.7605399489402771,
"train_mae": 0.7605400085449219,
"train_mae_epoch": 0.7293527722358704,
"train_mae_step": 0.7605400085449219,
"val_loss": 1.0137921571731567,
"val_loss_epoch": 1.2942861318588257,
"val_mae": 1.0137921571731567,
"val_mae_epoch": 1.2942861318588257
}
As you can see, avg_train_loss, train_loss (= train_loss_step), and train_loss_epoch have different values. The trend is the same for train mae and validation loss/mae.