How to correctly log metrics and losses when the model returns a dictionary as output

I’m finding my way around PyTorch Lightning and I’m getting different results compared to vanilla PyTorch. I’m wondering whether this is because I’m logging the loss incorrectly. I have multiple outputs and I’m summing over the per-output losses, so maybe that is the error? Here is my code. I’d also be interested in feedback on whether the metric calculations are correct!


import torch
import pytorch_lightning as pl
from torchmetrics import MetricCollection

# CompileParams and get_metric are project-specific helpers (not shown here).


class MultiOutputModule(pl.LightningModule):
    def __init__(self, model: torch.nn.Module, compile_params: CompileParams):
        super().__init__()
        self.model = model
        self.compile_params = compile_params
        self.metrics = self._prepare_metrics()

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        loss = self._compute_loss(y_hat, y)
        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self.model(x)
        loss = self._compute_loss(y_pred, y)
        self.log(
            "val_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True
        )
        self._update_metrics(y_pred, y)

        return loss

    def on_validation_epoch_end(self):
        self._compute_metrics()

    def configure_optimizers(self):
        optimizer = self.compile_params.optimizer(
            self.model.parameters(), lr=self.compile_params.start_lr
        )
        scheduler = self.compile_params.lr_scheduler(
            optimizer, **self.compile_params.scheduler_kwargs
        )
        return [optimizer], [scheduler]

    def _compute_loss(self, outputs: dict, labels: dict) -> torch.Tensor:
        # Compute one MSE loss per output head and sum them into a single scalar.
        loss_values_all = []
        for output_name, output_value in outputs.items():
            loss_value = torch.nn.functional.mse_loss(output_value, labels[output_name])
            loss_values_all.append(loss_value)
        return sum(loss_values_all)

    def _update_metrics(self, val_outputs: dict, val_labels: dict) -> None:
        for output_name, val_output_data in val_outputs.items():
            val_label_data = val_labels[output_name]
            current_metrics = self.metrics[output_name]
            current_metrics.update(val_output_data, val_label_data)

    def _compute_metrics(self):
        # Aggregate the epoch-level metrics per output head, log them, and reset.
        for output_name, metrics in self.metrics.items():
            metric_values = metrics.compute()
            self.log_dict(metric_values, prog_bar=False, logger=True)
            metrics.reset()

    def _prepare_metrics(self):
        # Build one MetricCollection per output head, prefixed with the output name.
        metrics_dict = {}
        for output_name, metrics in self.compile_params.metrics.items():
            curr_output_metrics = []
            for metric_name in metrics:
                metric = get_metric(metric_name)
                curr_output_metrics.append(metric)
            metrics_dict[output_name] = MetricCollection(
                curr_output_metrics, prefix=output_name + "_"
            )
        return metrics_dict
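
For comparison, here is a minimal plain-PyTorch sketch of the same summed-MSE loss over a dictionary of outputs (the output names and shapes below are just illustrative, not taken from my actual training script):

import torch
import torch.nn.functional as F

# Fake dictionary outputs and labels for two output heads.
outputs = {"head_a": torch.randn(8, 1), "head_b": torch.randn(8, 1)}
labels = {"head_a": torch.randn(8, 1), "head_b": torch.randn(8, 1)}

# Same reduction as _compute_loss: one MSE per output head, summed into a scalar.
total_loss = sum(F.mse_loss(outputs[name], labels[name]) for name in outputs)
print(total_loss)  # a single scalar tensor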

Hey @emilaz
I don’t see anything immediately wrong with the code. You update the metrics during the steps and then compute + reset at the end of the epoch.
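
For reference, that is the standard torchmetrics pattern; stripped of Lightning it looks roughly like this (the metric choices and names here are just for illustration):

import torch
from torchmetrics import MeanAbsoluteError, MeanSquaredError, MetricCollection

# One collection per output head, as in _prepare_metrics.
metrics = MetricCollection([MeanSquaredError(), MeanAbsoluteError()], prefix="head_a_")

for _ in range(3):  # stand-in for the validation steps of one epoch
    preds, target = torch.randn(8, 1), torch.randn(8, 1)
    metrics.update(preds, target)  # accumulate state step by step

epoch_values = metrics.compute()  # aggregate over the whole epoch
print(epoch_values)  # {'head_a_MeanSquaredError': ..., 'head_a_MeanAbsoluteError': ...}
metrics.reset()  # clear the state before the next epoch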

Do the metric values make sense? That is, do they look like a randomly initialized model at the beginning of training, and do they improve as training progresses?
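
One quick way to check is to run a validation pass before and after fitting and compare the logged values (just a sketch; module, train_loader and val_loader stand in for your own objects):

import pytorch_lightning as pl

trainer = pl.Trainer(max_epochs=10)

# Metrics of the randomly initialized model (the baseline).
trainer.validate(module, dataloaders=val_loader)

trainer.fit(module, train_dataloaders=train_loader, val_dataloaders=val_loader)

# Metrics after training -- these should clearly beat the baseline.
trainer.validate(module, dataloaders=val_loader)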