Validation step does not run after each epoch when training on all the data

Hi,

I’ve got the following module:

class EvaluationModel(pl.LightningModule):
    """Fully connected regression network trained with an L1 loss.

    ``forward`` flattens ``board`` from dimension 1 onward, concatenates it
    with ``features`` along dimension 1 and feeds the result through a stack
    of ``Linear``/``ReLU`` layers that ends in a single output unit.

    Args:
        train_data: Paths handed to ``EvaluationDataset`` for training.
        val_data: Paths handed to ``EvaluationDataset`` for validation.
        batch_size: Batch size used by both dataloaders.
        learning_rate: Learning rate passed to the Adam optimizer.
        hidden_layers: Number of hidden ``Linear`` + ``ReLU`` pairs that
            follow the entry layer.
        hidden_layer_width: Number of units in the entry layer and in every
            hidden layer.
    """

    def __init__(
            self,
            train_data: List[pathlib.Path],
            val_data: List[pathlib.Path],
            batch_size=1024,
            learning_rate=1e-3,
            hidden_layers=10,
            hidden_layer_width=256
    ):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.save_hyperparameters()

        # Input width: flattened board plus extra features.
        # NOTE(review): presumably 12 planes of an 8x8 board plus 3 scalar
        # features -- confirm against EvaluationDataset.
        layers: List[Tuple[str, nn.Module]] = [
            ('linear-entry', nn.Linear((12 * 8 * 8) + 3, hidden_layer_width, dtype=torch_dtype, bias=False)),
            ('activation-entry', nn.ReLU()),
        ]

        for i in range(hidden_layers):
            layers.extend([
                (f'linear-{i}', nn.Linear(hidden_layer_width, hidden_layer_width, dtype=torch_dtype)),
                (f'activation-{i}', nn.ReLU()),
            ])

        # Final projection down to one value per sample.
        layers.append(('linear', nn.Linear(hidden_layer_width, 1, dtype=torch_dtype)))
        self.seq = nn.Sequential(collections.OrderedDict(layers))

    def forward(self, board, features):
        """Return the network output for a batch of boards and features."""
        x = torch.cat([
            torch.flatten(board, 1),
            features
        ], 1)
        return self.seq(x)

    def _l1_loss(self, batch):
        """Compute the L1 loss between the prediction and ``batch['score']``."""
        y = batch['score']
        # Call the module itself rather than ``forward`` so that registered
        # hooks are run.
        y_hat = self(batch['board'], batch['features'])
        # NOTE(review): assumes batch['score'] has the same shape as the
        # network output (last dimension 1) -- confirm.
        return F.l1_loss(y_hat, y)

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` per step and per epoch, and return the loss."""
        loss = self._l1_loss(batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and return it."""
        loss = self._l1_loss(batch)
        # "val_loss" is the metric monitored by the LR scheduler below.
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def train_dataloader(self) -> DataLoader:
        """Build the training dataloader over ``self.train_data``."""
        dataset = EvaluationDataset(self.train_data)
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=32, pin_memory=True, drop_last=True)

    def val_dataloader(self) -> DataLoader:
        """Build the validation dataloader over ``self.val_data``."""
        dataset = EvaluationDataset(self.val_data)
        # NOTE(review): drop_last=True also discards the final partial
        # validation batch -- confirm this is intended.
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=32, drop_last=True)

    def configure_optimizers(self):
        """Return Adam plus a ``ReduceLROnPlateau`` scheduler on ``val_loss``."""
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = ReduceLROnPlateau(optimizer, mode="min", verbose=True)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "epoch",
                "frequency": 1,
                "monitor": "val_loss",
                "strict": True,
            }
        }

and the following trainer setup:

    lr = 1e-3

    # Model under training.
    model = EvaluationModel(
        train_data=train_paths,
        val_data=val_paths,
        batch_size=1024 * 4,
        learning_rate=lr,
        hidden_layers=6,
        hidden_layer_width=2048,
    )

    # Callbacks, named individually and collected in the order they are passed.
    swa = StochasticWeightAveraging(swa_lrs=lr, device=None)
    early_stopping = EarlyStopping(monitor="val_loss", verbose=True, check_on_train_epoch_end=False)
    lr_monitor = LearningRateMonitor(logging_interval='epoch', log_momentum=True)
    # save_top_k=-1: every checkpoint is kept.
    checkpointing = ModelCheckpoint(
        filename='epoch={epoch}-step={step}-val_loss={val_loss:.3f}-train_loss={train_loss_epoch:.3f}',
        save_top_k=-1,
    )
    callbacks = [swa, early_stopping, lr_monitor, checkpointing]

    accumulate_grad_batches = 7
    tb_logger = TensorBoardLogger(save_dir="logs_train3_2/")

    trainer = pl.Trainer(
        accelerator="gpu",
        max_epochs=2000,
        logger=tb_logger,
        callbacks=callbacks,
        accumulate_grad_batches=accumulate_grad_batches,
        #precision='16-mixed',
        # limit_train_batches=4,
        # limit_val_batches=4,
        # log_every_n_steps=1,
    )

    trainer.fit(
        model,
        # ckpt_path=list(pathlib.Path(r"logs_train3/lightning_logs/version_0/checkpoints").rglob("*.ckpt"))[0]
    )

It seems that if I uncomment the following lines:

        # limit_train_batches=4,
        # limit_val_batches=4,
        # log_every_n_steps=1,

everything works as expected, i.e., after each training epoch it runs a validation epoch, and the early stopping callback works as expected.

However, if I comment these out, it doesn’t seem to run validation after epoch 0.
So far, with check_on_train_epoch_end=False in EarlyStopping, it hasn’t failed on epoch 0 and is still running (epoch 1 now, but no val_loss metric in the progress bar).
However, if I remove check_on_train_epoch_end=False from EarlyStopping, it actually fails with an exception saying it can’t find the val_loss metric after epoch 0 has finished.

Am I missing something? Why is it not running validation after epoch 0 when running with all the batches?

It seems it moved on to epoch 2 without having run validation.

I suspect I am hitting:

@jmr Are you working with an iterable-style dataset here?

I am, as I have a few hundred GiB of training data chunked into 4 GiB files, and the dataset reads one file and then moves on to the next.

Also, I know exactly how many samples there are in that data, and how many batches that will end up being: every sample is the same size, so (the sum of the sizes of all files) / (the sample size) gives me the number of samples.

It seems that if I don’t define `__len__` on my iterable dataset it works, but then I get no progress reports, which is disappointing (given that I know exactly how much data there is).

Is there a better way to do this?
Perhaps use normal Dataset for each 4GiB chunk and then some sort of ChainedDataset or something like that? Or will that still pre-read all of it into memory?

It seems that setting check_val_every_n_epoch=None on the trainer (i.e. passing it to pl.Trainer) causes validation to run each epoch and allows having iterable datasets that declare a length.