Hi,
I’ve got the following module:
# Imports shown for completeness; EvaluationDataset and torch_dtype are defined elsewhere in my code.
import collections
import pathlib
from typing import List, Tuple

import pytorch_lightning as pl  # or "import lightning.pytorch as pl", depending on the installed package
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader


class EvaluationModel(pl.LightningModule):
    def __init__(
        self,
        train_data: List[pathlib.Path],
        val_data: List[pathlib.Path],
        batch_size=1024,
        learning_rate=1e-3,
        hidden_layers=10,
        hidden_layer_width=256,
    ):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.save_hyperparameters()

        # MLP over the flattened 12x8x8 board planes plus 3 extra features.
        layers: List[Tuple[str, nn.Module]] = [
            ('linear-entry', nn.Linear((12 * 8 * 8) + 3, hidden_layer_width, dtype=torch_dtype, bias=False)),
            ('activation-entry', nn.ReLU()),
        ]
        for i in range(hidden_layers):
            layers.append(
                (f'linear-{i}', nn.Linear(hidden_layer_width, hidden_layer_width, dtype=torch_dtype))
            )
            layers.append(
                (f'activation-{i}', nn.ReLU())
            )
        layers.append(('linear', nn.Linear(hidden_layer_width, 1, dtype=torch_dtype)))
        self.seq = nn.Sequential(collections.OrderedDict(layers))

    def forward(self, board, features):
        x = torch.cat([
            torch.flatten(board, 1),
            features
        ], 1)
        return self.seq(x)

    def training_step(self, batch, batch_idx):
        y = batch['score']
        y_hat = self.forward(batch['board'], batch['features'])
        loss = F.l1_loss(y_hat, y)  # mean absolute error against the target score
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        y = batch['score']
        y_hat = self.forward(batch['board'], batch['features'])
        loss = F.l1_loss(y_hat, y)
        self.log("val_loss", loss, prog_bar=True)  # aggregated per epoch by default in validation
        return loss

    def train_dataloader(self) -> DataLoader:
        dataset = EvaluationDataset(self.train_data)
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=32, pin_memory=True, drop_last=True)

    def val_dataloader(self) -> DataLoader:
        dataset = EvaluationDataset(self.val_data)
        return DataLoader(dataset, batch_size=self.batch_size, num_workers=32, drop_last=True)

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = ReduceLROnPlateau(optimizer, mode="min", verbose=True)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "epoch",
                "frequency": 1,
                "monitor": "val_loss",
                "strict": True,
            },
        }
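For reference, the input is a batch of 12×8×8 board planes plus 3 extra features, so the flattened input has (12 * 8 * 8) + 3 = 771 dimensions and the network outputs a single score per sample. A minimal shape check looks like this (sketch; assumes torch_dtype = torch.float32 and the train_paths/val_paths lists from below):

# Sketch: shape sanity check (assumes torch_dtype = torch.float32)
model = EvaluationModel(train_paths, val_paths)
board = torch.zeros(4, 12, 8, 8)
features = torch.zeros(4, 3)
print(model(board, features).shape)  # expected: torch.Size([4, 1])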
This is the trainer setup:
# Callback and logger imports (same pytorch_lightning package as above).
from pytorch_lightning.callbacks import (
    EarlyStopping, LearningRateMonitor, ModelCheckpoint, StochasticWeightAveraging,
)
from pytorch_lightning.loggers import TensorBoardLogger

lr = 1e-3
model = EvaluationModel(
    train_paths, val_paths,
    batch_size=1024 * 4,
    learning_rate=lr,
    hidden_layers=6,
    hidden_layer_width=2048,
)
callbacks = [
    StochasticWeightAveraging(swa_lrs=lr, device=None),
    EarlyStopping(monitor="val_loss", verbose=True, check_on_train_epoch_end=False),
    LearningRateMonitor(logging_interval='epoch', log_momentum=True),
    ModelCheckpoint(
        filename='epoch={epoch}-step={step}-val_loss={val_loss:.3f}-train_loss={train_loss_epoch:.3f}',
        save_top_k=-1,
    ),
]
accumulate_grad_batches = 7
tb_logger = TensorBoardLogger(save_dir="logs_train3_2/")
trainer = pl.Trainer(
    accelerator="gpu",
    max_epochs=2000,
    callbacks=callbacks,
    accumulate_grad_batches=accumulate_grad_batches,
    # precision='16-mixed',
    logger=tb_logger,
    # limit_train_batches=4,
    # limit_val_batches=4,
    # log_every_n_steps=1,
)
trainer.fit(
    model,
    # ckpt_path=list(pathlib.Path(r"logs_train3/lightning_logs/version_0/checkpoints").rglob("*.ckpt"))[0]
)
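My understanding is that with this setup the trainer should still run a validation pass at the end of every training epoch, i.e. the call above should behave the same as spelling out the validation-related flags explicitly (these values are the Trainer defaults, as far as I can tell):

# Sketch: the validation cadence I expect, written out explicitly.
# The values below are the Trainer defaults as far as I can tell.
trainer = pl.Trainer(
    accelerator="gpu",
    max_epochs=2000,
    callbacks=callbacks,
    accumulate_grad_batches=accumulate_grad_batches,
    logger=tb_logger,
    check_val_every_n_epoch=1,  # validate after every training epoch
    val_check_interval=1.0,     # ...at the end of the epoch
    num_sanity_val_steps=2,     # short val sanity check before training starts
)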
It seems that if I un-comment the following lines:

    # limit_train_batches=4,
    # limit_val_batches=4,
    # log_every_n_steps=1,

everything works as expected, i.e., after each training epoch it runs a validation epoch, and the EarlyStopping callback works as expected.
However, if I leave them commented out (i.e. run on the full dataset), it doesn't seem to run validation after epoch 0.
So far, with check_on_train_epoch_end=False in EarlyStopping, it hasn't failed on epoch 0 and is still running (epoch 1 now, but there is no val_loss metric in the progress bar).
However, if I remove check_on_train_epoch_end=False from EarlyStopping, it fails with an exception saying it can't find the val_loss metric after epoch 0 finishes.
Am I missing something? Why does validation not run after epoch 0 when training on the full set of batches?
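In case it's relevant, one thing I can check is whether the validation DataLoader yields any batches at all, since drop_last=True together with batch_size=1024 * 4 would silently produce zero validation batches if the validation set is smaller than one batch (sketch; assumes EvaluationDataset is a map-style dataset with __len__):

# Sketch: count the batches the val DataLoader actually yields
# (assumes EvaluationDataset implements __len__).
val_loader = model.val_dataloader()
print(len(val_loader.dataset), "validation samples")
print(len(val_loader), "validation batches per epoch")  # 0 would mean the val loop is skipped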