RuntimeError: Early stopping conditioned on metric `val_loss` which is not available.

#######
def mixup(x: torch.Tensor, y: torch.Tensor, alpha: float = 1.0):
    assert alpha > 0, "alpha should be larger than 0"
    assert x.size(0) > 1, "Mixup cannot be applied to a single instance."

    # Sample the mixing coefficient and pair each sample with a random partner.
    lam = np.random.beta(alpha, alpha)
    rand_index = torch.randperm(x.size(0))
    mixed_x = lam * x + (1 - lam) * x[rand_index, :]
    target_a, target_b = y, y[rand_index]
    return mixed_x, target_a, target_b, lam
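
For reference, here is a minimal smoke test of how mixup is consumed (hypothetical shapes, not part of the original code; assumes the same torch / numpy-as-np imports the function itself relies on):

# Hypothetical smoke test for mixup (illustrative only).
x = torch.randn(4, 3, 224, 224)  # a batch of 4 RGB images
y = torch.rand(4)                # regression targets scaled to [0, 1]

mixed_x, y_a, y_b, lam = mixup(x, y, alpha=0.5)
assert mixed_x.shape == x.shape  # mixing preserves the batch shape

# The matching loss is the lam-weighted sum against both target sets,
# exactly as in training_step below:
# loss = lam * criterion(logits, y_a) + (1 - lam) * criterion(logits, y_b)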

class Model(pl.LightningModule):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.__build_model()
        self._criterion = eval(self.cfg['loss'])()
        self.transform = get_default_transforms()
        self.save_hyperparameters(cfg)
        self.training_step_outputs = []
        self.validation_step_outputs = []

    def __build_model(self):
        self.backbone = create_model(
            self.cfg['model']['name'], pretrained=True, num_classes=0, in_chans=3
        )
        num_features = self.backbone.num_features
        self.fc = nn.Sequential(
            nn.Dropout(0.5), nn.Linear(num_features, self.cfg['model']['output_dim'])
        )

    def forward(self, x):
        f = self.backbone(x)
        out = self.fc(f)
        return out

    def training_step(self, batch, batch_idx):
        images, labels = batch
        labels = labels.float() / 100.0
        images = self.transform['train'](images)
        
        if torch.rand(1)[0] < 0.5:
            mix_images, target_a, target_b, lam = mixup(images, labels, alpha=0.5)
            logits = self.forward(mix_images).squeeze(1)
            loss = self._criterion(logits, target_a) * lam + \
                (1 - lam) * self._criterion(logits, target_b)
        else:
            logits = self.forward(images).squeeze(1)
            loss = self._criterion(logits, labels)
        
        pred = logits.sigmoid().detach().cpu() * 100.
        labels = labels.detach().cpu() * 100.
        self.training_step_outputs.append({'loss': loss, 'pred': pred, 'labels': labels})
        return {'loss': loss}
        
    def validation_step(self, batch, batch_idx):
        images, labels = batch
        labels = labels.float() / 100.0
        images = self.transform['val'](images)

        logits = self.forward(images).squeeze(1)
        loss = self._criterion(logits, labels)

        pred = logits.sigmoid().detach().cpu() * 100.
        labels = labels.detach().cpu() * 100.
        self.validation_step_outputs.append({'loss': loss, 'pred': pred, 'labels': labels})
        return {'loss': loss}
        
    def on_train_epoch_end(self):
        losses = torch.stack([x['loss'] for x in self.training_step_outputs]).mean()
        preds = torch.cat([x['pred'] for x in self.training_step_outputs], dim=0)
        labels = torch.cat([x['labels'] for x in self.training_step_outputs], dim=0)
        metrics = torch.sqrt(((labels - preds)**2).mean())
        self.log('train_loss', metrics, on_epoch=True)
        # Clear so outputs don't accumulate across epochs.
        self.training_step_outputs.clear()
        return {'train_loss': metrics}

    def on_validation_epoch_end(self):
        losses = torch.stack([x['loss'] for x in self.validation_step_outputs]).mean()
        preds = torch.cat([x['pred'] for x in self.validation_step_outputs], dim=0)
        labels = torch.cat([x['labels'] for x in self.validation_step_outputs], dim=0)
        val_metrics = torch.sqrt(((labels - preds)**2).mean())
        self.log("val_loss", val_metrics, on_epoch=True)
        self.validation_step_outputs.clear()
        return {'val_loss': val_metrics}

    def configure_optimizers(self):
        optimizer = eval(self.cfg['optimizer']['name'])(
            self.parameters(), **self.cfg['optimizer']['params']
        )
        scheduler = eval(self.cfg['scheduler']['name'])(
            optimizer,
            **self.cfg['scheduler']['params']
        )
        return [optimizer], [scheduler]
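
As an aside: since the loss, optimizer, and scheduler are resolved with eval(), the config must hold fully qualified names as strings. A hypothetical config that would satisfy these lookups (illustrative values only, not from the original post):

# Hypothetical config matching the eval() lookups above.
config = {
    'seed': 42,
    'n_splits': 5,
    'epoch': 20,
    'trainer': {},  # extra pl.Trainer kwargs
    'loss': 'nn.BCEWithLogitsLoss',  # resolved via eval()
    'model': {'name': 'swin_tiny_patch4_window7_224', 'output_dim': 1},
    'optimizer': {'name': 'torch.optim.AdamW', 'params': {'lr': 1e-5}},
    'scheduler': {
        'name': 'torch.optim.lr_scheduler.CosineAnnealingWarmRestarts',
        'params': {'T_0': 20, 'eta_min': 1e-6},
    },
}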
skf = StratifiedKFold(
    n_splits=config['n_splits'], shuffle=True, random_state=config['seed']
)

for fold, (train_idx, val_idx) in enumerate(skf.split(df["Id"], df["Pawpularity"])):
    train_df = df.loc[train_idx].reset_index(drop=True)
    val_df = df.loc[val_idx].reset_index(drop=True)
    datamodule = PetfinderDataModule(train_df, val_df, test_df, config)
    model = Model(config)
    early_stopping = EarlyStopping(monitor="val_loss")
    lr_monitor = callbacks.LearningRateMonitor()
    loss_checkpoint = callbacks.ModelCheckpoint(
        #dirpath = '/content/kaggle/',  # added
        filename="best_loss",
        monitor="val_loss",
        save_top_k=1,
        mode="min",
        save_last=False,
    )
    logger = TensorBoardLogger(config['model']['name'])
    
    trainer = pl.Trainer(
        accelerator='cuda',
        logger=logger,
        max_epochs=config['epoch'],
        callbacks=[lr_monitor, loss_checkpoint, early_stopping],
        max_steps=100,
        **config['trainer']
    )
    trainer.fit(model, datamodule=datamodule)

RuntimeError                              Traceback (most recent call last)
in <cell line: 5>()
     28         **config['trainer']
     29     )
---> 30     trainer.fit(model, datamodule=datamodule)

10 frames
/usr/local/lib/python3.9/dist-packages/lightning/pytorch/callbacks/early_stopping.py in _validate_condition_metric(self, logs)
    148         if monitor_val is None:
    149             if self.strict:
--> 150                 raise RuntimeError(error_msg)
    151             if self.verbose > 0:
    152                 rank_zero_warn(error_msg, category=RuntimeWarning)

RuntimeError: Early stopping conditioned on metric `val_loss` which is not available. Pass in or modify your `EarlyStopping` callback to use any of the following: `train_loss`


I've tried everything I could think of, from Googling to ChatGPT, but I keep getting the same error. Where could I have gone wrong?

Hey,
You need to log the metric inside validation_step, and either use one of our metrics from torchmetrics or our default mean accumulation (self.log(..., on_epoch=True)).

Otherwise, the EarlyStopping check runs before on_validation_epoch_end has had a chance to log anything.
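
For example, a minimal sketch of that fix for your validation_step (illustrative, not your exact code; it logs the per-batch loss and lets Lightning mean-accumulate it over the epoch):

    # Sketch of the fix: log inside validation_step so "val_loss"
    # exists when the EarlyStopping callback checks for it.
    def validation_step(self, batch, batch_idx):
        images, labels = batch
        labels = labels.float() / 100.0
        images = self.transform['val'](images)

        logits = self.forward(images).squeeze(1)
        loss = self._criterion(logits, labels)

        # on_epoch=True makes Lightning average this over the epoch,
        # so the metric is available by the time EarlyStopping runs.
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

Alternatively, keep an epoch-level metric with torchmetrics (e.g. torchmetrics.MeanSquaredError(squared=False) for RMSE), update it in validation_step, and log the metric object the same way.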

Cheers,
Justus