RuntimeError: Early stopping conditioned on metric `val_loss` which is not available

shreyas301197 · July 18, 2023, 8:06am

val_loss is not being recognized in my code -

def training_step(self, batch,batch_idx):
        """Run one training batch: forward pass, metric computation, logging.

        NOTE(review): the label extraction, tower-input construction, loss
        computation and probability computation were removed from this snippet,
        so `batch_labels`, `loss` and `predicted_prob` are not defined in the
        code shown here. They must be produced by the elided sections.

        Logs `train_loss` and `train_f1score` (also shown in the progress bar)
        and, at epoch level only, `train_accuracy`, `train_precision` and
        `train_recall`.

        Returns:
            The `loss` value computed in the elided loss section.
        """
        # Keyword arguments for the forward call; both entries are empty lists
        # in this snippet (the code that fills them was elided).
        kwargs1 = {'anchor':[],'engaged':[]}
        
        #labels
       
        

        ## tower inputs
        
        
        
        # model forward pass
        q_emb,p_emb = self(**kwargs1)
        
        # loss calculation (elided in the snippet; expected to define `loss`)

        # get predictions
        # Threshold at 0.5: 1 where predicted_prob >= 0.5, otherwise 0.
        preds = torch.where (predicted_prob >=0.5,1,0)
        
        # metrics_calculation
        batch_acc = self.accuracy(preds,batch_labels)
        batch_f1 = self.f1score(preds,batch_labels)
        batch_prec = self.precision(preds,batch_labels)
        batch_recall = self.recall(preds,batch_labels)
        log_dict = {
            'train_accuracy' : batch_acc,
            'train_precision' : batch_prec,
            'train_recall' : batch_recall
        }
        
        # on_epoch=True is requested without touching on_step; the error message
        # quoted later in the thread lists `_step` and `_epoch` variants for
        # exactly these two keys.
        self.log('train_loss',loss,on_epoch = True,prog_bar=True)
        self.log('train_f1score',batch_f1,on_epoch = True,prog_bar=True)
        
        # Remaining metrics are logged per epoch only (on_step=False).
        self.log_dict(log_dict,on_step=False, on_epoch = True)
        
        return loss
    
    def validation_step(self, batch,batch_idx):
        """Compute and log validation metrics for one batch.

        NOTE(review): the forward pass, loss computation and prediction code
        were removed from this snippet, so `preds`, `batch_labels` and `loss`
        are not defined in the code shown here.

        Logs `val_loss`, `val_accuracy`, `val_f1score`, `val_precision` and
        `val_recall` through a single `log_dict` call. `val_loss` is the key
        the `EarlyStopping` callback in `main` monitors. Nothing is returned.
        """
     
        
        #labels
        
        ##siamese tower batched inputs
       
        # model forward pass
        
        # loss calculation
        
            
        # get predictions
        
        
        # metrics_calculation
        batch_acc = self.accuracy(preds,batch_labels)
        batch_f1 = self.f1score(preds,batch_labels)
        batch_prec = self.precision(preds,batch_labels)
        batch_recall = self.recall(preds,batch_labels)
        log_dict = {
            'val_loss': loss,
            'val_accuracy' : batch_acc,
            'val_f1score' : batch_f1,
            'val_precision' : batch_prec,
            'val_recall' : batch_recall
        }
        # Epoch-level logging with sync_dist=True requested for every key.
        self.log_dict(log_dict,on_epoch = True, sync_dist=True)
    
    def configure_optimizers(self):
        """Create the Adam optimizer and its step-decay learning-rate schedule.

        The parameters are taken from ``self.trainer.model`` rather than from
        ``self`` directly.

        Returns:
            A pair ``([optimizer], [scheduler])``: Adam with ``lr=1e-3`` and a
            ``StepLR`` that multiplies the learning rate by 0.1 every 8 steps
            of the scheduler.
        """
        trainable_params = self.trainer.model.parameters()
        adam = torch.optim.Adam(trainable_params, lr=1e-3)
        step_decay = torch.optim.lr_scheduler.StepLR(
            adam,
            step_size=8,
            gamma=0.1,
            last_epoch=-1,
        )
        return [adam], [step_decay]

def main(model_save_dir, log_dir, **kwargs):
    """Build the data module, model, callbacks and trainer, then run training.

    Args:
        model_save_dir: Directory handed to ``ModelCheckpoint`` as ``dirpath``.
        log_dir: Directory used both as the TensorBoard ``save_dir`` and as the
            trainer's ``default_root_dir``.
        **kwargs: Forwarded unchanged to ``ItemsageDataModule`` and to
            ``ItemSageModel``.

    Note:
        ``model_args`` is not a parameter of this function; it is read from the
        enclosing scope (presumably module level; confirm against the caller).
        Options the author left disabled: ``torch.compile(model)``,
        ``num_sanity_val_steps = 0``, ``num_nodes = -1``,
        ``precision='16-mixed'`` / ``'32-mixed'`` and
        ``use_distributed_sampler=True``.
    """
    # Data and model share the same keyword arguments.
    datamodule = ItemsageDataModule(**kwargs)
    network = ItemSageModel(**kwargs)

    # Checkpointing: save_top_k=-1 keeps every checkpoint.
    save_every_epoch = ModelCheckpoint(
        dirpath=model_save_dir,
        filename='model-{epoch:02d}',
        save_top_k=-1,
        verbose=True,
    )

    # Early stopping: minimise 'val_loss', give up after 5 checks without
    # improvement, and print updates.
    stop_on_plateau = EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=5,
        verbose=True,
    )

    tb_logger = TensorBoardLogger(
        save_dir=log_dir,
        version=1,
        name="lightning_logs",
    )

    trainer_options = {
        'callbacks': [save_every_epoch, stop_on_plateau],
        'max_epochs': model_args['n_epochs'],
        'max_steps': 1000,
        'accelerator': "gpu",
        'devices': 4,
        'strategy': "deepspeed",
        'deterministic': True,
        'default_root_dir': log_dir,
        'reload_dataloaders_every_n_epochs': 1,
        'benchmark': True,
        'enable_progress_bar': True,
        'enable_model_summary': True,
        'check_val_every_n_epoch': 1,
        'logger': tb_logger,
    }
    trainer = Trainer(**trainer_options)

    # Start training.
    trainer.fit(model=network, datamodule=datamodule)

Am I missing something here? Does validation_step work differently under the DeepSpeed strategy?

Here is the error at the end of the first epoch -

RuntimeError: Early stopping conditioned on metric `val_loss` which is 
not available. Pass in or modify your `EarlyStopping` callback to use any 
of the following: 
`train_loss`, `train_loss_step`, `train_f1score`, `train_f1score_step`, 
`train_loss_epoch`, `train_f1score_epoch`, `train_accuracy`, 
`train_precision`, `train_recall`

awaelchli · July 24, 2023, 9:24am

@shreyas301197 Does this only happen with the DeepSpeed strategy? Which Lightning version is this?