`val_loss` is not being recognized in my code:
def training_step(self, batch, batch_idx):
    """Run one training batch, log the training metrics and return the loss.

    NOTE(review): label extraction, tower-input construction, the loss
    computation and the probability computation are elided in this excerpt.
    ``loss``, ``predicted_prob`` and ``batch_labels`` are expected to be
    produced by those omitted steps.

    Args:
        batch: the batch handed over by the training dataloader.
        batch_idx: index of the batch within the current epoch.

    Returns:
        The ``loss`` value produced by the (elided) loss calculation.
    """
    # labels
    ## tower inputs
    kwargs1 = dict(anchor=[], engaged=[])

    # model forward pass
    q_emb, p_emb = self(**kwargs1)

    # loss calculation
    # get predictions: threshold the probabilities at 0.5 into 0/1 labels
    hard_preds = torch.where(predicted_prob >= 0.5, 1, 0)

    # metrics calculation
    acc = self.accuracy(hard_preds, batch_labels)
    f1 = self.f1score(hard_preds, batch_labels)
    prec = self.precision(hard_preds, batch_labels)
    rec = self.recall(hard_preds, batch_labels)

    # Loss and F1 go to the progress bar; the other metrics are logged
    # per epoch only (on_step=False).
    self.log('train_loss', loss, on_epoch=True, prog_bar=True)
    self.log('train_f1score', f1, on_epoch=True, prog_bar=True)
    self.log_dict(
        {
            'train_accuracy': acc,
            'train_precision': prec,
            'train_recall': rec,
        },
        on_step=False,
        on_epoch=True,
    )
    return loss
def validation_step(self, batch, batch_idx):
    """Run one validation batch and log the validation loss and metrics.

    NOTE(review): label extraction, the batched tower inputs, the forward
    pass, the loss computation and the prediction step are elided in this
    excerpt. ``loss``, ``preds`` and ``batch_labels`` are expected to be
    produced by those omitted steps.

    Args:
        batch: the batch handed over by the validation dataloader.
        batch_idx: index of the batch within the validation run.
    """
    # labels
    ## siamese tower batched inputs
    # model forward pass
    # loss calculation
    # get predictions
    # metrics calculation
    acc = self.accuracy(preds, batch_labels)
    f1 = self.f1score(preds, batch_labels)
    prec = self.precision(preds, batch_labels)
    rec = self.recall(preds, batch_labels)

    val_metrics = dict(
        val_loss=loss,
        val_accuracy=acc,
        val_f1score=f1,
        val_precision=prec,
        val_recall=rec,
    )
    # sync_dist=True asks Lightning to reduce the values across devices.
    self.log_dict(val_metrics, on_epoch=True, sync_dist=True)
def configure_optimizers(self):
    """Build the Adam optimizer and its step-wise learning-rate schedule.

    Returns:
        A pair of one-element lists: ``[optimizer], [scheduler]``. The
        optimizer is Adam (lr=1e-3) over ``self.trainer.model.parameters()``;
        the scheduler is a ``StepLR`` with step size 8 and gamma 0.1.
    """
    adam = torch.optim.Adam(self.trainer.model.parameters(), lr=1e-3)
    step_schedule = torch.optim.lr_scheduler.StepLR(
        adam,
        step_size=8,
        gamma=0.1,
        last_epoch=-1,
    )
    return [adam], [step_schedule]
def main(model_save_dir, log_dir, **kwargs):
    """Build the data module, model, callbacks and trainer, then run training.

    Args:
        model_save_dir: directory the checkpoint callback writes into.
        log_dir: directory used for the TensorBoard logger and as the
            trainer's default root directory.
        **kwargs: forwarded unchanged to both ``ItemsageDataModule`` and
            ``ItemSageModel``.

    NOTE(review): ``model_args`` is not defined inside this function; it is
    presumably a module-level dict -- confirm it is in scope.
    """
    # init data
    data_module = ItemsageDataModule(**kwargs)
    # init model
    model = ItemSageModel(**kwargs)
    # model = torch.compile(model)

    # define callbacks
    checkpoint_callback = ModelCheckpoint(
        dirpath=model_save_dir,
        filename='model-{epoch:02d}',
        save_top_k=-1,  # Save all checkpoints
        verbose=True,
    )
    early_stop_callback = EarlyStopping(
        monitor='val_loss',  # The metric to monitor (validation loss)
        mode='min',          # Lower is better
        patience=5,          # Checks with no improvement before stopping
        verbose=True,        # Print early stopping updates
        # Evaluate the stopping criterion only after a validation run, when
        # `val_loss` has actually been logged. Checking at the end of the
        # training epoch raises "metric `val_loss` which is not available"
        # whenever that point is reached before validation has produced it.
        check_on_train_epoch_end=False,
    )
    logger = TensorBoardLogger(save_dir=log_dir, version=1, name="lightning_logs")
    callbacks = [checkpoint_callback, early_stop_callback]

    # init trainer
    # NOTE(review): `max_steps=1000` caps training regardless of `max_epochs`;
    # confirm that an epoch is not cut short before validation can run.
    # NOTE(review): `deterministic=True` and `benchmark=True` are requested
    # together; cudnn benchmark mode can defeat determinism -- pick one.
    trainer = Trainer(
        callbacks=callbacks,
        max_epochs=model_args['n_epochs'],
        max_steps=1000,
        # num_sanity_val_steps = 0,
        accelerator="gpu",
        devices=4,
        # num_nodes = -1,
        strategy="deepspeed",
        deterministic=True,
        # precision='16-mixed',
        default_root_dir=log_dir,
        reload_dataloaders_every_n_epochs=1,
        benchmark=True,
        # use_distributed_sampler=True,
        enable_progress_bar=True,
        enable_model_summary=True,
        check_val_every_n_epoch=1,
        # precision='32-mixed',
        logger=logger,
    )

    # start training
    trainer.fit(model=model, datamodule=data_module)
Am I missing something here? Does `validation_step` work differently under the DeepSpeed strategy?
Here is the error raised at the end of the first epoch:
RuntimeError: Early stopping conditioned on metric `val_loss` which is
not available. Pass in or modify your `EarlyStopping` callback to use any
of the following:
`train_loss`, `train_loss_step`, `train_f1score`, `train_f1score_step`,
`train_loss_epoch`, `train_f1score_epoch`, `train_accuracy`,
`train_precision`, `train_recall`