I have train, validation and test DataLoader objects.
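They are built roughly like this with PyTorch Geometric (the dataset, split sizes and root path shown here are placeholders for my actual data; the batch size matches the value I pass to self.log below):

    # Rough sketch of the loader setup; TUDataset/MUTAG and the split are placeholders
    from torch_geometric.datasets import TUDataset
    from torch_geometric.loader import DataLoader

    dataset = TUDataset(root='data/', name='MUTAG')  # placeholder for my real dataset
    train_dataset, val_dataset, test_dataset = dataset[:100], dataset[100:150], dataset[150:]

    graph_train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    graph_val_loader = DataLoader(val_dataset, batch_size=64)
    graph_test_loader = DataLoader(test_dataset, batch_size=64)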
I wrote this LightningModule:
import pytorch_lightning as pl
import torch.nn as nn
import torch.optim as optim
from torchmetrics.classification import BinaryF1Score, BinaryPrecision, BinaryRecall

class GraphLevelGNN(pl.LightningModule):
    """
    See https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html;
    below is a standard setup for a PyTorch Lightning module.
    """
    def __init__(self, **model_kwargs):
        super().__init__()
        # Save hyperparameters so they are available via self.hparams
        self.save_hyperparameters()
        self.model = GraphGNNModel(**model_kwargs)
        self.loss_module = nn.BCEWithLogitsLoss() if self.hparams.c_out == 1 else nn.CrossEntropyLoss()
        self.optimizer_name = model_kwargs['optimizer_name']
        self.learning_rate = model_kwargs['learning_rate']

    def forward(self, data, mode="train"):  # mode is currently unused inside forward
        x, edge_index, batch_idx = data.x, data.edge_index, data.batch
        x = self.model(x, edge_index, batch_idx)
        x = x.squeeze(dim=-1)
        if self.hparams.c_out == 1:
            preds = (x > 0).float()
            data.y = data.y.float()
        else:
            preds = x.argmax(dim=-1)
        loss = self.loss_module(x, data.y)  # y is already float in the binary case; keep int labels for CrossEntropyLoss
        acc = (preds == data.y).sum().float() / preds.shape[0]
        data.y = data.y.int()
        preds = preds.int()
        # TorchMetrics binary metrics; swap for the multiclass variants (with num_classes) if c_out > 1
        f1 = BinaryF1Score().to(self.device)
        f1_score = f1(preds, data.y)
        precision = BinaryPrecision().to(self.device)
        precision_score = precision(preds, data.y)
        recall = BinaryRecall().to(self.device)
        recall_score = recall(preds, data.y)
        return loss, acc, f1_score, precision_score, recall_score, preds

    def configure_optimizers(self):
        optimizer = optim.SGD(self.parameters(), lr=self.learning_rate)
        return optimizer

    def training_step(self, batch, batch_idx):
        loss, acc, _, _, _, _ = self.forward(batch, mode="train")
        self.log('train_loss', loss, on_epoch=True, logger=True, batch_size=64)
        self.log('train_acc', acc, on_epoch=True, logger=True, batch_size=64)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, acc, _, _, _, _ = self.forward(batch, mode="val")
        self.log('val_acc', acc, on_epoch=True, logger=True, batch_size=64)
        self.log('val_loss', loss, on_epoch=True, logger=True, batch_size=64)

    def test_step(self, batch, batch_idx):
        loss, acc, f1, precision, recall, preds = self.forward(batch, mode="test")
        self.log('test_acc', acc, on_epoch=True, logger=True, batch_size=64)
        self.log('test_f1', f1, on_epoch=True, logger=True, batch_size=64)
        self.log('test_precision', precision, on_epoch=True, logger=True, batch_size=64)
        self.log('test_recall', recall, on_epoch=True, logger=True, batch_size=64)
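This is roughly how I train it; the keyword arguments other than c_out, optimizer_name and learning_rate, plus the max_epochs value, are illustrative placeholders rather than my exact configuration:

    # Illustrative training setup; c_in/c_hidden and max_epochs are placeholders
    model = GraphLevelGNN(c_in=dataset.num_node_features,  # assumed kwargs for GraphGNNModel
                          c_hidden=64,
                          c_out=1,
                          optimizer_name='SGD',
                          learning_rate=0.01)

    trainer = pl.Trainer(max_epochs=50)  # add accelerator='gpu', devices=1 if a GPU is available
    trainer.fit(model, graph_train_loader, graph_val_loader)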
To calculate validation and test metrics, I wrote this:
def evaluate_model(model, graph_test_loader, graph_val_loader, output_file='/home/output.txt'):
    """
    Aim: Return validation and test metrics for the best model returned by Ray Tune.
    Input: Validation and test loaders and the model.
    Output: File with metrics for the validation and test sets.
    How is this different from parse_logger_file()? This function is not per epoch,
    and it also incorporates the test set.
    """
    model.eval()  # switch to evaluation mode, e.g. turning off dropout, batch-norm updates, etc.
    trainer = pl.Trainer()  # (accelerator='gpu', devices=-1)
    test_result = trainer.test(model, graph_test_loader, verbose=False)[0]  # [0] because it returns the dict in a list
    validation_result = trainer.test(model, graph_val_loader, verbose=False)[0]  # note: test() reports these under the test_* keys
    with open(output_file, 'a') as out:
        out.write('validation results' + '\n')
        for key in validation_result:
            out.write(key + '\t' + str(validation_result[key]) + '\n')
        out.write('******' + '\n')
        out.write('test results' + '\n')
        for key in test_result:
            out.write(key + '\t' + str(test_result[key]) + '\n')
    return
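I call it on the best model like this (the checkpoint path is just an example):

    # Example call; the checkpoint path is a placeholder
    best_model = GraphLevelGNN.load_from_checkpoint('checkpoints/best.ckpt')
    evaluate_model(best_model, graph_test_loader, graph_val_loader, output_file='/home/output.txt')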
My output shows high metrics for the validation set (precision, recall, etc. > 0.89) but very low metrics for the test set (< 0.5), and this happens across multiple different data sets.
I'm wondering whether I have implemented the calculation of validation and test metrics incorrectly. Could someone show me how to alter the evaluate_model function so that it correctly returns overall precision, recall and accuracy for the validation and test sets?