Unfortunately the dataset is proprietary, so I can't share it.
In essence (see the toy example below for the format):
- y (truth): a long list of 0's and 1's
- y_hat (predictions): a long list of 0's and 1's
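For illustration only, here is a made-up example of that format together with the sklearn scores on it (the values are invented, not from the real data):

import sklearn.metrics

y     = [1, 0, 0, 1, 1, 0, 0, 0, 1, 0]   # truth
y_hat = [1, 0, 1, 1, 0, 0, 0, 0, 1, 0]   # predictions

print(sklearn.metrics.accuracy_score(y, y_hat))   # 0.8
print(sklearn.metrics.precision_score(y, y_hat))  # 0.75
print(sklearn.metrics.recall_score(y, y_hat))     # 0.75
print(sklearn.metrics.f1_score(y, y_hat))         # 0.75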
Here is the code:
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
import sklearn.metrics
from torch.utils.data import DataLoader

# TorchDataset, train_data, test_dataset, mlf_logger, tb_logger and RUN_ID
# are defined elsewhere (proprietary data / MLflow setup)

class Model(pl.LightningModule):
    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        # Layers
        self.fc1 = torch.nn.Linear(self.input_size, 10)
        self.fc2 = torch.nn.Linear(10, 2)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        out = self.relu(self.fc1(x))
        out = self.fc2(out)
        return out
class LitClassifier(pl.LightningModule):
    def __init__(self, train_data, model, batch_size=32, learning_rate=1e-3):
        super().__init__()
        self.train_data = train_data
        self.model = model
        self.batch_size = batch_size        # must have for batch_size tuning
        self.learning_rate = learning_rate  # must have for lr tuning
        # Metrics
        self.train_accuracy = pl.metrics.Accuracy()
        self.test_accuracy = pl.metrics.Accuracy()
        self.test_F1 = pl.metrics.F1()
        self.test_precision = pl.metrics.Precision()
        self.test_recall = pl.metrics.Recall()
        # Test truth and predictions
        self.test_y = []
        self.test_y_hat = []
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
    def training_step(self, train_batch, batch_idx):
        X_batch = train_batch[0]
        y_batch = train_batch[1]
        y_hat_batch = self.model(X_batch)
        loss = F.cross_entropy(y_hat_batch, y_batch)  # CrossEntropyLoss
        # Logs
        self.log('training_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_accuracy', self.train_accuracy(y_hat_batch, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss
    def test_step(self, test_batch, batch_idx):
        X_batch = test_batch[0]
        y_batch = test_batch[1]
        y_hat_batch = self.model(X_batch)
        _, y_hat_batch_tags = torch.max(y_hat_batch, dim=-1)
        # Metric logs (DON'T GIVE ACCURATE VALUES)
        self.log('test_accuracy_1', self.test_accuracy(y_hat_batch_tags, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('test_precision_1', self.test_precision(y_hat_batch_tags, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('test_recall_1', self.test_recall(y_hat_batch_tags, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('test_F1_1', self.test_F1(y_hat_batch_tags, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)
        # Save test truth and predictions
        self.test_y.extend(y_batch.cpu().numpy().tolist())
        self.test_y_hat.extend(y_hat_batch_tags.cpu().numpy().tolist())
        return
    #######################################
    # Dataset and Dataloader (multi-gpu)
    #######################################
    def setup(self, stage):
        if stage == 'fit':
            # Convert training data to a TorchDataset
            self.train_dataset = TorchDataset(self.train_data.X, self.train_data.Y[:, 0])

    def train_dataloader(self):
        # For this experiment, train_dataloader is re-run on every epoch.
        # It yields all targets plus the same number of randomly sampled non-targets from train_data.
        sampled_X, sampled_Y = self.train_data.getSample(material_index=0)
        sampled_train_dataset = TorchDataset(sampled_X, sampled_Y[:, 0])
        train_dataloader = DataLoader(sampled_train_dataset, batch_size=self.batch_size, num_workers=24)
        return train_dataloader
class MyCallbacks(pl.Callback):
    # After the test loop is finished
    def on_test_end(self, trainer, pl_module):
        # Log metrics to MLflow
        mlf_logger.experiment.log_metric(RUN_ID, key="test_accuracy_2",
                                         value=sklearn.metrics.accuracy_score(pl_module.test_y, pl_module.test_y_hat))
        mlf_logger.experiment.log_metric(RUN_ID, key="test_precision_2",
                                         value=sklearn.metrics.precision_score(pl_module.test_y, pl_module.test_y_hat))
        mlf_logger.experiment.log_metric(RUN_ID, key="test_recall_2",
                                         value=sklearn.metrics.recall_score(pl_module.test_y, pl_module.test_y_hat))
        mlf_logger.experiment.log_metric(RUN_ID, key="test_f1_2",
                                         value=sklearn.metrics.f1_score(pl_module.test_y, pl_module.test_y_hat))
# TRAIN
EPOCHS = 100
BATCH_SIZE = 50

model = Model(input_size=train_data.spectra_length)
classifier = LitClassifier(train_data, model, batch_size=BATCH_SIZE)

trainer = pl.Trainer(gpus=1,
                     max_epochs=EPOCHS,
                     auto_lr_find=False,
                     reload_dataloaders_every_epoch=True,
                     logger=[mlf_logger, tb_logger],
                     callbacks=[MyCallbacks()])
trainer.fit(classifier)
# TEST
BATCH_SIZE = 4096
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=24)
trainer.test(classifier,test_dataloader)
The PyTorch Lightning metrics logged in test_step (I played around with different parameters, as stated in the OP) are different from the metrics logged in the on_test_end callback (computed with sklearn).
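For reference, here is a small, dataset-independent sketch of the two reduction paths I think are being compared: averaging a per-batch metric value over batches (roughly what I understand happens to the per-batch values when logging with on_step=False, on_epoch=True) versus computing the metric once over all accumulated predictions, as the on_test_end callback does with sklearn. The batch contents below are made up purely to show that the two reductions need not agree:

import numpy as np
import sklearn.metrics

# Two toy (y, y_hat) batches with different class balance
batches = [
    (np.array([1, 1, 1, 1]), np.array([1, 1, 1, 0])),
    (np.array([0, 0, 0, 1]), np.array([1, 0, 0, 1])),
]

# Path 1: compute the metric per batch, then average the batch values
per_batch = [sklearn.metrics.precision_score(y, y_hat) for y, y_hat in batches]
print(np.mean(per_batch))                                  # 0.75

# Path 2: accumulate all predictions, compute the metric once at the end
y_all = np.concatenate([y for y, _ in batches])
y_hat_all = np.concatenate([y_hat for _, y_hat in batches])
print(sklearn.metrics.precision_score(y_all, y_hat_all))   # 0.8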