Model training stops at the first epoch (epoch 0)

I’m training models using PyTorch Lightning. I built a loop to train one model at a time, but only the first model trains for many epochs — the other two always stop after the first epoch (epoch 0). This is very strange, since my early-stopping patience is 3 epochs.

import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torch import nn
from torch.optim import Adam
from torchmetrics.functional import accuracy
from torchvision import models


def create_model(architecture):
    """Return a pretrained torchvision backbone with its classifier head
    replaced by a fresh 2-class linear layer.

    Args:
        architecture: one of "efficientnet_b0", "mobilenet_v2",
            "mobilenet_v3_large".

    Raises:
        ValueError: if *architecture* is not a supported name.
    """
    if architecture == "efficientnet_b0":
        net = models.efficientnet_b0(weights="DEFAULT")
        in_feats = net.classifier[1].in_features
        net.classifier[1] = nn.Linear(in_feats, 2)
        return net
    if architecture == "mobilenet_v2":
        net = models.mobilenet_v2(weights="DEFAULT")
        net.classifier[1] = nn.Linear(net.last_channel, 2)
        return net
    if architecture == "mobilenet_v3_large":
        net = models.mobilenet_v3_large(weights="DEFAULT")
        # MobileNetV3-Large's final classifier layer takes 1280 features.
        net.classifier[3] = nn.Linear(1280, 2)
        return net
    raise ValueError(f"Unknown architecture: {architecture}")


class BaseModel(pl.LightningModule):
    """LightningModule wrapping a torchvision backbone for 2-class classification.

    The backbone is selected by name via ``create_model`` and the name is
    persisted in checkpoints so the exact architecture can be rebuilt on load.
    """

    def __init__(self, architecture):
        super().__init__()
        self.architecture = architecture
        self.model = create_model(architecture)
        self.criterion = nn.CrossEntropyLoss()

    def on_save_checkpoint(self, checkpoint):
        # Record the backbone name so load_from_checkpoint can reconstruct it.
        checkpoint["architecture"] = self.architecture

    @classmethod
    def load_from_checkpoint(cls, checkpoint_path):
        """Rebuild a BaseModel from a checkpoint written by this class."""
        ckpt = torch.load(checkpoint_path)
        instance = cls(ckpt["architecture"])
        instance.load_state_dict(ckpt["state_dict"])
        return instance

    def forward(self, x):
        return self.model(x)

    def _shared_step(self, batch, stage):
        # Common forward / loss / accuracy logic for all three step hooks.
        inputs, labels = batch
        logits = self(inputs)
        loss = self.criterion(logits, labels)
        preds = logits.argmax(dim=1)
        acc = accuracy(preds, labels, task="binary")
        self.log(f"{stage}_loss", loss)
        self.log(f"{stage}_acc", acc, on_step=True, on_epoch=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, "val")

    def test_step(self, batch, batch_idx):
        return self._shared_step(batch, "test")

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.001)


# NOTE(review): EarlyStopping is stateful — it keeps `wait_count` and
# `best_score` across `trainer.fit()` calls. Sharing this single module-level
# instance between several Trainers means every model after the first inherits
# the previous run's stopping state and halts almost immediately (the reported
# "stops at epoch 0" symptom). Construct a fresh EarlyStopping per Trainer
# instead of reusing this one.
early_stop_callback = EarlyStopping(
    monitor="val_loss", patience=3, verbose=False, mode="min"
)


def train_model(train_loader, val_loader, test_loader, num_epochs, device):
    """Train, validate, and test each candidate architecture with its own Trainer.

    Args:
        train_loader: DataLoader for the training split.
        val_loader: DataLoader for the validation split (drives early stopping).
        test_loader: DataLoader for the test split.
        num_epochs: upper bound on training epochs; early stopping may end sooner.
        device: torch device the model is moved to before fitting.
    """
    # Renamed from `models` to avoid shadowing the `torchvision.models` import.
    candidates = [
        BaseModel("efficientnet_b0"),
        BaseModel("mobilenet_v2"),
        BaseModel("mobilenet_v3_large"),
    ]

    for model in candidates:
        model = model.to(device)
        logger = TensorBoardLogger("lightning_logs", name=model.architecture)
        # BUG FIX: build a fresh EarlyStopping for every run. The callback keeps
        # `wait_count` and `best_score` between fit() calls, so reusing one shared
        # instance made every model after the first stop at epoch 0.
        early_stopping = EarlyStopping(
            monitor="val_loss", patience=3, verbose=False, mode="min"
        )
        trainer = pl.Trainer(
            max_epochs=num_epochs,
            callbacks=[early_stopping],
            accelerator="gpu",
            devices=1,
            logger=logger,
        )
        trainer.fit(model, train_loader, val_loader)
        trainer.test(model, test_loader)

ERROR:

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: lightning_logs\efficientnet_b0
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | EfficientNet     | 4.0 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
4.0 M     Trainable params
0         Non-trainable params
4.0 M     Total params
16.040    Total estimated model params size (MB)
Sanity Checking: |                                                                                                                                   | 0/? [00:00<?, ?it/s]C:\Users\felipe\.conda\envs\pytorch\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
C:\Users\felipe\.conda\envs\pytorch\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
Epoch 7: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 463/463 [04:14<00:00,  1.82it/s, v_num=0] 
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]                                                                                                                                   
C:\Users\felipe\.conda\envs\pytorch\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
Testing DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:58<00:00,  1.71it/s] 
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃        Test metric        ┃       DataLoader 0        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│      test_acc_epoch       │    0.9996848702430725     │
│         test_loss         │   0.0006146501400507987   │
└───────────────────────────┴───────────────────────────┘
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: lightning_logs\mobilenet_v2
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | MobileNetV2      | 2.2 M
1 | criterion | CrossEntropyLoss | 0
-----------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.906     Total estimated model params size (MB)
Epoch 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 463/463 [04:18<00:00,  1.79it/s, v_num=0] 
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]                                                                                                                                   
Testing DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:46<00:00,  2.16it/s] 
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃        Test metric        ┃       DataLoader 0        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│      test_acc_epoch       │     0.994957447052002     │
│         test_loss         │    0.0266144797205925     │
└───────────────────────────┴───────────────────────────┘
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: lightning_logs\mobilenet_v3_large
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | MobileNetV3      | 4.2 M
1 | criterion | CrossEntropyLoss | 0
-----------------------------------------------
4.2 M     Trainable params
0         Non-trainable params
4.2 M     Total params
16.818    Total estimated model params size (MB)
Epoch 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 463/463 [04:15<00:00,  1.81it/s, v_num=0] 
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Testing DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:49<00:00,  2.01it/s]
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃        Test metric        ┃       DataLoader 0        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│      test_acc_epoch       │     0.946107804775238     │
│         test_loss         │    0.31665417551994324    │
└───────────────────────────┴───────────────────────────┘

I tried running each model one at a time manually and that worked correctly, but I would like to automate the process. I expected all three models to train normally.