I’m training models with PyTorch Lightning. I built a loop that trains one model at a time, but only the first model gets very far; the other two always stop after the first epoch. This is very strange, since my early stopping patience is 3 epochs.
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torch import nn
from torch.optim import Adam
from torchmetrics.functional import accuracy
from torchvision import models
def create_model(architecture):
    # build a pretrained torchvision backbone and replace the classifier head
    # with a 2-class output
    if architecture == "efficientnet_b0":
        model = models.efficientnet_b0(weights="DEFAULT")
        num_features = model.classifier[1].in_features
        model.classifier[1] = nn.Linear(num_features, 2)
    elif architecture == "mobilenet_v2":
        model = models.mobilenet_v2(weights="DEFAULT")
        model.classifier[1] = nn.Linear(model.last_channel, 2)
    elif architecture == "mobilenet_v3_large":
        model = models.mobilenet_v3_large(weights="DEFAULT")
        model.classifier[3] = nn.Linear(1280, 2)
    else:
        raise ValueError(f"Unknown architecture: {architecture}")
    return model
class BaseModel(pl.LightningModule):
    def __init__(self, architecture):
        super().__init__()
        self.model = create_model(architecture)
        self.architecture = architecture
        self.criterion = nn.CrossEntropyLoss()

    def on_save_checkpoint(self, checkpoint):
        # store the architecture name so the checkpoint can be rebuilt later
        checkpoint["architecture"] = self.architecture

    @classmethod
    def load_from_checkpoint(cls, checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model = cls(checkpoint["architecture"])
        model.load_state_dict(checkpoint["state_dict"])
        return model

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs)
        loss = self.criterion(outputs, labels)
        _, preds = torch.max(outputs, 1)
        acc = accuracy(preds, labels, task="binary")
        self.log("train_loss", loss)
        self.log("train_acc", acc, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs)
        loss = self.criterion(outputs, labels)
        _, preds = torch.max(outputs, 1)
        acc = accuracy(preds, labels, task="binary")
        self.log("val_loss", loss)
        self.log("val_acc", acc, on_step=True, on_epoch=True)
        return loss

    def test_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self(inputs)
        loss = self.criterion(outputs, labels)
        _, preds = torch.max(outputs, 1)
        acc = accuracy(preds, labels, task="binary")
        self.log("test_loss", loss)
        self.log("test_acc", acc, on_step=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.001)
# a single EarlyStopping instance, created once at module level
early_stop_callback = EarlyStopping(
    monitor="val_loss", patience=3, verbose=False, mode="min"
)
def train_model(train_loader, val_loader, test_loader, num_epochs, device):
    models = [
        BaseModel("efficientnet_b0"),
        BaseModel("mobilenet_v2"),
        BaseModel("mobilenet_v3_large"),
    ]
    for model in models:
        model = model.to(device)
        logger = TensorBoardLogger("lightning_logs", name=model.architecture)
        trainer = pl.Trainer(
            max_epochs=num_epochs,
            # the same early_stop_callback object is passed to every Trainer
            callbacks=[early_stop_callback],
            accelerator="gpu",
            devices=1,
            logger=logger,
        )
        trainer.fit(model, train_loader, val_loader)
        trainer.test(model, test_loader)
Console output:
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: lightning_logs\efficientnet_b0
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
-----------------------------------------------
0 | model | EfficientNet | 4.0 M
1 | criterion | CrossEntropyLoss | 0
-----------------------------------------------
4.0 M Trainable params
0 Non-trainable params
4.0 M Total params
16.040 Total estimated model params size (MB)
Sanity Checking: | | 0/? [00:00<?, ?it/s]C:\Users\felipe\.conda\envs\pytorch\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
C:\Users\felipe\.conda\envs\pytorch\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
Epoch 7: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 463/463 [04:14<00:00, 1.82it/s, v_num=0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
C:\Users\felipe\.conda\envs\pytorch\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
Testing DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:58<00:00, 1.71it/s]
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Test metric ┃ DataLoader 0 ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ test_acc_epoch │ 0.9996848702430725 │
│ test_loss │ 0.0006146501400507987 │
└───────────────────────────┴───────────────────────────┘
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: lightning_logs\mobilenet_v2
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
-----------------------------------------------
0 | model | MobileNetV2 | 2.2 M
1 | criterion | CrossEntropyLoss | 0
-----------------------------------------------
2.2 M Trainable params
0 Non-trainable params
2.2 M Total params
8.906 Total estimated model params size (MB)
Epoch 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 463/463 [04:18<00:00, 1.79it/s, v_num=0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Testing DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:46<00:00, 2.16it/s]
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Test metric ┃ DataLoader 0 ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ test_acc_epoch │ 0.994957447052002 │
│ test_loss │ 0.0266144797205925 │
└───────────────────────────┴───────────────────────────┘
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: lightning_logs\mobilenet_v3_large
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
-----------------------------------------------
0 | model | MobileNetV3 | 4.2 M
1 | criterion | CrossEntropyLoss | 0
-----------------------------------------------
4.2 M Trainable params
0 Non-trainable params
4.2 M Total params
16.818 Total estimated model params size (MB)
Epoch 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 463/463 [04:15<00:00, 1.81it/s, v_num=0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Testing DataLoader 0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:49<00:00, 2.01it/s]
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Test metric ┃ DataLoader 0 ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ test_acc_epoch │ 0.946107804775238 │
│ test_loss │ 0.31665417551994324 │
└───────────────────────────┴───────────────────────────┘
I tried running the models one at a time manually and each one trained correctly, but I would like to automate the process. I expected all three models to train normally inside the loop.
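For reference, this is roughly what my manual run of a single model looks like (a minimal sketch, assuming the same data loaders and num_epochs as above). The only difference I can see is that here I construct a fresh EarlyStopping for the run instead of reusing the module-level early_stop_callback:

model = BaseModel("mobilenet_v2")
# a new callback instance just for this run
early_stop = EarlyStopping(monitor="val_loss", patience=3, verbose=False, mode="min")
trainer = pl.Trainer(
    max_epochs=num_epochs,
    callbacks=[early_stop],
    accelerator="gpu",
    devices=1,
    logger=TensorBoardLogger("lightning_logs", name=model.architecture),
)
trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)
# after fit, the callback still holds state such as early_stop.wait_count and
# early_stop.best_score; I have not checked whether reusing one instance
# carries that state into the next trainer

Is it safe to reuse the same EarlyStopping instance across several Trainers, or does it keep state between runs?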