Hi there, I’m attempting to reproduce a deep convolutional model with PyTorch Lightning for an experiment, but I’ve run into a problem during training.
Environment
pytorch-lightning=1.9.4
pytorch=1.13.1
torchmetrics=0.11.3
Problem
Note the vertical axis: despite the zigzag, the loss just oscillates around 5.48 in a tiny range from the first epoch, and the accuracy doesn’t improve across epochs (24 epochs finished so far).
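For reference, 5.48 is almost exactly the cross-entropy of a uniform prediction over my 240 classes, i.e. chance level:

import math
import torch
import torch.nn.functional as F

# Loss of a uniform prediction over 240 classes
print(math.log(240))  # 5.4806...

# Equivalently: cross-entropy on all-equal logits (softmax is uniform)
logits = torch.zeros(1, 240)
print(F.cross_entropy(logits, torch.tensor([0])))  # tensor(5.4806)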
Here’s my model definition and training code (input 113×113×3, batch size 128):
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as plight
import torchmetrics as metrics
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor

class HalfDeepWriter(plight.LightningModule):
    def __init__(self, num_classes):
        super().__init__()
        self.num_classes = num_classes
        # Convolutional feature extractor
        self.deepwriter = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=5, stride=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=3, padding=1, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1, stride=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1, stride=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.flatten = nn.Flatten()
        self.fc1 = nn.Sequential(
            nn.Linear(256 * 6 * 6, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
        )
        self.fc2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
        )
        self.classifier = nn.Linear(1024, num_classes)
        # self.softmax = nn.Softmax(1)
        self.criterion = nn.CrossEntropyLoss()
        self.train_acc = metrics.Accuracy(task="multiclass", num_classes=self.num_classes)
        self.val_acc = metrics.Accuracy(task="multiclass", num_classes=self.num_classes)

    def forward(self, x):
        out = self.deepwriter(x)
        out = self.flatten(out)
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.classifier(out)
        out = F.softmax(out, dim=1)
        return out

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat, y)
        self.train_acc(y_hat, y)
        self.log("train_loss", loss)
        self.log("train_acc", self.train_acc)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.criterion(y_hat, y)
        self.val_acc(y_hat, y)
        self.log("val_acc", self.val_acc)
        self.log("val_loss", loss)

    def configure_optimizers(self):
        optim = torch.optim.Adam(self.parameters(), lr=1e-3)
        # Decay LR by 10x every 100 epochs (StepLR expects an int step_size)
        sched = torch.optim.lr_scheduler.StepLR(optim, step_size=100, gamma=0.1)
        return [optim], [sched]
num_workers = 4
bsize = 128
data_module = PatchDataModule(data_dir, batch_size=bsize, num_workers=num_workers)
data_module.setup(stage="train")

max_ep = 3000
logger_pretrain = TensorBoardLogger("tboard-logs/deepwriter")
ckpt_callback_pretrain = ModelCheckpoint(
    "plight-ckpts/deepwriter",
    filename="deepwriter-{epoch}",
    every_n_epochs=10,
    save_last=True,
    save_top_k=2,
    monitor="val_loss",
)
deepwriter = HalfDeepWriter(num_classes=240)
lr_monitor = LearningRateMonitor(logging_interval="step", log_momentum=True)
trainer = plight.Trainer(
    logger=logger_pretrain,
    callbacks=[ckpt_callback_pretrain, lr_monitor],
    max_epochs=max_ep,
    accelerator="gpu",
    devices=1,
)
trainer.fit(deepwriter, datamodule=data_module)
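To rule out a shape mismatch at the 256*6*6 flatten, I traced the feature-map sizes (113 → 55 → 27 → 13 → 6), and a quick sanity check confirms the flatten size is right:

# Dummy batch through the conv stack only
model = HalfDeepWriter(num_classes=240)
with torch.no_grad():
    feats = model.deepwriter(torch.randn(2, 3, 113, 113))
print(feats.shape)  # torch.Size([2, 256, 6, 6]) -- matches nn.Linear(256*6*6, 1024)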
The data module is built on a torchvision ImageFolder dataset.
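Roughly, it wraps ImageFolder with a resize-to-113 transform and a random train/val split (a trimmed sketch, not the exact code; the real transforms and split ratio may differ):

from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

class PatchDataModule(plight.LightningDataModule):
    def __init__(self, data_dir, batch_size, num_workers):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.transform = transforms.Compose([
            transforms.Resize((113, 113)),  # actual augmentations trimmed here
            transforms.ToTensor(),
        ])

    def setup(self, stage=None):
        full = datasets.ImageFolder(self.data_dir, transform=self.transform)
        n_val = len(full) // 10
        self.train_set, self.val_set = random_split(full, [len(full) - n_val, n_val])

    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size=self.batch_size,
                          shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_set, batch_size=self.batch_size,
                          num_workers=self.num_workers)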
I can’t figure out what’s wrong here. Am I missing something?