Why are my training and validation losses barely changing?

I’m trying to implement a 1D convolutional neural network in PyTorch Lightning for sequences of length 80 with 6 channels, so the input size is [# examples, 6, 80]. I have no idea what is causing my loss to stay flat: both the training and validation errors only fluctuate by about 1e-5, like noise, with no consistent direction. I have tried decreasing my learning rate by factors of 10, from 0.01 all the way down to 1e-6, and normalizing the inputs per channel (using the global training-set channel means and standard deviations), but it is still not working.
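
For reference, the channel normalization looks roughly like this (a minimal sketch; train_x and val_x are placeholder names for the full training and validation tensors of shape [# examples, 6, 80]):

# per-channel statistics computed over the whole training set
channel_mean = train_x.mean(dim=(0, 2), keepdim=True)  # shape [1, 6, 1]
channel_std = train_x.std(dim=(0, 2), keepdim=True)    # shape [1, 6, 1]

train_x = (train_x - channel_mean) / channel_std
# the same training-set statistics are reused for the validation data
val_x = (val_x - channel_mean) / channel_std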

Here is my code.

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torchmetrics.classification import MulticlassAccuracy

# AverageMeter is a small running-average helper (definition not shown here)
class LitConvNet(pl.LightningModule):
    def __init__(self): # define all the layers
        super().__init__()
        self.pool = nn.MaxPool1d(2)
        self.dropout = nn.Dropout(0.25)

        self.conv1 = nn.Conv1d(6,8,5, padding = "same") # 6 channels: accelerometer (x, y, z) plus the 3 gyroscope axes
        self.bn1 = nn.BatchNorm1d(8)
        
        self.conv2 = nn.Conv1d(8,16,5, padding = "same")
        self.bn2 = nn.BatchNorm1d(16)

        self.conv3 = nn.Conv1d(16,32,5, padding = "same")
        self.conv4 = nn.Conv1d(32,32,5, padding = "same")
        
        self.fc1 = nn.LazyLinear(16)
        self.fc2 = nn.Linear(16,2)

        self.crit = F.cross_entropy
        self.val_output = []
        self.running_loss = AverageMeter()

    def forward(self, x):
        
        # input size: [batch, 6, 80] (relu is the activation function)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x))) 
        x = self.pool(x)           
        x = F.relu(self.conv3(x))
        x = self.pool(x)           
        x = F.relu(self.conv4(x))       

        #flatten for input to linear layer
        
        x = torch.flatten(x, 1)
        
        x = F.relu(self.fc1(x))    
        x = self.fc2(x)
        return x

    def train_dataloader(self):
        return train_dl
    
    def training_step(self, batch, batch_idx):
        input, labels = batch
        preds = self(input)
        labels = labels.long()
        loss = self.crit(preds, labels)
        self.running_loss.update(loss.item(), len(labels))

        self.log("Loss", loss, prog_bar=True)
        self.log("Running Loss", self.running_loss.show(), prog_bar=True)
        return loss

    def val_dataloader(self):
        return test_dl
    
    def validation_step(self, batch, batch_idx):
        images, labels = batch
        
        # Forward Pass
        preds = self(images)
        labels = labels.long()

        loss = self.crit(preds, labels)
        self.val_output.append({"val_pred": preds, "val_loss": loss, "labels": labels})
        return self.val_output
        
    def on_validation_epoch_end(self):
        avg_loss = torch.stack([x['val_loss'] for x in self.val_output]).mean()
        #print(self.val_output[0]['val_pred'].shape, self.val_output[-1]['val_pred'].shape)
        valid_preds = torch.cat([x['val_pred'] for x in self.val_output]).to(device="cuda:0")
        valid_labels = torch.cat([x['labels'] for x in self.val_output]).to(device="cuda:0")
        #print(valid_preds[:100], valid_labels[:100])
        acc = MulticlassAccuracy(num_classes=2).to(device="cuda:0")
        epoch_acc = acc(valid_preds, valid_labels)
        print(f'Average validation loss: {avg_loss} | Accuracy: {epoch_acc}')
        self.running_loss.reset()
        self.val_output.clear() 
    
    def configure_optimizers(self):
        torch.optim.Adam(model.parameters(), lr=learning_rate)

Here are the results of the training.

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params
----------------------------------------
0 | pool    | MaxPool1d   | 0     
1 | dropout | Dropout     | 0     
2 | conv1   | Conv1d      | 248   
3 | bn1     | BatchNorm1d | 16    
4 | conv2   | Conv1d      | 656   
5 | bn2     | BatchNorm1d | 32    
6 | conv3   | Conv1d      | 2.6 K 
7 | conv4   | Conv1d      | 5.2 K 
8 | fc1     | LazyLinear  | 0     
9 | fc2     | Linear      | 34    
----------------------------------------
8.7 K     Trainable params
0         Non-trainable params
8.7 K     Total params
0.035     Total estimated model params size (MB)
Average validation loss: 0.761396050453186 | Accuracy: 0.0
Epoch 29: 13%
260/2012 [00:01<00:07, 221.88it/s, v_num=20, Loss=0.661, Running Loss=0.662]
Average validation loss: 0.6660452485084534 | Accuracy: 0.5068368315696716
Average validation loss: 0.6649066209793091 | Accuracy: 0.509514570236206
Average validation loss: 0.6649907231330872 | Accuracy: 0.509514570236206
Average validation loss: 0.664742648601532 | Accuracy: 0.5078623294830322
Average validation loss: 0.6637939810752869 | Accuracy: 0.5074635744094849
Average validation loss: 0.663637101650238 | Accuracy: 0.5067798495292664
Average validation loss: 0.666568398475647 | Accuracy: 0.5083751082420349
Average validation loss: 0.666399359703064 | Accuracy: 0.5071786642074585
Average validation loss: 0.6654255986213684 | Accuracy: 0.5068368315696716
Average validation loss: 0.6637231707572937 | Accuracy: 0.507064700126648
Average validation loss: 0.6636268496513367 | Accuracy: 0.5067798495292664
Average validation loss: 0.6640604734420776 | Accuracy: 0.5071216821670532
Average validation loss: 0.6635396480560303 | Accuracy: 0.5067228674888611
Average validation loss: 0.6644148826599121 | Accuracy: 0.5076914429664612
Average validation loss: 0.6649019122123718 | Accuracy: 0.5080902576446533
Average validation loss: 0.6634423136711121 | Accuracy: 0.507064700126648
Average validation loss: 0.6645583510398865 | Accuracy: 0.5082611441612244
Average validation loss: 0.6661717891693115 | Accuracy: 0.5074065923690796
Average validation loss: 0.6646984219551086 | Accuracy: 0.5080902576446533
Average validation loss: 0.6638010144233704 | Accuracy: 0.5068938136100769
Average validation loss: 0.6630386114120483 | Accuracy: 0.5054124593734741
Average validation loss: 0.6649308204650879 | Accuracy: 0.5087169408798218
Average validation loss: 0.6650264263153076 | Accuracy: 0.5094575881958008
Average validation loss: 0.6629320383071899 | Accuracy: 0.5043869614601135
Average validation loss: 0.6648880243301392 | Accuracy: 0.5082042217254639
Average validation loss: 0.6640430688858032 | Accuracy: 0.5088309049606323
Average validation loss: 0.6647407412528992 | Accuracy: 0.5089448690414429
Average validation loss: 0.664152979850769 | Accuracy: 0.5074065923690796
Average validation loss: 0.6635743975639343 | Accuracy: 0.507805347442627
Validation: 0it [00:00, ?it/s]
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=30` reached.
Average validation loss: 0.6635584831237793 | Accuracy: 0.5083181262016296

I’ve also noticed that the outputs on the validation batch are very similar to one another - they barely change from example to example. What is going on? One subsample of 100 validation examples yielded these predictions and labels:

tensor([[ 0.1528, -0.1483],
        [ 0.1528, -0.1483],
        [ 0.1468, -0.1635],
        [ 0.1557, -0.1177],
        [ 0.1644, -0.1507],
        [ 0.1232, -0.1610],
        [ 0.0840, -0.1708],
        [ 0.1403, -0.1519],
        [ 0.1456, -0.1368],
        [ 0.1330, -0.1567],
        [ 0.1312, -0.1529],
        [ 0.1106, -0.1608],
        [ 0.1407, -0.1649],
        [ 0.1198, -0.1669],
        [ 0.1419, -0.1471],
        [ 0.1264, -0.1670],
        [ 0.1389, -0.1641],
        [ 0.1374, -0.1576],
        [ 0.1474, -0.1581],
        [ 0.1261, -0.1638],
        [ 0.1487, -0.1471],
        [ 0.1361, -0.1587],
        [ 0.1561, -0.1223],
        [ 0.1111, -0.1757],
        [ 0.1202, -0.1729],
        [ 0.1493, -0.1544],
        [ 0.1348, -0.1536],
        [ 0.1411, -0.1412],
        [ 0.1500, -0.1519],
        [ 0.1359, -0.1493],
        [ 0.1193, -0.1703],
        [ 0.1477, -0.1721],
        [ 0.1561, -0.1557],
        [ 0.1500, -0.1550],
        [ 0.1461, -0.1699],
        [ 0.1610, -0.1520],
        [ 0.1659, -0.1694],
        [ 0.1538, -0.1448],
        [ 0.1290, -0.1550],
        [ 0.1467, -0.1702],
        [ 0.1527, -0.1691],
        [ 0.1975, -0.1549],
        [ 0.1536, -0.1561],
        [ 0.1558, -0.1346],
        [ 0.1758, -0.1535],
        [ 0.1619, -0.1605],
        [ 0.1434, -0.1648],
        [ 0.1672, -0.1398],
        [ 0.1478, -0.1614],
        [ 0.1553, -0.1489],
        [ 0.1555, -0.1535],
        [ 0.1497, -0.1548],
        [ 0.1321, -0.1606],
        [ 0.1394, -0.1555],
        [ 0.1543, -0.1553],
        [ 0.1443, -0.1591],
        [ 0.1629, -0.1422],
        [ 0.1685, -0.1765],
        [ 0.1626, -0.1530],
        [ 0.1521, -0.1460],
        [ 0.1381, -0.1838],
        [ 0.1833, -0.1476],
        [ 0.1659, -0.1438],
        [ 0.1768, -0.1504],
        [ 0.1433, -0.1500],
        [ 0.1368, -0.1546],
        [ 0.1341, -0.1550],
        [ 0.1370, -0.1501],
        [ 0.1412, -0.1525],
        [ 0.1396, -0.1497],
        [ 0.1501, -0.1559],
        [ 0.1320, -0.1567],
        [ 0.1356, -0.1547],
        [ 0.1376, -0.1570],
        [ 0.1502, -0.1541],
        [ 0.1399, -0.1518],
        [ 0.1392, -0.1510],
        [ 0.1399, -0.1514],
        [ 0.1285, -0.1660],
        [ 0.1407, -0.1516],
        [ 0.1433, -0.1500],
        [ 0.1393, -0.1524],
        [ 0.1320, -0.1574],
        [ 0.1395, -0.1511],
        [ 0.1409, -0.1502],
        [ 0.1376, -0.1554],
        [ 0.1394, -0.1469],
        [ 0.1400, -0.1486],
        [ 0.1414, -0.1482],
        [ 0.1404, -0.1520],
        [ 0.1367, -0.1533],
        [ 0.1381, -0.1454],
        [ 0.1383, -0.1546],
        [ 0.1432, -0.1538],
        [ 0.1404, -0.1520],
        [ 0.1409, -0.1502],
        [ 0.1754, -0.1555],
        [ 0.1730, -0.1542],
        [ 0.1810, -0.1521],
        [ 0.1782, -0.1547]], device='cuda:0') tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0], device='cuda:0')
Average validation loss: 0.6133456230163574 | Accuracy: 0.4999430179595947

What is the value of your learning rate? Maybe it’s too small?

Hi, did you solve the issue? I’m facing a very similar problem, also with a model that uses 1D convolutions:

import torch
import torch.nn as nn

class TempCNN(torch.nn.Module):
    def __init__(self, num_classes=8, kernel_size=5, hidden_dims=64, dropout=0.5):
        super(TempCNN, self).__init__()
        self.hidden_dims = hidden_dims

        self.conv_bn_relu1 = Conv1D_BatchNorm_Relu_Dropout(hidden_dims, kernel_size=kernel_size,
                                                           drop_probability=dropout)
        self.conv_bn_relu2 = Conv1D_BatchNorm_Relu_Dropout(hidden_dims, kernel_size=kernel_size,
                                                           drop_probability=dropout)
        self.conv_bn_relu3 = Conv1D_BatchNorm_Relu_Dropout(hidden_dims, kernel_size=kernel_size,
                                                           drop_probability=dropout)
        self.flatten = Flatten()
        self.dense = FC_BatchNorm_Relu_Dropout(4 * hidden_dims, drop_probability=dropout)
        self.linear = nn.LazyLinear(num_classes)
        #self.logsoftmax = nn.Sequential(nn.Linear(4 * hidden_dims, num_classes), nn.LogSoftmax(dim=-1))

    def forward(self, x):
        # require NxTxD
        #x = x.transpose(1,2)
        x = self.conv_bn_relu1(x)
        x = self.conv_bn_relu2(x)
        x = self.conv_bn_relu3(x)
        x = self.flatten(x)
        x = self.dense(x)
        return self.linear(x)

class Conv1D_BatchNorm_Relu_Dropout(torch.nn.Module):
    def __init__(self, hidden_dims, kernel_size=5, drop_probability=0.5):
        super(Conv1D_BatchNorm_Relu_Dropout, self).__init__()

        self.block = nn.Sequential(
            nn.LazyConv1d(hidden_dims, kernel_size, padding=(kernel_size // 2)),
            nn.BatchNorm1d(hidden_dims),
            nn.ReLU(),
            nn.Dropout(p=drop_probability)
        )

    def forward(self, X):
        return self.block(X)

class FC_BatchNorm_Relu_Dropout(torch.nn.Module):
    def __init__(self, hidden_dims, drop_probability=0.5):
        super(FC_BatchNorm_Relu_Dropout, self).__init__()

        self.block = nn.Sequential(
            nn.LazyLinear(hidden_dims),
            nn.BatchNorm1d(hidden_dims),
            nn.ReLU(),
            nn.Dropout(p=drop_probability)
        )

    def forward(self, X):
        return self.block(X)


class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)

And these are the steps in the task module:

    def forward(self, input_im):
        logits = self.model(input_im)
        return logits

    def step(self, batch):
        X, targets = batch[0], batch[1] - 1  # shift labels to be 0-indexed
        logits = self.forward(X)
        proba = torch.softmax(logits, dim=1)
        loss = self.criterion(logits, targets)
        with torch.no_grad():
            preds = torch.argmax(proba, dim=1) 
        return loss, preds, targets

    def training_step(self, batch, batch_idx):
        loss, preds, targets = self.step(batch)
        
        self.train_loss.update(loss)
        self.train_metrics(preds, targets)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, preds, targets = self.step(batch)
        self.val_loss.update(loss)
        self.val_metrics(preds, targets)
        return loss

My code was working properly before, but now the losses won’t go down.