Why are my training and validation losses barely changing?

I’m trying to implement a 1D convolutional neural network in PyTorch Lightning for sequences of length 80 with 6 channels, so the input size is [# examples, 6, 80]. I have no idea what is causing my loss to stay flat: both the training and validation errors only fluctuate by about 1e-5, like noise, with no consistent direction. I have tried decreasing my learning rate by factors of 10, from 0.01 all the way down to 1e-6, and normalizing the inputs per channel (using the global training-set channel means and standard deviations), but it is still not working.
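
For reference, the channel normalization looks roughly like this (a minimal sketch; train_x and val_x are placeholder names for the full training and validation tensors of shape [# examples, 6, 80]):

# per-channel statistics computed over the whole training set
channel_mean = train_x.mean(dim=(0, 2), keepdim=True)  # shape [1, 6, 1]
channel_std = train_x.std(dim=(0, 2), keepdim=True)    # shape [1, 6, 1]

train_x = (train_x - channel_mean) / channel_std
# the same training-set statistics are reused for the validation data
val_x = (val_x - channel_mean) / channel_std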

Here is my code.

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torchmetrics.classification import MulticlassAccuracy

# AverageMeter is a small running-average helper (definition not shown here)
class LitConvNet(pl.LightningModule):
    def __init__(self): # define all the layers
        super().__init__()
        self.pool = nn.MaxPool1d(2)
        self.dropout = nn.Dropout(0.25)

        self.conv1 = nn.Conv1d(6,8,5, padding = "same") # 6 channels: accelerometer (x, y, z) plus the 3 gyroscope axes
        self.bn1 = nn.BatchNorm1d(8)
        
        self.conv2 = nn.Conv1d(8,16,5, padding = "same")
        self.bn2 = nn.BatchNorm1d(16)

        self.conv3 = nn.Conv1d(16,32,5, padding = "same")
        self.conv4 = nn.Conv1d(32,32,5, padding = "same")
        
        self.fc1 = nn.LazyLinear(16)
        self.fc2 = nn.Linear(16,2)

        self.crit = F.cross_entropy
        self.val_output = []
        self.running_loss = AverageMeter()

    def forward(self, x):
        
        # input size: [batch, 6, 80] (relu is the activation function)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool(x)
        x = F.relu(self.bn2(self.conv2(x))) 
        x = self.pool(x)           
        x = F.relu(self.conv3(x))
        x = self.pool(x)           
        x = F.relu(self.conv4(x))       

        #flatten for input to linear layer
        
        x = torch.flatten(x, 1)
        
        x = F.relu(self.fc1(x))    
        x = self.fc2(x)
        return x

    def train_dataloader(self):
        return train_dl
    
    def training_step(self, batch, batch_idx):
        input, labels = batch
        preds = self(input)
        labels = labels.long()
        loss = self.crit(preds, labels)
        self.running_loss.update(loss.item(), len(labels))

        self.log("Loss", loss, prog_bar=True)
        self.log("Running Loss", self.running_loss.show(), prog_bar=True)
        return loss

    def val_dataloader(self):
        return test_dl
    
    def validation_step(self, batch, batch_idx):
        images, labels = batch
        
        # Forward Pass
        preds = self(images)
        labels = labels.long()

        loss = self.crit(preds, labels)
        self.val_output.append({"val_pred": preds, "val_loss": loss, "labels": labels})
        return self.val_output
        
    def on_validation_epoch_end(self):
        avg_loss = torch.stack([x['val_loss'] for x in self.val_output]).mean()
        #print(self.val_output[0]['val_pred'].shape, self.val_output[-1]['val_pred'].shape)
        valid_preds = torch.cat([x['val_pred'] for x in self.val_output]).to(device="cuda:0")
        valid_labels = torch.cat([x['labels'] for x in self.val_output]).to(device="cuda:0")
        #print(valid_preds[:100], valid_labels[:100])
        acc = MulticlassAccuracy(num_classes=2).to(device="cuda:0")
        epoch_acc = acc(valid_preds, valid_labels)
        print(f'Average validation loss: {avg_loss} | Accuracy: {epoch_acc}')
        self.running_loss.reset()
        self.val_output.clear() 
    
    def configure_optimizers(self):
        torch.optim.Adam(model.parameters(), lr=learning_rate)

Here are the results of the training.

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type        | Params
----------------------------------------
0 | pool    | MaxPool1d   | 0     
1 | dropout | Dropout     | 0     
2 | conv1   | Conv1d      | 248   
3 | bn1     | BatchNorm1d | 16    
4 | conv2   | Conv1d      | 656   
5 | bn2     | BatchNorm1d | 32    
6 | conv3   | Conv1d      | 2.6 K 
7 | conv4   | Conv1d      | 5.2 K 
8 | fc1     | LazyLinear  | 0     
9 | fc2     | Linear      | 34    
----------------------------------------
8.7 K     Trainable params
0         Non-trainable params
8.7 K     Total params
0.035     Total estimated model params size (MB)
Average validation loss: 0.761396050453186 | Accuracy: 0.0
Epoch 29: 13%
260/2012 [00:01<00:07, 221.88it/s, v_num=20, Loss=0.661, Running Loss=0.662]
Average validation loss: 0.6660452485084534 | Accuracy: 0.5068368315696716
Average validation loss: 0.6649066209793091 | Accuracy: 0.509514570236206
Average validation loss: 0.6649907231330872 | Accuracy: 0.509514570236206
Average validation loss: 0.664742648601532 | Accuracy: 0.5078623294830322
Average validation loss: 0.6637939810752869 | Accuracy: 0.5074635744094849
Average validation loss: 0.663637101650238 | Accuracy: 0.5067798495292664
Average validation loss: 0.666568398475647 | Accuracy: 0.5083751082420349
Average validation loss: 0.666399359703064 | Accuracy: 0.5071786642074585
Average validation loss: 0.6654255986213684 | Accuracy: 0.5068368315696716
Average validation loss: 0.6637231707572937 | Accuracy: 0.507064700126648
Average validation loss: 0.6636268496513367 | Accuracy: 0.5067798495292664
Average validation loss: 0.6640604734420776 | Accuracy: 0.5071216821670532
Average validation loss: 0.6635396480560303 | Accuracy: 0.5067228674888611
Average validation loss: 0.6644148826599121 | Accuracy: 0.5076914429664612
Average validation loss: 0.6649019122123718 | Accuracy: 0.5080902576446533
Average validation loss: 0.6634423136711121 | Accuracy: 0.507064700126648
Average validation loss: 0.6645583510398865 | Accuracy: 0.5082611441612244
Average validation loss: 0.6661717891693115 | Accuracy: 0.5074065923690796
Average validation loss: 0.6646984219551086 | Accuracy: 0.5080902576446533
Average validation loss: 0.6638010144233704 | Accuracy: 0.5068938136100769
Average validation loss: 0.6630386114120483 | Accuracy: 0.5054124593734741
Average validation loss: 0.6649308204650879 | Accuracy: 0.5087169408798218
Average validation loss: 0.6650264263153076 | Accuracy: 0.5094575881958008
Average validation loss: 0.6629320383071899 | Accuracy: 0.5043869614601135
Average validation loss: 0.6648880243301392 | Accuracy: 0.5082042217254639
Average validation loss: 0.6640430688858032 | Accuracy: 0.5088309049606323
Average validation loss: 0.6647407412528992 | Accuracy: 0.5089448690414429
Average validation loss: 0.664152979850769 | Accuracy: 0.5074065923690796
Average validation loss: 0.6635743975639343 | Accuracy: 0.507805347442627
Validation: 0it [00:00, ?it/s]
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=30` reached.
Average validation loss: 0.6635584831237793 | Accuracy: 0.5083181262016296

I’ve also noticed that the outputs on the validation batch are very similar to one another - they barely change from example to example. What is going on? One subsample of 100 validation examples yielded these predictions and labels:

tensor([[ 0.1528, -0.1483],
        [ 0.1528, -0.1483],
        [ 0.1468, -0.1635],
        [ 0.1557, -0.1177],
        [ 0.1644, -0.1507],
        [ 0.1232, -0.1610],
        [ 0.0840, -0.1708],
        [ 0.1403, -0.1519],
        [ 0.1456, -0.1368],
        [ 0.1330, -0.1567],
        [ 0.1312, -0.1529],
        [ 0.1106, -0.1608],
        [ 0.1407, -0.1649],
        [ 0.1198, -0.1669],
        [ 0.1419, -0.1471],
        [ 0.1264, -0.1670],
        [ 0.1389, -0.1641],
        [ 0.1374, -0.1576],
        [ 0.1474, -0.1581],
        [ 0.1261, -0.1638],
        [ 0.1487, -0.1471],
        [ 0.1361, -0.1587],
        [ 0.1561, -0.1223],
        [ 0.1111, -0.1757],
        [ 0.1202, -0.1729],
        [ 0.1493, -0.1544],
        [ 0.1348, -0.1536],
        [ 0.1411, -0.1412],
        [ 0.1500, -0.1519],
        [ 0.1359, -0.1493],
        [ 0.1193, -0.1703],
        [ 0.1477, -0.1721],
        [ 0.1561, -0.1557],
        [ 0.1500, -0.1550],
        [ 0.1461, -0.1699],
        [ 0.1610, -0.1520],
        [ 0.1659, -0.1694],
        [ 0.1538, -0.1448],
        [ 0.1290, -0.1550],
        [ 0.1467, -0.1702],
        [ 0.1527, -0.1691],
        [ 0.1975, -0.1549],
        [ 0.1536, -0.1561],
        [ 0.1558, -0.1346],
        [ 0.1758, -0.1535],
        [ 0.1619, -0.1605],
        [ 0.1434, -0.1648],
        [ 0.1672, -0.1398],
        [ 0.1478, -0.1614],
        [ 0.1553, -0.1489],
        [ 0.1555, -0.1535],
        [ 0.1497, -0.1548],
        [ 0.1321, -0.1606],
        [ 0.1394, -0.1555],
        [ 0.1543, -0.1553],
        [ 0.1443, -0.1591],
        [ 0.1629, -0.1422],
        [ 0.1685, -0.1765],
        [ 0.1626, -0.1530],
        [ 0.1521, -0.1460],
        [ 0.1381, -0.1838],
        [ 0.1833, -0.1476],
        [ 0.1659, -0.1438],
        [ 0.1768, -0.1504],
        [ 0.1433, -0.1500],
        [ 0.1368, -0.1546],
        [ 0.1341, -0.1550],
        [ 0.1370, -0.1501],
        [ 0.1412, -0.1525],
        [ 0.1396, -0.1497],
        [ 0.1501, -0.1559],
        [ 0.1320, -0.1567],
        [ 0.1356, -0.1547],
        [ 0.1376, -0.1570],
        [ 0.1502, -0.1541],
        [ 0.1399, -0.1518],
        [ 0.1392, -0.1510],
        [ 0.1399, -0.1514],
        [ 0.1285, -0.1660],
        [ 0.1407, -0.1516],
        [ 0.1433, -0.1500],
        [ 0.1393, -0.1524],
        [ 0.1320, -0.1574],
        [ 0.1395, -0.1511],
        [ 0.1409, -0.1502],
        [ 0.1376, -0.1554],
        [ 0.1394, -0.1469],
        [ 0.1400, -0.1486],
        [ 0.1414, -0.1482],
        [ 0.1404, -0.1520],
        [ 0.1367, -0.1533],
        [ 0.1381, -0.1454],
        [ 0.1383, -0.1546],
        [ 0.1432, -0.1538],
        [ 0.1404, -0.1520],
        [ 0.1409, -0.1502],
        [ 0.1754, -0.1555],
        [ 0.1730, -0.1542],
        [ 0.1810, -0.1521],
        [ 0.1782, -0.1547]], device='cuda:0') tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0], device='cuda:0')
Average validation loss: 0.6133456230163574 | Accuracy: 0.4999430179595947

What is the value of your learning rate? Maybe it’s too small?

Hi, did you solve the issue? I’m facing a very similar problem, also with a model that uses 1D convolutions:

import torch
import torch.nn as nn

class TempCNN(torch.nn.Module):
    def __init__(self, num_classes=8, kernel_size=5, hidden_dims=64, dropout=0.5):
        super(TempCNN, self).__init__()
        self.hidden_dims = hidden_dims

        self.conv_bn_relu1 = Conv1D_BatchNorm_Relu_Dropout(hidden_dims, kernel_size=kernel_size,
                                                           drop_probability=dropout)
        self.conv_bn_relu2 = Conv1D_BatchNorm_Relu_Dropout(hidden_dims, kernel_size=kernel_size,
                                                           drop_probability=dropout)
        self.conv_bn_relu3 = Conv1D_BatchNorm_Relu_Dropout(hidden_dims, kernel_size=kernel_size,
                                                           drop_probability=dropout)
        self.flatten = Flatten()
        self.dense = FC_BatchNorm_Relu_Dropout(4 * hidden_dims, drop_probability=dropout)
        self.linear = nn.LazyLinear(num_classes)
        #self.logsoftmax = nn.Sequential(nn.Linear(4 * hidden_dims, num_classes), nn.LogSoftmax(dim=-1))

    def forward(self, x):
        # require NxTxD
        #x = x.transpose(1,2)
        x = self.conv_bn_relu1(x)
        x = self.conv_bn_relu2(x)
        x = self.conv_bn_relu3(x)
        x = self.flatten(x)
        x = self.dense(x)
        return self.linear(x)

class Conv1D_BatchNorm_Relu_Dropout(torch.nn.Module):
    def __init__(self, hidden_dims, kernel_size=5, drop_probability=0.5):
        super(Conv1D_BatchNorm_Relu_Dropout, self).__init__()

        self.block = nn.Sequential(
            nn.LazyConv1d(hidden_dims, kernel_size, padding=(kernel_size // 2)),
            nn.BatchNorm1d(hidden_dims),
            nn.ReLU(),
            nn.Dropout(p=drop_probability)
        )

    def forward(self, X):
        return self.block(X)

class FC_BatchNorm_Relu_Dropout(torch.nn.Module):
    def __init__(self, hidden_dims, drop_probability=0.5):
        super(FC_BatchNorm_Relu_Dropout, self).__init__()

        self.block = nn.Sequential(
            nn.LazyLinear(hidden_dims),
            nn.BatchNorm1d(hidden_dims),
            nn.ReLU(),
            nn.Dropout(p=drop_probability)
        )

    def forward(self, X):
        return self.block(X)


class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)

And these are the steps in the task module:

    def forward(self, input_im):
        logits = self.model(input_im)
        return logits

    def step(self, batch):
        X, targets = batch[0], batch[1] - 1  # shift labels to be 0-indexed
        logits = self.forward(X)
        proba = torch.softmax(logits, dim=1)
        loss = self.criterion(logits, targets)
        with torch.no_grad():
            preds = torch.argmax(proba, dim=1) 
        return loss, preds, targets

    def training_step(self, batch, batch_idx):
        loss, preds, targets = self.step(batch)
        
        self.train_loss.update(loss)
        self.train_metrics(preds, targets)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, preds, targets = self.step(batch)
        self.val_loss.update(loss)
        self.val_metrics(preds, targets)
        return loss

My code was working properly before, but now the losses won’t go down.