I’m trying to implement a 1D neural network, with sequence length 80, 6 channels in PyTorch Lightning. The input size is [# examples, 6, 80]. I have no idea of what happened that lead to my loss not going down, with validation error and training error both changing by a factor of 1e-5 like noise and not in any particular direction. I have tried decreasing my learning rate by a factor of 10 from 0.01 all the way down to 1e-6, normalizing inputs over the channel (calculating global training-set channel mean and standard deviation), but still it is not working.
Here is my code.
class LitConvNet(pl.LightningModule):
def __init__(self): # shows all the layers
super().__init__()
self.pool = nn.MaxPool1d(2)
self.dropout = nn.Dropout(0.25)
self.conv1 = nn.Conv1d(6,8,5, padding = "same") # 6 channels as (x, y, z) in accelerometer and 3 dims in gyroscope
self.bn1 = nn.BatchNorm1d(8)
self.conv2 = nn.Conv1d(8,16,5, padding = "same")
self.bn2 = nn.BatchNorm1d(16)
self.conv3 = nn.Conv1d(16,32,5, padding = "same")
self.conv4 = nn.Conv1d(32,32,5, padding = "same")
self.fc1 = nn.LazyLinear(16)
self.fc2 = nn.Linear(16,2)
self.crit = F.cross_entropy
self.val_output = []
self.running_loss = AverageMeter()
def forward(self, x):
# initial size: 16, 2, 129, 88 (relu are activation functions)
x = F.relu(self.bn1(self.conv1(x)))
x = self.pool(x)
x = F.relu(self.bn2(self.conv2(x)))
x = self.pool(x)
x = F.relu(self.conv3(x))
x = self.pool(x)
x = F.relu(self.conv4(x))
#flatten for input to linear layer
x = torch.flatten(x, 1)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
def train_dataloader(self):
return train_dl
def training_step(self, batch, batch_idx):
input, labels = batch
preds = self(input)
labels = labels.long()
loss = self.crit(preds, labels)
self.running_loss.update(loss.item(), len(labels))
self.log("Loss", loss, prog_bar=True)
self.log("Running Loss", self.running_loss.show(), prog_bar=True)
return loss
def val_dataloader(self):
return test_dl
def validation_step(self, batch, batch_idx):
images, labels = batch
# Forward Pass
preds = self(images)
labels = labels.long()
loss = self.crit(preds, labels)
self.val_output.append({"val_pred": preds, "val_loss": loss, "labels": labels})
return self.val_output
def on_validation_epoch_end(self):
avg_loss = torch.stack([x['val_loss'] for x in self.val_output]).mean()
#print(self.val_output[0]['val_pred'].shape, self.val_output[-1]['val_pred'].shape)
valid_preds = torch.cat([x['val_pred'] for x in self.val_output]).to(device="cuda:0")
valid_labels = torch.cat([x['labels'] for x in self.val_output]).to(device="cuda:0")
#print(valid_preds[:100], valid_labels[:100])
acc = MulticlassAccuracy(num_classes=2).to(device="cuda:0")
epoch_acc = acc(valid_preds, valid_labels)
print(f'Average validation loss: {avg_loss} | Accuracy: {epoch_acc}')
self.running_loss.reset()
self.val_output.clear()
def configure_optimizers(self):
torch.optim.Adam(model.parameters(), lr=learning_rate)
Here are the results of the training.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
| Name | Type | Params
----------------------------------------
0 | pool | MaxPool1d | 0
1 | dropout | Dropout | 0
2 | conv1 | Conv1d | 248
3 | bn1 | BatchNorm1d | 16
4 | conv2 | Conv1d | 656
5 | bn2 | BatchNorm1d | 32
6 | conv3 | Conv1d | 2.6 K
7 | conv4 | Conv1d | 5.2 K
8 | fc1 | LazyLinear | 0
9 | fc2 | Linear | 34
----------------------------------------
8.7 K Trainable params
0 Non-trainable params
8.7 K Total params
0.035 Total estimated model params size (MB)
Average validation loss: 0.761396050453186 | Accuracy: 0.0
Epoch 29: 13%
260/2012 [00:01<00:07, 221.88it/s, v_num=20, Loss=0.661, Running Loss=0.662]
Average validation loss: 0.6660452485084534 | Accuracy: 0.5068368315696716
Average validation loss: 0.6649066209793091 | Accuracy: 0.509514570236206
Average validation loss: 0.6649907231330872 | Accuracy: 0.509514570236206
Average validation loss: 0.664742648601532 | Accuracy: 0.5078623294830322
Average validation loss: 0.6637939810752869 | Accuracy: 0.5074635744094849
Average validation loss: 0.663637101650238 | Accuracy: 0.5067798495292664
Average validation loss: 0.666568398475647 | Accuracy: 0.5083751082420349
Average validation loss: 0.666399359703064 | Accuracy: 0.5071786642074585
Average validation loss: 0.6654255986213684 | Accuracy: 0.5068368315696716
Average validation loss: 0.6637231707572937 | Accuracy: 0.507064700126648
Average validation loss: 0.6636268496513367 | Accuracy: 0.5067798495292664
Average validation loss: 0.6640604734420776 | Accuracy: 0.5071216821670532
Average validation loss: 0.6635396480560303 | Accuracy: 0.5067228674888611
Average validation loss: 0.6644148826599121 | Accuracy: 0.5076914429664612
Average validation loss: 0.6649019122123718 | Accuracy: 0.5080902576446533
Average validation loss: 0.6634423136711121 | Accuracy: 0.507064700126648
Average validation loss: 0.6645583510398865 | Accuracy: 0.5082611441612244
Average validation loss: 0.6661717891693115 | Accuracy: 0.5074065923690796
Average validation loss: 0.6646984219551086 | Accuracy: 0.5080902576446533
Average validation loss: 0.6638010144233704 | Accuracy: 0.5068938136100769
Average validation loss: 0.6630386114120483 | Accuracy: 0.5054124593734741
Average validation loss: 0.6649308204650879 | Accuracy: 0.5087169408798218
Average validation loss: 0.6650264263153076 | Accuracy: 0.5094575881958008
Average validation loss: 0.6629320383071899 | Accuracy: 0.5043869614601135
Average validation loss: 0.6648880243301392 | Accuracy: 0.5082042217254639
Average validation loss: 0.6640430688858032 | Accuracy: 0.5088309049606323
Average validation loss: 0.6647407412528992 | Accuracy: 0.5089448690414429
Average validation loss: 0.664152979850769 | Accuracy: 0.5074065923690796
Average validation loss: 0.6635743975639343 | Accuracy: 0.507805347442627
Validation: 0it [00:00, ?it/s]
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=30` reached.
Average validation loss: 0.6635584831237793 | Accuracy: 0.5083181262016296
I’ve also noticed that the output on the validation batch is very similar to each other - the outputs are not changing. What happened? One subsample of 100 validation examples yielded this:
tensor([[ 0.1528, -0.1483],
[ 0.1528, -0.1483],
[ 0.1468, -0.1635],
[ 0.1557, -0.1177],
[ 0.1644, -0.1507],
[ 0.1232, -0.1610],
[ 0.0840, -0.1708],
[ 0.1403, -0.1519],
[ 0.1456, -0.1368],
[ 0.1330, -0.1567],
[ 0.1312, -0.1529],
[ 0.1106, -0.1608],
[ 0.1407, -0.1649],
[ 0.1198, -0.1669],
[ 0.1419, -0.1471],
[ 0.1264, -0.1670],
[ 0.1389, -0.1641],
[ 0.1374, -0.1576],
[ 0.1474, -0.1581],
[ 0.1261, -0.1638],
[ 0.1487, -0.1471],
[ 0.1361, -0.1587],
[ 0.1561, -0.1223],
[ 0.1111, -0.1757],
[ 0.1202, -0.1729],
[ 0.1493, -0.1544],
[ 0.1348, -0.1536],
[ 0.1411, -0.1412],
[ 0.1500, -0.1519],
[ 0.1359, -0.1493],
[ 0.1193, -0.1703],
[ 0.1477, -0.1721],
[ 0.1561, -0.1557],
[ 0.1500, -0.1550],
[ 0.1461, -0.1699],
[ 0.1610, -0.1520],
[ 0.1659, -0.1694],
[ 0.1538, -0.1448],
[ 0.1290, -0.1550],
[ 0.1467, -0.1702],
[ 0.1527, -0.1691],
[ 0.1975, -0.1549],
[ 0.1536, -0.1561],
[ 0.1558, -0.1346],
[ 0.1758, -0.1535],
[ 0.1619, -0.1605],
[ 0.1434, -0.1648],
[ 0.1672, -0.1398],
[ 0.1478, -0.1614],
[ 0.1553, -0.1489],
[ 0.1555, -0.1535],
[ 0.1497, -0.1548],
[ 0.1321, -0.1606],
[ 0.1394, -0.1555],
[ 0.1543, -0.1553],
[ 0.1443, -0.1591],
[ 0.1629, -0.1422],
[ 0.1685, -0.1765],
[ 0.1626, -0.1530],
[ 0.1521, -0.1460],
[ 0.1381, -0.1838],
[ 0.1833, -0.1476],
[ 0.1659, -0.1438],
[ 0.1768, -0.1504],
[ 0.1433, -0.1500],
[ 0.1368, -0.1546],
[ 0.1341, -0.1550],
[ 0.1370, -0.1501],
[ 0.1412, -0.1525],
[ 0.1396, -0.1497],
[ 0.1501, -0.1559],
[ 0.1320, -0.1567],
[ 0.1356, -0.1547],
[ 0.1376, -0.1570],
[ 0.1502, -0.1541],
[ 0.1399, -0.1518],
[ 0.1392, -0.1510],
[ 0.1399, -0.1514],
[ 0.1285, -0.1660],
[ 0.1407, -0.1516],
[ 0.1433, -0.1500],
[ 0.1393, -0.1524],
[ 0.1320, -0.1574],
[ 0.1395, -0.1511],
[ 0.1409, -0.1502],
[ 0.1376, -0.1554],
[ 0.1394, -0.1469],
[ 0.1400, -0.1486],
[ 0.1414, -0.1482],
[ 0.1404, -0.1520],
[ 0.1367, -0.1533],
[ 0.1381, -0.1454],
[ 0.1383, -0.1546],
[ 0.1432, -0.1538],
[ 0.1404, -0.1520],
[ 0.1409, -0.1502],
[ 0.1754, -0.1555],
[ 0.1730, -0.1542],
[ 0.1810, -0.1521],
[ 0.1782, -0.1547]], device='cuda:0') tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0], device='cuda:0')
Average validation loss: 0.6133456230163574 | Accuracy: 0.4999430179595947