I was training a model with Lightning, but it did not converge: the loss was stuck around 0.7. To isolate the problem I built a toy dataset from two multivariate Gaussians (class 0 and class 1) and repeated the experiment with a basic MLP-like model and binary cross-entropy.
When the model is trained on the GPU the loss does not go down, but when I use the CPU it does. I am using pytorch-lightning 0.9.0.
Steps to reproduce the behavior:
- Based on the following model:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader

mean_1 = [0, 0]
cov_1 = [[1, 0], [0, 100]]
mean_2 = [5, -7]
cov_2 = [[16, 70], [1000, 0.1]]
class ToyDataset(Dataset):
    """Two Gaussian blobs: 50k samples of class 0 and 50k of class 1, shuffled."""
    def __init__(self, param1, param2):
        mean_1, cov_1 = param1
        mean_2, cov_2 = param2
        data_1 = np.random.multivariate_normal(mean_1, cov_1, 50000)
        y_1 = np.zeros(50000)
        data_2 = np.random.multivariate_normal(mean_2, cov_2, 50000)
        y_2 = np.ones(50000)
        data_all_x = np.concatenate((data_1, data_2), axis=0)
        data_all_y = np.concatenate((y_1, y_2), axis=0)
        idx = list(range(100000))
        random.shuffle(idx)
        self.data_all_x = data_all_x[idx]
        self.data_all_y = data_all_y[idx]

    def __getitem__(self, sample_index):
        return self.data_all_x[sample_index], self.data_all_y[sample_index]

    def __len__(self):
        return len(self.data_all_y)
class PocLightning(pl.LightningModule):
    """Small MLP (2 -> 120 -> 84 -> 1) with a sigmoid output, trained with BCE."""
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.sigmoid(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x.float())
        criterion = nn.BCELoss()
        loss = criterion(y_hat, y.float())
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x.float())
        criterion = nn.BCELoss()
        loss = criterion(y_hat, y.float())
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x.float())
        criterion = nn.BCELoss()
        loss = criterion(y_hat, y.float())
        return loss

    def configure_optimizers(self):
        opt_SGD = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)
        return opt_SGD

    def prepare_data(self):
        self.train_data = ToyDataset((mean_1, cov_1), (mean_2, cov_2))
        self.test_data = ToyDataset((mean_1, cov_1), (mean_2, cov_2))
        self.val_data = ToyDataset((mean_1, cov_1), (mean_2, cov_2))

    def train_dataloader(self):
        return DataLoader(self.train_data, batch_size=32)

    def val_dataloader(self):
        return DataLoader(self.val_data, batch_size=32)

    def test_dataloader(self):
        return DataLoader(self.test_data, batch_size=32)
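For debugging, a print can be added to training_step to confirm where the batch and the weights actually live during a run (this is just a sketch of mine, not part of the model above):

def training_step(self, batch, batch_idx):
    x, y = batch
    # Debugging sketch (my addition): confirm where the batch and the weights live
    if batch_idx == 0:
        print("batch on:", x.device, "| weights on:", next(self.parameters()).device)
    y_hat = self(x.float())
    criterion = nn.BCELoss()
    loss = criterion(y_hat, y.float())
    return loss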
- This is the behaviour when the model above is trained on the GPU:
#train
trainer = pl.Trainer(gpus=1, max_epochs=2)
model = PocLightning()
trainer.fit(model)
Result:
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]
:8: RuntimeWarning: covariance is not positive-semidefinite.
data_2 = np.random.multivariate_normal(mean_2, cov_2, 50000)
| Name | Type | Params
0 | fc1 | Linear | 360
1 | fc2 | Linear | 10 K
2 | fc3 | Linear | 85
Epoch 1: 100% 6250/6250 [00:14<00:00, 417.76it/s, loss=0.690, v_num=85, val_loss=0.691]
Saving latest checkpoint…
- But doing the same without the GPU:
trainer = pl.Trainer(gpus=0, max_epochs=2)
model = PocLightning()
trainer.fit(model)
Result:
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
:8: RuntimeWarning: covariance is not positive-semidefinite.
data_2 = np.random.multivariate_normal(mean_2, cov_2, 50000)
| Name | Type | Params
0 | fc1 | Linear | 360
1 | fc2 | Linear | 10 K
2 | fc3 | Linear | 85
Epoch 1: 100% 6250/6250 [00:10<00:00, 580.56it/s, loss=0.111, v_num=86, val_loss=0.115]
Saving latest checkpoint…
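As a further cross-check, the same architecture can be trained on the same toy data with a plain PyTorch loop, moving the model and batches to the GPU by hand, to see whether the problem is specific to Lightning's GPU path. This is only a sketch of mine (it reuses ToyDataset and the means/covariances above), not part of the runs shown:

# Plain-PyTorch cross-check sketch (my addition), same data and architecture as above
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

dataset = ToyDataset((mean_1, cov_1), (mean_2, cov_2))
loader = DataLoader(dataset, batch_size=32, shuffle=True)

model = nn.Sequential(
    nn.Linear(2, 120), nn.ReLU(),
    nn.Linear(120, 84), nn.ReLU(),
    nn.Linear(84, 1), nn.Sigmoid(),
).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in range(2):
    for x, y in loader:
        x = x.float().to(device)
        y = y.float().to(device).unsqueeze(1)  # match the [batch, 1] model output
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
    print(f"epoch {epoch}: last batch loss {loss.item():.3f}")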
My environment is:
- CUDA:
  - GPU:
    - GeForce RTX 2080 Ti
  - available: True
  - version: 10.1
- Packages:
  - numpy: 1.19.1
  - pyTorch_debug: False
  - pyTorch_version: 1.6.0
  - pytorch-lightning: 0.9.0
  - tqdm: 4.47.0
- System:
  - OS: Linux
  - architecture:
    - 64bit
    - ELF
  - processor:
  - python: 3.8.5
  - version: #1 SMP Debian 4.19.98-1 (2020-01-26)