I would like to save the output (target) tensor from my model, which is distributed across four GPUs, while testing (and potentially during training). However, I have only been able to save tensors stored on GPU:0. The relevant code I am trying is as follows:
def test_step(self, test_batch, batch_idx):
x, y = test_batch
logits = self.forward(x)
loss = self.mae_loss(logits, y)
mae = self.mean_absolute_error(logits, y)
self.log('test_loss', loss, on_step=True, on_epoch=True, sync_dist=True)
self.log('test_mae', mae, on_step=True, on_epoch=True, sync_dist=True)
return {'rtest_loss': loss, 'rtest_mae': mae, 'logits': logits, 'y_vals': y}
def test_epoch_end(self, outputs):
avg_loss = torch.stack([x['rval_loss'] for x in outputs]).mean()
avg_mae = torch.stack([x['rval_mae'] for x in outputs]).mean()
for x in outputs:
with torch.cuda.device(0):
torch.save(x['logits'], 'logits_0.pt')
with torch.cuda.device(1):
torch.save(x['logits'], 'logits_1.pt')
with torch.cuda.device(2):
torch.save(x['logits'], 'logits_2.pt')
with torch.cuda.device(3):
torch.save(x['logits'], 'logits_3.pt')
self.log('avg_val_loss', avg_loss, on_epoch=True, sync_dist=True)
self.log('avg_val_mae', avg_mae, on_epoch=True, sync_dist=True)
I was wondering if there is a better way to save the tensors. Is there any other way to get the full output tensor directly? Any help would be highly appreciated.