Hey folks!
I am running into an issue where my code throws an error that, I suppose, is related to autograd.
I have defined a forward step as follows.
def step(self, batch, mode):
    anc, pos = batch
    # the pretrained encoder + projection block are not being trained, so run them under no_grad
    with torch.no_grad():
        # transformer encodings
        cts_anc_emb = self.pretrained_model.encoder(anc)  # output: B x D
        cts_pos_emb = self.pretrained_model.encoder(pos)
        # projections
        proj_anc_emb = self.pretrained_model.nnblock(cts_anc_emb)  # output: B x d(=bits)
        proj_pos_emb = self.pretrained_model.nnblock(cts_pos_emb)
    # NF embeddings (the normalizing flows are the trainable part, so they sit outside no_grad)
    _, self.uni_z_anc, loss_1 = self.nflows.forward_KL(proj_anc_emb, return_z=True)  # output: B x d(=bits)
    _, self.uni_z_pos, _ = self.nflows.forward_KL(proj_pos_emb, return_z=True)
    # contrastive loss on the flow embeddings; L2 norm used
    self.aa = torch.linalg.norm(self.uni_z_anc - self.uni_z_pos, dim=-1).unsqueeze(1)
    self.bb = torch.linalg.norm(self.uni_z_anc.unsqueeze(1) - self.uni_z_anc, dim=-1) + torch.linalg.norm(self.uni_z_anc.unsqueeze(1) - self.uni_z_pos, dim=-1)
    anc_pos_sim = self.aa
    anc_neg_sim = 0.5 * torch.mean(self.bb, dim=-1, keepdim=True)
    loss_2 = torch.mean(torch.maximum(torch.tensor(0.0), anc_pos_sim + torch.tensor(6) - anc_neg_sim))  # 1.4
    loss = self.lambd[0] * loss_1 + self.lambd[1] * loss_2
    self.log(mode + "_loss", loss, on_epoch=True, on_step=True, prog_bar=True, logger=True)
    return loss
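One thing I am not sure about (it may or may not be related to the error I paste below): the constants torch.tensor(0.0) and torch.tensor(6) in loss_2 are created on the CPU, while all the embeddings live on the GPU. A device-agnostic rewrite of that term, which avoids creating any CPU tensors by using a plain Python float for the margin and torch.clamp instead of torch.maximum, would be something like this sketch:

anc_pos_sim = self.aa
anc_neg_sim = 0.5 * torch.mean(self.bb, dim=-1, keepdim=True)
margin = 6.0  # plain float, so no CPU tensor is created
loss_2 = torch.mean(torch.clamp(anc_pos_sim + margin - anc_neg_sim, min=0.0))

I have not verified yet whether this changes anything on the A100, though.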
And I have configured the optimizer as:
def configure_optimizers(self):
    if self.optimizer.lower() == "adam":
        optimizer = torch.optim.Adam(self.nflows.parameters(), lr=self.lr)
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000, eta_min=1e-4)
    elif self.optimizer.lower() == "adamw":
        optimizer = torch.optim.AdamW(self.nflows.parameters(), lr=self.lr, weight_decay=self.wt_decay)
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000, eta_min=1e-4)
    else:
        raise NotImplementedError
    return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "epoch"}}
Running this code throws an error on an A100 GPU, but the same code runs fine on a different (non-A100) GPU. The error is as follows:
result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 247, in _run_optimization
self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 366, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1342, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/core/module.py", line 1661, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 169, in step
step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 235, in optimizer_step
optimizer, model=model, optimizer_idx=opt_idx, closure=closure, **kwargs
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 121, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
return wrapped(*args, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/optim/optimizer.py", line 88, in wrapper
return func(*args, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/autograd/grad_mode.py", line 28, in decorate_context
return func(*args, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/optim/adam.py", line 66, in step
loss = closure()
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 107, in _wrap_closure
closure_result = closure()
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 147, in __call__
self._result = self.closure(*args, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 142, in closure
self._backward_fn(step_output.closure_loss)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 303, in backward_fn
self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1480, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 207, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, optimizer_idx, *args, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 69, in backward
model.backward(tensor, optimizer, optimizer_idx, *args, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/core/module.py", line 1406, in backward
loss.backward(*args, **kwargs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/autograd/__init__.py", line 149, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
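Since the message mentions cuda:0 and cpu, the check I am planning to run next is a small device audit to see whether any parameter or buffer of self.nflows (or the frozen pretrained model) ended up on the CPU. A rough sketch of what I mean (the helper name check_devices is just something I made up):

def check_devices(module, name):
    # print every parameter/buffer that is not on a CUDA device
    for n, p in module.named_parameters():
        if p.device.type != "cuda":
            print(name + "." + n, "(param) is on", p.device)
    for n, b in module.named_buffers():
        if b.device.type != "cuda":
            print(name + "." + n, "(buffer) is on", b.device)

# e.g., at the start of step():
# check_devices(self.nflows, "nflows")
# check_devices(self.pretrained_model, "pretrained_model")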
Could anyone please let me know how to resolve this issue?