Autograd issue

Hey folks!

I am running into an issue where my code throws an error that seems to be related to autograd.

I have defined the forward step as follows:

def step(self, batch, mode):
        anc, pos = batch

        with torch.no_grad():
            # transformer encodings
            cts_anc_emb = self.pretrained_model.encoder(anc) # output: B x D
            cts_pos_emb = self.pretrained_model.encoder(pos) 
            # projections
            proj_anc_emb = self.pretrained_model.nnblock(cts_anc_emb) # output: B x d(=bits)
            proj_pos_emb = self.pretrained_model.nnblock(cts_pos_emb)

        # NF embeddings
        _, self.uni_z_anc, loss_1 = self.nflows.forward_KL(proj_anc_emb, return_z=True) # output: B x d(=bits)
        _, self.uni_z_pos, _ = self.nflows.forward_KL(proj_pos_emb, return_z=True)   
        
        # contrast loss on NN block embeddings. L2 norm used.
        self.aa = torch.linalg.norm(self.uni_z_anc - self.uni_z_pos, dim=-1).unsqueeze(1)
        self.bb = torch.linalg.norm(self.uni_z_anc.unsqueeze(1) - self.uni_z_anc, dim=-1) + torch.linalg.norm(self.uni_z_anc.unsqueeze(1) - self.uni_z_pos, dim=-1)
        anc_pos_sim = self.aa
        anc_neg_sim = 0.5*torch.mean(self.bb, dim=-1, keepdim=True)
        loss_2 = torch.mean(torch.maximum(torch.tensor(0.0), anc_pos_sim  + torch.tensor(6) - anc_neg_sim)) #1.4
       
        loss = self.lambd[0]*loss_1 + self.lambd[1]*loss_2
       
        self.log(mode+"_loss", loss, on_epoch=True, on_step=True, prog_bar=True, logger=True)
        return loss

And I have configured the optimizer as:

def configure_optimizers(self):
        if self.optimizer.lower() == "adam":
            optimizer = torch.optim.Adam(self.nflows.parameters(), lr=self.lr)
            scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000, eta_min=1e-4)
        elif self.optimizer.lower() == "adamw":
            optimizer = torch.optim.AdamW(self.nflows.parameters(), lr=self.lr, weight_decay=self.wt_decay)
            scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000, eta_min=1e-4)
        else:
            raise NotImplementedError
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "epoch"}}

This code throws an error when I run it on an A100 GPU, but it works perfectly fine on other GPUs. The error is given below:

result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 247, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 366, in _optimizer_step
    using_lbfgs=is_lbfgs,
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1342, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/core/module.py", line 1661, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 169, in step
    step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 235, in optimizer_step
    optimizer, model=model, optimizer_idx=opt_idx, closure=closure, **kwargs
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 121, in optimizer_step
    return optimizer.step(closure=closure, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
    return wrapped(*args, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/optim/optimizer.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/autograd/grad_mode.py", line 28, in decorate_context
    return func(*args, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/optim/adam.py", line 66, in step
    loss = closure()
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 107, in _wrap_closure
    closure_result = closure()
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 147, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 142, in closure
    self._backward_fn(step_output.closure_loss)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 303, in backward_fn
    self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1480, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/strategies/strategy.py", line 207, in backward
    self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, optimizer_idx, *args, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 69, in backward
    model.backward(tensor, optimizer, optimizer_idx, *args, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/pytorch_lightning/core/module.py", line 1406, in backward
    loss.backward(*args, **kwargs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/nlsasfs/home/nltm-st/vipular/anaconda3/envs/venvv/lib/python3.7/site-packages/torch/autograd/__init__.py", line 149, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

Could anyone please let me know how to resolve this issue?

Hi @anupsingh15

This is probably because of this line of code:

        loss_2 = torch.mean(torch.maximum(torch.tensor(0.0), anc_pos_sim  + torch.tensor(6) - anc_neg_sim)) #1.4

Change torch.tensor(0.0) to torch.tensor(0.0, device=self.device), and do the same for the other constants like torch.tensor(6).
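
For example, the loss_2 line from your snippet could be written like this (a minimal sketch; the margin value 6 is taken from your code, just written as a float):

        zero = torch.tensor(0.0, device=self.device)     # hinge floor, created on the module's device
        margin = torch.tensor(6.0, device=self.device)   # margin constant, also on the module's device
        loss_2 = torch.mean(torch.maximum(zero, anc_pos_sim + margin - anc_neg_sim))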

Wherever you create a new tensor, e.g. with torch.tensor(...) or torch.zeros(...), pass in the device of the LightningModule via device=self.device. Then all your tensors will live on the same device and can be combined freely inside torch functions.
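
As a quick illustration of the pattern inside any LightningModule hook (the names and sizes here are just placeholders):

        # self.device reflects the device Lightning moved the module to,
        # so tensors created this way end up next to the model parameters
        zeros = torch.zeros(16, device=self.device)
        ones = torch.ones_like(zeros)                 # *_like helpers inherit the device automatically
        const = torch.tensor(6.0, device=self.device)
        # a tensor created without device=... defaults to the CPU and triggers the
        # "found at least two devices" error as soon as it meets a CUDA tensor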

Hope this helps