Precision 16 run problem: RuntimeError about Float and Half dtypes

This is my code, written with PyTorch Lightning and running on a Google Colab GPU. It was working fine after I changed it to precision 16, but it suddenly stopped working, and the following error is raised on the line x1 = self.conv_1x1(x):

RuntimeError: dot : expected both vectors to have same dtype, but found Float and Half
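
For context, conv_1x1 itself never calls torch.dot, so I suspect the dot comes from inside the spectral_norm wrapper around the conv (its power iteration uses dot products). The message itself is just a dtype mismatch; a toy example of mine reproduces the same wording (exact text may vary by PyTorch version):

import torch

a = torch.randn(4, dtype=torch.float32)
b = torch.randn(4, dtype=torch.float16)

# RuntimeError: dot : expected both vectors to have same dtype,
# but found Float and Half
torch.dot(a, b)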

This is my dataset:

import numpy as np
import torch
import torchvision.transforms.functional as TF
from datasets import load_dataset
from numpy.random import default_rng


class TFDataset(torch.utils.data.Dataset):
    def __init__(self, split):
        super().__init__()
        self.reader = load_dataset(
            "openclimatefix/nimrod-uk-1km", "sample", split=split, streaming=True
        )
        self.iter_reader = iter(self.reader)

    def __len__(self):
        # The streaming source has no fixed length; use a nominal epoch size
        return 1000

    def __getitem__(self, item):
        try:
            row = next(self.iter_reader)
        except Exception:
            # Stream exhausted: reshuffle and restart the iterator
            rng = default_rng()
            self.iter_reader = iter(
                self.reader.shuffle(seed=rng.integers(low=0, high=100000), buffer_size=10)
            )
            row = next(self.iter_reader)
        input_frames, target_frames = extract_input_and_target_frames(row["radar_frames"])

        # torchvision's resize treats the last two dims as spatial, so move
        # the channel axis forward, resize to 32x32, then move it back.
        d_flat = np.moveaxis(input_frames, [0, 1, 2, 3], [0, 3, 2, 1])
        resized = TF.resize(torch.from_numpy(d_flat), (32, 32)).numpy()
        input_frames = np.moveaxis(resized, [0, 1, 2, 3], [0, 3, 2, 1])

        d_flat = np.moveaxis(target_frames, [0, 1, 2, 3], [0, 3, 2, 1])
        resized = TF.resize(torch.from_numpy(d_flat), (32, 32)).numpy()
        target_frames = np.moveaxis(resized, [0, 1, 2, 3], [0, 3, 2, 1])

        # Return both as (T, C, H, W)
        return np.moveaxis(input_frames, [0, 1, 2, 3], [0, 2, 3, 1]), np.moveaxis(
            target_frames, [0, 1, 2, 3], [0, 2, 3, 1]
        )
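
If it helps, this is how I sanity-check what the dataset yields; B is the DataLoader batch size, and the printed dtype is whatever the numpy arrays upstream carry (I have not pinned it down):

from torch.utils.data import DataLoader

loader = DataLoader(TFDataset(split="train"), batch_size=2)
x, y = next(iter(loader))

# Expect (B, T, C, H, W) with H = W = 32 after the resize above
print(x.shape, x.dtype)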

These are the main classes and the training code:

from typing import Type

import pytorch_lightning as pl
import torch
from pytorch_lightning import LightningModule
from torch.nn import PixelUnshuffle
from torch.nn.utils import spectral_norm


def get_conv_layer(conv_type: str = "standard") -> Type[torch.nn.Module]:
    if conv_type == "standard":
        conv_layer = torch.nn.Conv2d
    elif conv_type == "3d":
        conv_layer = torch.nn.Conv3d
    else:
        raise ValueError(f"{conv_type} is not a recognized Conv method")
    return conv_layer


class DBlock(torch.nn.Module):
    def __init__(
            self,
            input_channels: int = 12,
            output_channels: int = 12,
            conv_type: str = "standard",
            first_relu: bool = True,
            keep_same_output: bool = False,
    ):
        super().__init__()
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.first_relu = first_relu
        self.keep_same_output = keep_same_output
        self.conv_type = conv_type
        conv2d = get_conv_layer(conv_type)
        if conv_type == "3d":
            # 3D Average pooling
            self.pooling = torch.nn.AvgPool3d(kernel_size=2, stride=2)
        else:
            self.pooling = torch.nn.AvgPool2d(kernel_size=2, stride=2)
        self.conv_1x1 = spectral_norm(
            conv2d(
                in_channels=input_channels,
                out_channels=output_channels,
                kernel_size=1,
            )
        )
        self.first_conv_3x3 = spectral_norm(
            conv2d(
                in_channels=input_channels,
                out_channels=output_channels,
                kernel_size=3,
                padding=1,
            )
        )
        self.last_conv_3x3 = spectral_norm(
            conv2d(
                in_channels=output_channels,
                out_channels=output_channels,
                kernel_size=3,
                padding=1,
                stride=1,
            )
        )
        self.relu = torch.nn.ReLU()
        # The 1x1 shortcut and the 3x3 path are summed in forward(); pooling
        # halves the spatial extent unless keep_same_output is set.

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.input_channels != self.output_channels:
            x1 = self.conv_1x1(x)
            if not self.keep_same_output:
                x1 = self.pooling(x1)
        else:
            x1 = x

        if self.first_relu:
            x = self.relu(x)

        x = self.first_conv_3x3(x)
        x = self.relu(x)
        x = self.last_conv_3x3(x)

        if not self.keep_same_output:
            x = self.pooling(x)
        x = x1 + x  # residual sum: halved spatial extent, new channel count
        return x
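
DBlock on its own behaves as expected in float32. This is the quick shape check I ran; the (2, 4, 16, 16) input is my assumption of what training_step feeds it, and 24 output channels matches the CNNModel arithmetic below:

block = DBlock(input_channels=4, output_channels=24)
out = block(torch.randn(2, 4, 16, 16))

# 1x1 shortcut and 3x3 path are each pooled, then summed:
# (2, 4, 16, 16) -> (2, 24, 8, 8)
print(out.shape)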



class CNNModel(LightningModule):
    def __init__(self):
        super().__init__()

        input_channels = 1
        output_channels = 384
        num_context_steps = 4

        self.d1 = DBlock(
            input_channels=4 * input_channels,
            # ((384 // 4) * 1) // 4 = 24 output channels
            output_channels=((output_channels // 4) * input_channels) // num_context_steps,
            conv_type="standard",
        )

    def forward(self, x):
        # Not used by training_step; kept as a thin wrapper around the block
        return self.d1(x)

    def training_step(self, batch, batch_idx):
        x, y = batch

        # (B, T, C, H, W) -> (B, T, 4*C, H/2, W/2)
        self.space2depth = PixelUnshuffle(downscale_factor=2)
        x = self.space2depth(x)

        steps = x.size(1)

        print(torch.cuda.is_available())

        for i in range(steps):
            s1 = self.d1(x[:, i, :, :, :])  # the error is raised inside this call


if __name__ == "__main__":
    data_module = DGMRDataModule()
    model = CNNModel()
    trainer = pl.Trainer(
        max_epochs=1,
        min_epochs=1,
        accelerator="auto",
        precision=16,
        num_sanity_val_steps=0,
    )
    trainer.fit(model, data_module)
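
I did not include DGMRDataModule above; simplified, it is just a thin LightningDataModule around TFDataset (the batch size here is a placeholder):

class DGMRDataModule(pl.LightningDataModule):
    def __init__(self, batch_size: int = 2):
        super().__init__()
        self.batch_size = batch_size

    def train_dataloader(self):
        # Simplified stand-in for my real DataModule
        return torch.utils.data.DataLoader(
            TFDataset(split="train"), batch_size=self.batch_size
        )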

I tried various fixes, from changing the input data type to float32 to wrapping the call in torch.cuda.amp.autocast, but none of them solved the problem.
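
To show why the float32 cast alone did not help: under autocast (which precision=16 enables), convolutions run in half precision even when the input is float32, so a cast at the input does not stick. A toy illustration of mine (plain Conv2d without spectral_norm, needs a GPU):

import torch

conv = torch.nn.Conv2d(4, 24, kernel_size=1).cuda()
x = torch.randn(2, 4, 16, 16, device="cuda")  # float32 input

with torch.cuda.amp.autocast():
    out = conv(x)

print(out.dtype)  # torch.float16: autocast casts conv inputs to half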