This is my code, written with PyTorch Lightning and running on a Google Colab GPU. I changed the trainer to precision 16 and it was working fine previously, but it suddenly stopped working, and the following error is raised on the line x1 = self.conv_1x1(x):
RuntimeError: dot : expected both vectors to have same dtype, but found Float and Half
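From what I can tell, the torch.dot call comes from spectral_norm's power iteration rather than from the convolution itself: under mixed precision, torch.mv seems to return a Half tensor while the stored u vector stays Float. If that reading is right, this stripped-down sketch of mine (not part of my project code) should hit the same error, at least on the PyTorch version Colab gives me:

import torch
from torch.nn.utils import spectral_norm

conv = spectral_norm(torch.nn.Conv2d(4, 24, kernel_size=1)).cuda()
x = torch.randn(2, 4, 32, 32, device="cuda")
with torch.cuda.amp.autocast():
    conv(x)  # expected to raise: dot : expected both vectors to have same dtype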
This is my dataset:
import numpy as np
import torch
import torchvision.transforms.functional as TF
from datasets import load_dataset
from numpy.random import default_rng

class TFDataset(torch.utils.data.Dataset):
    def __init__(self, split):
        super().__init__()
        self.reader = load_dataset(
            "openclimatefix/nimrod-uk-1km", "sample", split=split, streaming=True
        )
        self.iter_reader = self.reader

    def __len__(self):
        return 1000

    def __getitem__(self, item):
        try:
            row = next(self.iter_reader)
        except Exception:
            # Stream exhausted (or not yet an iterator): reshuffle and restart it.
            rng = default_rng()
            self.iter_reader = iter(
                self.reader.shuffle(seed=rng.integers(low=0, high=100000), buffer_size=10)
            )
            row = next(self.iter_reader)
        # extract_input_and_target_frames is a helper of mine (not shown) that
        # splits the frame sequence into context and target frames.
        input_frames, target_frames = extract_input_and_target_frames(row["radar_frames"])
        # Swap the channel axis in front of the spatial axes so torchvision can
        # resize, then swap back: (T, H, W, C) -> (T, C, W, H) -> (T, H, W, C).
        d_flat = np.moveaxis(input_frames, [0, 1, 2, 3], [0, 3, 2, 1])
        d_flat = torch.from_numpy(d_flat)
        resized_d_flat = TF.resize(d_flat, (32, 32))
        resized_d_flat = resized_d_flat.detach().cpu().numpy()
        input_frames = np.moveaxis(resized_d_flat, [0, 1, 2, 3], [0, 3, 2, 1])
        # Same resize for the target frames.
        d_flat = np.moveaxis(target_frames, [0, 1, 2, 3], [0, 3, 2, 1])
        d_flat = torch.from_numpy(d_flat)
        resized_d_flat = TF.resize(d_flat, (32, 32))
        resized_d_flat = resized_d_flat.detach().cpu().numpy()
        target_frames = np.moveaxis(resized_d_flat, [0, 1, 2, 3], [0, 3, 2, 1])
        # Return both as (T, C, H, W).
        return np.moveaxis(input_frames, [0, 1, 2, 3], [0, 2, 3, 1]), np.moveaxis(
            target_frames, [0, 1, 2, 3], [0, 2, 3, 1]
        )
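Indexing the dataset by hand looks fine to me, so I don't think the input pipeline itself is the problem. A quick check (nothing here touches the GPU or half precision; shapes depend on extract_input_and_target_frames, so I only print them):

ds = TFDataset(split="train")
x, y = ds[0]
print(x.shape, y.shape, x.dtype)  # both come back as (T, C, H, W) numpy arrays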
This is the main code and the model classes:
import torch
import pytorch_lightning as pl
from typing import Type
from pytorch_lightning import LightningModule
from torch.nn import PixelUnshuffle
from torch.nn.utils import spectral_norm

def get_conv_layer(conv_type: str = "standard") -> Type[torch.nn.Module]:
    if conv_type == "standard":
        conv_layer = torch.nn.Conv2d
    elif conv_type == "3d":
        conv_layer = torch.nn.Conv3d
    else:
        raise ValueError(f"{conv_type} is not a recognized Conv method")
    return conv_layer
class DBlock(torch.nn.Module):
    def __init__(
        self,
        input_channels: int = 12,
        output_channels: int = 12,
        conv_type: str = "standard",
        first_relu: bool = True,
        keep_same_output: bool = False,
    ):
        super().__init__()
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.first_relu = first_relu
        self.keep_same_output = keep_same_output
        self.conv_type = conv_type
        conv2d = get_conv_layer(conv_type)
        if conv_type == "3d":
            # 3D average pooling
            self.pooling = torch.nn.AvgPool3d(kernel_size=2, stride=2)
        else:
            self.pooling = torch.nn.AvgPool2d(kernel_size=2, stride=2)
        self.conv_1x1 = spectral_norm(
            conv2d(
                in_channels=input_channels,
                out_channels=output_channels,
                kernel_size=1,
            )
        )
        self.first_conv_3x3 = spectral_norm(
            conv2d(
                in_channels=input_channels,
                out_channels=output_channels,
                kernel_size=3,
                padding=1,
            )
        )
        self.last_conv_3x3 = spectral_norm(
            conv2d(
                in_channels=output_channels,
                out_channels=output_channels,
                kernel_size=3,
                padding=1,
                stride=1,
            )
        )
        # Downsampling happens at the end of the 3x3 path
        self.relu = torch.nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.input_channels != self.output_channels:
            x1 = self.conv_1x1(x)  # <- the RuntimeError is raised here
            if not self.keep_same_output:
                x1 = self.pooling(x1)
        else:
            x1 = x
        if self.first_relu:
            x = self.relu(x)
        x = self.first_conv_3x3(x)
        x = self.relu(x)
        x = self.last_conv_3x3(x)
        if not self.keep_same_output:
            x = self.pooling(x)
        x = x1 + x  # Sum of the two paths: half spatial extent, new channel count
        return x
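For reference, the block by itself behaves as I expect in plain float32: it halves the spatial size and maps input_channels to output_channels, with the 1x1 convolution providing the skip path. A standalone check outside Lightning and autocast runs without errors on my machine:

block = DBlock(input_channels=4, output_channels=24)
out = block(torch.randn(2, 4, 32, 32))
print(out.shape)  # torch.Size([2, 24, 16, 16])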
class CNNModel(LightningModule):
    def __init__(self):
        super().__init__()
        input_channels = 1
        output_channels = 384
        num_context_steps = 4
        self.d1 = DBlock(
            input_channels=4 * input_channels,
            output_channels=((output_channels // 4) * input_channels) // num_context_steps,
            conv_type="standard",
        )
        # PixelUnshuffle holds no parameters, but it still belongs here rather
        # than being re-created on every training_step call.
        self.space2depth = PixelUnshuffle(downscale_factor=2)

    def forward(self, x):
        # The original forward referenced self.relu and self.conv_1x1, which are
        # not defined on this class; route through the block instead.
        return self.d1(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        x = self.space2depth(x)
        steps = x.size(1)
        print(torch.cuda.is_available())  # debug print
        for i in range(steps):
            # DBlock.forward fails on its conv_1x1 inside this call
            s1 = self.d1(x[:, i, :, :, :])
        # (no loss is returned yet; the error occurs before that would matter)
if __name__ == "__main__":
    data_module = DGMRDataModule()
    model = CNNModel()
    trainer = pl.Trainer(
        max_epochs=1,
        accelerator="auto",
        precision=16,
        num_sanity_val_steps=0,
        min_epochs=1,
    )
    trainer.fit(model, data_module)
I tried various fixes, from casting the input data to float32 to wrapping calls in torch.cuda.amp.autocast, but none of them solved the problem.
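Concretely, these were my two attempts inside training_step (reconstructed from memory, so the exact placement may have differed), and the error message stayed exactly the same:

# Attempt 1: force the batch to float32 before the block
x = x.float()

# Attempt 2: run the block under an explicit autocast context
for i in range(steps):
    with torch.cuda.amp.autocast():
        s1 = self.d1(x[:, i, :, :, :])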