Lightning didn't move my model to GPU

Here is my model code

import torch
import torch.nn.functional as F
from torch import nn

class Patches(nn.Module):
    def __init__(self, patch_size: int):
        self.patch_size = patch_size

    def forward(self, x):
        batch_size = x.shape[0]
        patches = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size)
        patches = patches.reshape(batch_size, 1, patches.shape[-1], patches.shape[1])
        return patches

class MixerBlock(nn.Module):
    def __init__(self, s: int, c: int, ds: int, dc: int, activation=nn.GELU()):
        self.activation_layer = activation
        self.weight1 = torch.nn.Parameter(
            torch.nn.init.kaiming_uniform_(torch.empty(s, ds)), requires_grad=True
        self.weight2 = torch.nn.Parameter(
            torch.nn.init.kaiming_uniform_(torch.empty(ds, s)), requires_grad=True
        self.weight3 = torch.nn.Parameter(
            torch.nn.init.kaiming_uniform_(torch.empty(c, dc)), requires_grad=True
        self.weight4 = torch.nn.Parameter(
            torch.nn.init.kaiming_uniform_(torch.empty(dc, c)), requires_grad=True

    def forward(self, x):
        # token-mixing layer
        x_t = torch.permute(x, dims=(0, 1, 3, 2))
        w1_x = x_t @ self.weight1
        w2_x = w1_x @ self.weight2
        w2_x = self.activation_layer(w2_x)
        w2_x = torch.permute(w2_x, dims=(0, 1, 3, 2))
        # skip-connection
        u = w2_x + x
        # channel-mixing layer
        w3_x = u @ self.weight3
        w4_x = w3_x @ self.weight4
        w4_x = self.activation_layer(w4_x)
        # skip-connection
        y = w4_x + u

        return y

class MlpMixer(nn.Module):
    def __init__(
        patch_size: int,
        s: int,
        c: int,
        ds: int,
        dc: int,
        num_mlp_blocks: int,
        num_classes: int,
        self.c = c
        self.s = s
        self.ds = ds
        self.dc = dc
        self.num_classes = num_classes
        self.layer_norm = nn.LayerNorm([1, s, c])
        self.mixer_blocks = nn.ModuleList(
            [MixerBlock(s, c, ds, dc) for i in range(num_mlp_blocks)]
        self.num_classes = num_classes
        self.classifier = nn.Sequential(nn.Flatten(), nn.Dropout(0.2))
        self.patches_extract = Patches(patch_size)

    def forward(self, x):
        patches = self.patches_extract(x)
        for block in self.mixer_blocks:
            patches = self.layer_norm(patches)
            patches = block(patches)
        output = self.classifier(patches)
        if self.num_classes == 2:
            output = nn.Linear(self.c * self.s, 1)(output)
            output = nn.Sigmoid()(output)
            output = nn.Linear(self.c * self.s, self.num_classes)(output)
            output = nn.Softmax()(output)
        return output

This is my code for creating Lightning Module

class ImageClassifier(pl.LightningModule):
    def __init__(self, model, num_classes: int):
        self.model = model
        self.num_classes = num_classes
        if self.num_classes == 2:
            self.train_acc = Accuracy(task="binary")
            self.train_acc = Accuracy(task="multiclass", num_classes=self.num_classes)

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch):
        x, y = batch
        logits = self.forward(x)
        if self.num_classes == 2:
            loss = F.binary_cross_entropy(logits, y)
            loss = F.cross_entropy(logits, y)

        accuracy = self.train_acc(logits, y)

        self.log("train_loss", loss, on_epoch=True)
        self.log("train_accuracy", accuracy, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)

        if self.num_classes == 2:
            loss = F.binary_cross_entropy(logits, y)
            loss = F.cross_entropy(logits, y)

        acc = self.train_acc(logits, y)

        self.log("val_loss", loss, on_epoch=True)
        self.log("val_acc", acc, on_epoch=True)

        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)

This is my training code

    base_model = ImageClassifier(model, num_classes)
    logger = CSVLogger("logs", name="cat_dog_classfication")
    trainer = pl.Trainer(max_epochs=epochs, logger=logger), train_dataloader, valid_dataloader)

And it happened a runtime error : “RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)”
Can anyone help me to fix this problem


Can you share the full code so I can run it and see what’s happening? The base model to ImageClassifier and the train dataloader and probably other stuff is missing.

absolutely, here is my full source code