torch._C._TensorBase 'to' very slow after a few batches


I want to train a plain 1D conv-net (one layer). The training runs through, but it is extremely slow. Within each epoch, the first ~75% of the batches are very fast, but then it slows down drastically. Validation also takes extremely long (longer than the training part of the epoch itself).
I did some profiling to find the root cause, and it seems to be related to transferring the data to the GPU.
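From what I have read, CUDA calls are asynchronous, so the time may only show up in the 'to' call because that is where the CPU blocks on the GPU. Here is a minimal sketch of how I timed one step to check this (timed_step is just a throwaway helper I wrote; torch.cuda.synchronize() is needed for the timings to be meaningful):

    import time
    import torch
    import torch.nn.functional as F

    def timed_step(model, In, Out, device):
        torch.cuda.synchronize()              # flush any pending GPU work first
        t0 = time.perf_counter()
        In, Out = In.to(device), Out.to(device)
        torch.cuda.synchronize()              # wait until the copy has actually finished
        t_copy = time.perf_counter() - t0

        t1 = time.perf_counter()
        loss = F.mse_loss(model(In), Out)
        torch.cuda.synchronize()              # wait for the forward pass as well
        t_fwd = time.perf_counter() - t1
        return t_copy, t_fwd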

I use CUDA 11.8 (NVIDIA P520) and see the same behaviour on Windows 11 and on Windows 11 + WSL2 (Ubuntu). It makes no difference whether I run the script from PowerShell or from within Spyder 5.
Do you know what the issue could be? I am a beginner, so it might be something very simple.

The profiler points to torch._C._TensorBase 'to' as the hotspot (full profiler output omitted). My Dataset class is:

class Dataset(tdata.Dataset):
    def __init__(self, In, Out, transform=None):
        self.In = In
        self.Out = Out
        self.transform = transform  # currently unused

    def __len__(self):
        return len(self.Out)

    def __getitem__(self, idx):
        In = self.In[idx, :, :]
        Out = self.Out[idx, :, :]
        return torch.from_numpy(In), torch.from_numpy(Out)
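For reference, a quick sanity check of what __getitem__ returns (the array shapes are the ones from my loader setup below; this is only a sketch):

    import numpy as np
    import torch

    In = np.zeros((10, 3, 19939), dtype=np.float32)
    Out = np.zeros((10, 3, 19939), dtype=np.float32)
    ds = Dataset(In, Out)
    x, y = ds[0]
    print(x.shape, x.dtype)  # torch.Size([3, 19939]) torch.float32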

I create dataloaders as:

    fullset = Dataset(In, Out) # dimensions of numpy arrays [10, 3, 19939]
    nTrain = int(np.floor(len(fullset) * 0.9))
    nVal = len(fullset) - nTrain
    train_set, val_set = tdata.random_split(fullset, [nTrain, nVal])
    test_set = Dataset(InTest, OutTest)

    # Data Loaders
    train_loader = tdata.DataLoader(train_set, batch_size=16, shuffle=True, drop_last=True, pin_memory=True, num_workers=0)
    val_loader = tdata.DataLoader(val_set, batch_size=16, shuffle=False, drop_last=False, pin_memory=False, num_workers=0)
    test_loader = tdata.DataLoader(test_set, batch_size=16, shuffle=False, drop_last=False, num_workers=0)
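For context, the host-to-GPU copy that Lightning does for every batch should be roughly equivalent to this manual loop. As far as I understand, non_blocking=True only helps when the source tensor sits in pinned memory, which is why I set pin_memory=True on the train loader:

    device = torch.device("cuda")
    for In, Out in train_loader:
        # with pin_memory=True the copy can run asynchronously and overlap with compute
        In = In.to(device, non_blocking=True)
        Out = Out.to(device, non_blocking=True)
        # ... forward/backward pass goes here ...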

My LightningModule is:

class MyModule(pl.LightningModule):

    def __init__(self, model, model_hparams, optimizer_name, optimizer_hparams):
        super().__init__()
        # Export the hyperparameters to self.hparams (configure_optimizers reads them)
        self.save_hyperparameters()
        self.model = model(**model_hparams)  # `model` is the model class (here: Net)
        self.loss_module = nn.MSELoss()
        # Example input for visualizing the graph in Tensorboard
        self.example_input_array = torch.zeros((10, 3, 19939), dtype=torch.float32)

    def forward(self, imgs):
        return self.model(imgs)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), **self.hparams.optimizer_hparams)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1)
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        In, Out = batch
        newOut = self.model(In)
        loss = self.loss_module(newOut, Out)
        absDiffLoss = (Out - newOut).abs().mean() # mean absolute error, substitute for accuracy

        self.log('train_AbsDiffLoss', absDiffLoss, on_step=False, on_epoch=True)
        self.log('train_MSEloss', loss)
        return loss  # Return tensor to call ".backward" on

    def validation_step(self, batch, batch_idx):
        In, Out = batch
        newOut = self.model(In)
        absDiffLoss = (Out - newOut).abs().mean()

        self.log('val_AbsDiffLoss', absDiffLoss)

    def test_step(self, batch, batch_idx):
        In, Out = batch
        newOut = self.model(In)
        absDiffLoss = (Out - newOut).abs().mean()

        self.log('test_AbsDiffLoss', absDiffLoss)

My model is a simple 1D convolution:

from types import SimpleNamespace

class Net(nn.Module):
    def __init__(self, dt, tKernelSize, act_fn):
        super().__init__()
        self.hparams = SimpleNamespace(act_fn=act_fn, dt=dt, tKernelSize=tKernelSize)
        self._create_network()
        self._init_params()

    def _create_network(self):
        kernel_size = round_to_odd(int(self.hparams.tKernelSize // self.hparams.dt))
        self.conv_1D = nn.Conv1d(in_channels=3, out_channels=16,
                                 kernel_size=kernel_size, stride=1, padding='same')

    def _init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                if isinstance(self.hparams.act_fn, nn.Identity):
                    print('Nothing to initialize')
                else:
                    # kaiming init expects a string nonlinearity, e.g. 'relu'
                    nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x_out = self.conv_1D(x)
        return x_out
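As a quick shape check I run the model on the example input. round_to_odd is a small helper in my script; a stand-in version is included here so the snippet is self-contained (the dt and tKernelSize values are only examples):

    import torch
    import torch.nn as nn

    def round_to_odd(n):  # stand-in helper: bump even kernel sizes to the next odd integer
        return n if n % 2 == 1 else n + 1

    net = Net(dt=0.1, tKernelSize=1.0, act_fn=nn.Identity())  # example values only
    x = torch.zeros((10, 3, 19939))
    print(net(x).shape)  # torch.Size([10, 16, 19939]); padding='same' keeps the length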

My training function is:

def train_model(model, train_loader, val_loader, test_loader, save_name, **kwargs):
    global device
    # Create a PyTorch Lightning trainer with the generation callback
    trainer = pl.Trainer(default_root_dir=os.path.join(CHECKPOINT_PATH, save_name),                          # Where to save models
                         accelerator="gpu" if str(device).startswith("cuda") else "cpu",                     # We run on a GPU (if possible)
                         devices=1,                                                                          # How many GPUs/CPUs we want to use (1 is enough for the notebooks)
                         max_epochs=2,                                                                     # How many epochs to train for if no patience is set
                         callbacks=[ModelCheckpoint(save_weights_only=True, mode="min", monitor="val_AbsDiffLoss"),  # Save the best checkpoint based on the minimum val_AbsDiffLoss recorded (lower is better). Saves only weights and not optimizer
                                    LearningRateMonitor("epoch")],                                           # Log learning rate every epoch
                         enable_progress_bar=True)                                                           # Set to False if you do not want a progress bar
    trainer.logger._log_graph = True         # If True, we plot the computation graph in tensorboard
    trainer.logger._default_hp_metric = None # Optional logging argument that we don't need
    # Check whether pretrained model exists. If yes, load it and skip training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, save_name + ".ckpt")
    if os.path.isfile(pretrained_filename):
        print(f"Found pretrained model at {pretrained_filename}, loading...")
        model = MyModule.load_from_checkpoint(pretrained_filename) # Automatically loads the model with the saved hyperparameters
    else:
        pl.seed_everything(42) # To be reproducible
        model = MyModule(model, **kwargs)
        trainer.fit(model, train_loader, val_loader)
        model = MyModule.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) # Load best checkpoint after training
    # Test best model on validation and test set
    val_result = trainer.test(model, val_loader, verbose=True)
    test_result = trainer.test(model, test_loader, verbose=False)
    result = {"test": test_result[0]["test_AbsDiffLoss"], "val": val_result[0]["test_AbsDiffLoss"]}
    return model, result
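(device and CHECKPOINT_PATH are defined earlier in my script; for completeness, roughly like this:)

    import os
    import torch

    CHECKPOINT_PATH = "./checkpoints"  # example path; mine points somewhere else
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    os.makedirs(CHECKPOINT_PATH, exist_ok=True)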

Finally, I call the training function:

trained_Net, results = train_model(Net, train_loader, val_loader, test_loader, 'Net',
                                   model_hparams={"act_fn": act_fn, "dt": dt, "tKernelSize": tKernelSize},  # dt and tKernelSize are defined earlier in my script
                                   optimizer_name="AdamW",
                                   optimizer_hparams={"lr": 1e-5, "weight_decay": 1e-4})