Hello all,
I was trying to run a PyTorch Lightning Trainer on multiple GPUs in a Kaggle notebook like this:
class StarsDataset(Dataset):
    def __init__(self, split, transform=None):
        self.img = data_split[split]
        random.shuffle(self.img)
        self.img_dir = im_dir
        self.transform = transform

    def __len__(self):
        return len(self.img)

    def __getitem__(self, idx):
        im_id = self.img[idx]
        anno = annotations[im_id]
        bboxes = anno['box_examples_coordinates']

        rects = list()
        for bbox in bboxes:
            x1 = bbox[0][0]
            y1 = bbox[0][1]
            x2 = bbox[2][0]
            y2 = bbox[2][1]
            rects.append([y1, x1, y2, x2])

        dots = np.array(anno['points'])
        image = np.array(Image.open(im_dir + im_id))
        density = np.load(gt_dir + im_id[:-4] + '.npy').astype('float32')
        m_flag = 0

        boxes = list()
        for box in rects:
            y1, x1, y2, x2 = [int(k) for k in box]
            bbox = Image.fromarray(image[y1:y2 + 1, x1:x2 + 1, :])
            bbox = transforms.Resize((64, 64))(bbox)
            boxes.append(transforms.ToTensor()(bbox))
        boxes = torch.stack(boxes)

        if self.transform is not None:
            aug = self.transform(image=image, mask=density)
            image = aug['image']
            density = aug['mask']

        # boxes shape [3, 3, 64, 64], image shape [3, 384, 384], density shape [384, 384]
        norm = A.Normalize()(image=image, mask=density)
        return norm['image'].transpose(2, 0, 1), norm['mask'], boxes, m_flag
batch_size = 8
train_dataset = StarsDataset('train', t_transform)
val_dataset = StarsDataset('val')
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
class CounTrModel(pl.LightningModule):
    def __init__(self, model, optimizer, criterion, metric=None):
        super().__init__()
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.metric = metric

    def forward(self, x, boxes, shot_num):
        return self.model(x, boxes, shot_num)

    def shared_step(self, batch, stage):
        samples, gt_density, boxes, m_flag = batch
        shot_num = random.randint(0, 3)
        output = self.forward(samples, boxes, shot_num)
        loss = self.criterion(output, gt_density)
        mae = self.metric(output, gt_density)
        self.log(f'{stage}_loss', loss, prog_bar=True)
        self.log(f'{stage}_mae', mae, prog_bar=True)
        return {"loss": loss, "mae": mae, "boxes": boxes[0], "samples": samples[0],
                "output": output[0], "gt_density": gt_density[0]}

    def shared_epoch_end(self, outputs, stage):
        avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
        avg_mae = torch.tensor([x["mae"] for x in outputs]).mean()
        output = outputs[0]["output"]
        gt_density = outputs[0]["gt_density"]
        boxes = outputs[0]["boxes"]
        samples = outputs[0]["samples"]
        fig = output[0].unsqueeze(0).repeat(3, 1, 1)
        f1 = gt_density[0].unsqueeze(0).repeat(3, 1, 1)
        self.logger.experiment.add_scalar(f"mae/{stage}", avg_mae, self.current_epoch)
        self.logger.experiment.add_scalar(f"loss/{stage}", avg_loss, self.current_epoch)
        self.logger.experiment.add_images('bboxes', boxes[0], self.current_epoch, dataformats='CHW')
        self.logger.experiment.add_images('gt_density', samples[0] / 2 + f1 / 10, self.current_epoch, dataformats='CHW')
        self.logger.experiment.add_images('density map', fig / 20, self.current_epoch, dataformats='CHW')
        self.logger.experiment.add_images('density map overlay', samples[0] / 2 + fig / 10, self.current_epoch, dataformats='CHW')
        epoch_dictionary = {f'{stage}_loss': avg_loss}
        return epoch_dictionary

    def training_step(self, batch, batch_idx):
        return self.shared_step(batch, "train")

    def training_epoch_end(self, outputs):
        return self.shared_epoch_end(outputs, "train")

    def validation_step(self, batch, batch_idx):
        return self.shared_step(batch, "val")

    def validation_epoch_end(self, outputs):
        return self.shared_epoch_end(outputs, "val")

    def configure_optimizers(self):
        optimizer = self.optimizer
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0)
        return [optimizer], [scheduler]
pl_model = CounTrModel(model, optimizer, criterion, metric)
trainer = pl.Trainer(callbacks=cbs, accelerator='gpu', devices=2, max_epochs=20, logger=logger)
trainer.fit(pl_model, train_dl, val_dl)
But when I run the code it gives me the following error:
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
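From the message I guessed it has something to do with the multiprocessing start method, so would something along these lines be the right direction? This is only a guess on my part, and I don't know whether either of these actually applies to Kaggle notebooks:
# Guess 1: force the 'spawn' start method before anything touches CUDA
import torch.multiprocessing as mp
mp.set_start_method('spawn', force=True)
# Guess 2: pass a spawn-based strategy explicitly instead of the default
trainer = pl.Trainer(callbacks=cbs, accelerator='gpu', devices=2,
                     strategy='ddp_spawn', max_epochs=20, logger=logger)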
What am I doing wrong?