I am trying to run the distributed GAN training script below with PyTorch Lightning and I am getting the following error log:
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Traceback (most recent call last):
File "/scratch/nhd7682/HPML/distributedlightning.py", line 210, in <module>
trainer.fit(model)
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 768, in fit
self._call_and_handle_interrupt(
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 719, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/spawn.py", line 78, in launch
mp.spawn(
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 189, in start_processes
process.start()
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/multiprocessing/context.py", line 284, in _Popen
return Popen(process_obj)
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__
super().__init__(process_obj)
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch
reduction.dump(process_obj, fp)
File "/scratch/nhd7682/envs_dirs/pcgrl/lib/python3.9/multiprocessing/reduction.py", line 60, in dump
ForkingPickler(file, protocol).dump(obj)
TypeError: cannot pickle '_thread.lock' object
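From the traceback, the crash happens while mp.spawn pickles everything it ships to the worker processes, so something reachable from the trainer or the model must be holding a _thread.lock. A minimal sketch I can run to narrow down which attribute is the offender (the helper name is my own):

import pickle

def find_unpicklable(obj, name="obj"):
    # try to pickle each attribute of `obj` and report the ones that fail
    for attr, value in vars(obj).items():
        try:
            pickle.dumps(value)
        except Exception as err:
            print(f"{name}.{attr} cannot be pickled: {err}")

find_unpicklable(model, "model")
find_unpicklable(trainer, "trainer")

Here is the full script: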
import argparse
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.utils import save_image

import pytorch_lightning as pl
from test_tube import Experiment
os.makedirs("images", exist_ok=True)
parser = argparse.ArgumentParser()
parser.add_argument("--n_epochs", type=int, default=200, help="number of epochs of training")
parser.add_argument("--batch_size", type=int, default=64, help="size of the batches")
parser.add_argument("--lr", type=float, default=0.0002, help="adam: learning rate")
parser.add_argument("--b1", type=float, default=0.5, help="adam: decay of first order momentum of gradient")
parser.add_argument("--b2", type=float, default=0.999, help="adam: decay of first order momentum of gradient")
parser.add_argument("--n_cpu", type=int, default=8, help="number of cpu threads to use during batch generation")
parser.add_argument("--latent_dim", type=int, default=100, help="dimensionality of the latent space")
parser.add_argument("--img_size", type=int, default=28, help="size of each image dimension")
parser.add_argument("--channels", type=int, default=1, help="number of image channels")
parser.add_argument("--sample_interval", type=int, default=400, help="interval betwen image samples")
args = parser.parse_args()
class Generator(nn.Module):
def __init__(self, latent_dim, img_shape):
super(Generator, self).__init__()
self.img_shape = img_shape
def block(in_feat, out_feat, normalize=True):
layers = [nn.Linear(in_feat, out_feat)]
if normalize:
layers.append(nn.BatchNorm1d(out_feat, 0.8))
layers.append(nn.LeakyReLU(0.2, inplace=True))
return layers
self.model = nn.Sequential(
*block(latent_dim, 128, normalize=False),
*block(128, 256),
*block(256, 512),
*block(512, 1024),
nn.Linear(1024, int(np.prod(img_shape))),
nn.Tanh()
)
def forward(self, z):
img = self.model(z)
img = img.view(img.size(0), *self.img_shape)
return img
class Discriminator(nn.Module):
def __init__(self, img_shape):
super(Discriminator, self).__init__()
self.model = nn.Sequential(
nn.Linear(int(np.prod(img_shape)), 512),
nn.LeakyReLU(0.2, inplace=True),
nn.Linear(512, 256),
nn.LeakyReLU(0.2, inplace=True),
nn.Linear(256, 1),
nn.Sigmoid(),
)
def forward(self, img):
img_flat = img.view(img.size(0), -1)
validity = self.model(img_flat)
return validity
class GAN(pl.LightningModule):
    def __init__(self, latent_dim, lr, b1, b2, batch_size):
        super().__init__()
        self.save_hyperparameters()
# networks
        # 3-channel 64x64 images, matching the transforms in train_dataloader below
        img_shape = (3, 64, 64)
        self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=img_shape)
        self.discriminator = Discriminator(img_shape=img_shape)
# cache for generated images
self.generated_imgs = None
def forward(self, z):
return self.generator(z)
def adversarial_loss(self, y_hat, y):
return F.binary_cross_entropy(y_hat, y)
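    # Note: F.binary_cross_entropy expects probabilities in [0, 1], which the
    # discriminator's final Sigmoid provides; with raw logits one would use
    # F.binary_cross_entropy_with_logits instead.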
    def training_step(self, batch, batch_idx, optimizer_idx):
imgs, _ = batch
# train generator
        if optimizer_idx == 0:
# sample noise
            z = torch.randn(imgs.shape[0], self.hparams.latent_dim)
            # match the device and dtype of the input images (works on CPU or GPU)
            z = z.type_as(imgs)
# generate images
self.generated_imgs = self.forward(z)
# log sampled images
sample_imgs = self.generated_imgs[:6]
grid = torchvision.utils.make_grid(sample_imgs)
            self.logger.experiment.add_image('generated_images', grid, 0)
# ground truth result (ie: all fake)
            valid = torch.ones(imgs.size(0), 1).type_as(imgs)
# adversarial loss is binary cross-entropy
g_loss = self.adversarial_loss(self.discriminator(self.generated_imgs), valid)
return g_loss
# train discriminator
        if optimizer_idx == 1:
# Measure discriminator's ability to classify real from generated samples
# how well can it label as real?
            valid = torch.ones(imgs.size(0), 1).type_as(imgs)
real_loss = self.adversarial_loss(self.discriminator(imgs), valid)
# how well can it label as fake?
            fake = torch.zeros(imgs.size(0), 1).type_as(imgs)
fake_loss = self.adversarial_loss(self.discriminator(self.generated_imgs.detach()), fake)
# discriminator loss is the average of these
d_loss = (real_loss + fake_loss) / 2
return d_loss
def configure_optimizers(self):
lr = self.hparams.lr
b1 = self.hparams.b1
b2 = self.hparams.b2
opt_g = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2))
opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2))
return [opt_g, opt_d], []
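    # With two optimizers returned (and no schedulers), Lightning calls
    # training_step once per optimizer for every batch, passing optimizer_idx
    # 0 for opt_g and 1 for opt_d, matching the two branches above.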
    def train_dataloader(self):
        # Lightning's hook is `train_dataloader`; `tng_dataloader` is never called.
        # The Normalize stats below are a placeholder assumption, and `path` is
        # assumed to be defined elsewhere to point at the training images.
        transf = transforms.Compose([
            transforms.Resize(64),
            transforms.CenterCrop(64),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True),
        ])
        dataset = datasets.ImageFolder(root=path, transform=transf)
        return DataLoader(dataset, batch_size=self.hparams.batch_size)
# save tensorboard logs
exp = Experiment(save_dir=os.getcwd())
# init model
model = GAN(args.latent_dim,args.lr,args.b1,args.b2,args.batch_size)
# train on 2 GPUs
trainer = pl.Trainer(logger=exp, devices=2, accelerator="gpu", max_epochs=5)
trainer.fit(model)
I am at a loss as to why this might be happening.
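For reference, here is the variant I would try next, assuming the unpicklable object is the test_tube Experiment: swap in Lightning's TensorBoardLogger and request the plain "ddp" strategy, which launches worker processes via subprocess instead of pickling everything through mp.spawn. Whether this actually clears the error is my assumption:

from pytorch_lightning.loggers import TensorBoardLogger

# TensorBoardLogger should be safe to pickle, unlike a raw test_tube
# Experiment, and strategy="ddp" avoids the spawn-based launcher entirely.
logger = TensorBoardLogger(save_dir=os.getcwd(), name="gan")
trainer = pl.Trainer(logger=logger, devices=2, accelerator="gpu",
                     strategy="ddp", max_epochs=5)
trainer.fit(model)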