Hello guys!
Can anyone tell me how I can split my model across different GPUs? Say my model has one encoder and one decoder, and I want to put them on different GPUs because they are too big to fit together on one GPU.
I have noticed that there are two ways to do so:
1. Tips for fast training, but the code there does not seem to work. Shouldn’t it be
self.encoder=self.encoder.cuda(1)
I am so confused.
2. Sharded training: the newest PyTorch Lightning says we can do sharded training, but it turns out that it consumes more memory.
If anyone can help me, I will be very grateful.
Here is some example code:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl
import os
from pytorch_lightning import Trainer
# Restrict which GPUs CUDA exposes to this process to the physical devices
# numbered 1 and 2. Set here, right after the imports and before any model or
# trainer is created.
os.environ['CUDA_VISIBLE_DEVICES']='1,2'
class LitAutoEncoder(pl.LightningModule):
    """Fully connected autoencoder: 784 -> 64 -> 3 -> 64 -> 784.

    The encoder compresses a flattened 28x28 input to a 3-dimensional code and
    the decoder maps that code back to 784 values. Training minimises the mean
    squared error between the reconstruction and the flattened input.
    """

    def __init__(self):
        super().__init__()
        n_pixels = 28 * 28  # length of one flattened 28x28 input
        n_hidden = 64
        n_latent = 3

        # Encoder is created before the decoder, and layers are created in
        # order, so parameter initialisation draws happen in a fixed sequence.
        self.encoder = nn.Sequential(
            nn.Linear(n_pixels, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_latent),
        )
        self.decoder = nn.Sequential(
            nn.Linear(n_latent, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_pixels),
        )

    def forward(self, x):
        """Return the encoder output for ``x``; the decoder is not applied."""
        return self.encoder(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the reconstruction loss for one batch.

        Does not go through ``forward``: it calls the encoder and the decoder
        directly.

        Args:
            batch: a pair whose first element is the input tensor; the second
                element is unpacked but not used.
            batch_idx: index of the batch (unused here).

        Returns:
            The MSE between the decoder output and the flattened input.
        """
        images, _ = batch
        # Collapse everything after the batch dimension into one axis.
        flat = images.view(images.size(0), -1)

        # NOTE: manual device placement the author experimented with, left
        # disabled:
        #   self.encoder.cuda(0)
        #   self.decoder.cuda(1)
        #   x = x.cuda(0)
        latent = self.encoder(flat)
        reconstruction = self.decoder(latent)
        loss = F.mse_loss(reconstruction, flat)

        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)
# Entry-point guard: the DataLoader below starts worker processes
# (num_workers=4). On platforms that start workers with "spawn", each worker
# re-imports this module; without the guard that re-import would re-run the
# download and the training itself.
if __name__ == "__main__":
    # MNIST is downloaded into the current working directory if not present;
    # ToTensor converts each image to a tensor.
    dataset = MNIST(os.getcwd(), download=True, transform=transforms.ToTensor())
    train_loader = DataLoader(dataset, num_workers=4, batch_size=128)

    autoencoder = LitAutoEncoder()

    # NOTE(review): 'ddp' is a data-parallel strategy. It is expected to place
    # a full copy of the model on each GPU rather than split encoder and
    # decoder across devices; confirm against the Lightning docs for the
    # installed version.
    gpus = 2
    accelerator = 'ddp'
    plugins = None

    trainer = Trainer(
        max_epochs=1,
        gpus=gpus,
        accelerator=accelerator,
        plugins=plugins,
    )
    trainer.fit(autoencoder, train_loader)