Hi,
I have a model for a multilabel classification task using PyTorch Lightning and AutoModel with distilroberta-base. The model runs perfectly fine on one node with one GPU. While adapting it to run on two nodes with one GPU each, I get stuck on the following RuntimeError:
RuntimeError: It looks like your LightningModule has parameters that were not used in producing the loss returned by training_step. If this is intentional, you must enable the detection of unused parameters in DDP, either by setting the string value strategy='ddp_find_unused_parameters_true' or by setting the flag in the strategy with strategy=DDPStrategy(find_unused_parameters=True).
When I switch the Trainer strategy from 'ddp' to 'ddp_find_unused_parameters_true' or to DDPStrategy(find_unused_parameters=True, static_graph=True), the training runs, but it is incredibly slow (duration per epoch on a single node: 12 min; duration per epoch on two nodes with ddp_find_unused_parameters_true: 9 h).
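For context, the Trainer is set up roughly like this (a simplified sketch: the max_epochs value and the model/dm variables are placeholders, and the cluster launch details are omitted):

import pytorch_lightning as pl
from pytorch_lightning.strategies import DDPStrategy

# two nodes with one GPU each; this is the variant that runs, but very slowly
# (passing strategy='ddp_find_unused_parameters_true' behaves the same)
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    num_nodes=2,
    strategy=DDPStrategy(find_unused_parameters=True, static_graph=True),
    max_epochs=10,  # placeholder
)
trainer.fit(model, datamodule=dm)  # model and datamodule as defined elsewhere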
My LightningModule class looks like the following:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.optim import AdamW
from transformers import AutoModel, get_cosine_schedule_with_warmup


class Data_Classifier(pl.LightningModule):
    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config['model_name'], return_dict=True)
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config['n_labels'])
        torch.nn.init.xavier_uniform_(self.hidden.weight)
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction='mean')
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, labels=None):
        # roberta model
        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        # mean-pool the token embeddings of the last hidden state
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # neural network classification layers
        pooled_output = self.hidden(pooled_output)
        pooled_output = self.dropout(pooled_output)
        pooled_output = F.relu(pooled_output)
        logits = self.classifier(pooled_output)
        # calculate loss
        loss = 0
        if labels is not None:
            loss = self.loss_func(logits.view(-1, self.config['n_labels']), labels.view(-1, self.config['n_labels']))
        return loss, logits

    def training_step(self, batch):
        loss, logits = self(**batch)
        self.log("train loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": logits, "labels": batch['labels']}

    def validation_step(self, batch):
        loss, logits = self(**batch)
        # sync_dist=True syncs logging across all GPU workers (may have a performance impact)
        self.log("validation loss", loss, prog_bar=True, logger=True, sync_dist=True)
        return {"val_loss": loss, "predictions": logits, "labels": batch['labels']}

    def predict_step(self, batch):
        _, logits = self(**batch)
        return logits

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['w_decay'])
        # note: AdamW does not read this attribute; gradient clipping is configured
        # via Trainer(gradient_clip_val=...) instead
        optimizer.max_grad_norm = self.config['max_grad_norm']
        # the transformers schedule expects an integer number of steps
        total_steps = math.ceil(self.config['train_size'] / self.config['bs'])
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        # the schedule is defined per step, so tell Lightning to step it every batch
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
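To narrow down which parameters DDP considers unused, I also tried checking gradients after a single manual backward pass outside the Trainer (a quick sanity-check sketch; config and batch are assumed to be my config dict and one batch from my dataloader):

# which trainable parameters receive no gradient from one training batch?
model = Data_Classifier(config)
loss, _ = model(**batch)
loss.backward()
unused = [name for name, p in model.named_parameters()
          if p.requires_grad and p.grad is None]
print(unused)  # any names printed here are candidates for what DDP flags as unused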
Has anyone encountered a similar problem and fixed it, or does anyone see a problem with my LightningModule class?
Thanks in advance.