Hi,
I've trained a model and want to train it for more epochs. I didn't save any checkpoints, but my understanding is that PyTorch Lightning keeps the model state and can continue training where it left off. I haven't closed the kernel yet.
I trained for 3 epochs, then set max_epochs to 5, but it still trains for 5 more epochs. Can someone point out if my reasoning is correct? Ty. Here's my code:
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification

# `checkpoint` holds the Hugging Face model name (defined earlier in the notebook)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

class SentimentClassifier(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

    def predict_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        self.model.eval()
        with torch.no_grad():
            outputs = self(input_ids, attention_mask, labels=None)
        preds = outputs.logits.argmax(-1)
        return preds.tolist()

    # ... rest of the LightningModule omitted ...
# here I define the trainer
trainer = pl.Trainer(
    max_epochs=3,
    logger=pl.loggers.TensorBoardLogger('logs/', name='sentiment_classifier')
)

sentiment_classifier = SentimentClassifier(model)
trainer.fit(sentiment_classifier,
            DataLoader(train_dataset, batch_size=16),
            DataLoader(val_dataset, batch_size=16))
# once fitted, I train for more without saving a checkpoint
# it's going to train for 5 more epochs
trainer2 = pl.Trainer(max_epochs=5)
trainer2.fit(sentiment_classifier,
             DataLoader(train_dataset, batch_size=16),
             DataLoader(val_dataset, batch_size=16))
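In case it matters, this is how I assume resuming from a saved checkpoint would look. The ckpt_path below is hypothetical since I never actually saved one:

# Sketch of checkpoint-based resuming, assuming I had kept a checkpoint.
# The path is hypothetical; as I understand it, fit() restores the full
# training state (epoch counter, optimizer, etc.) from the checkpoint file.
trainer3 = pl.Trainer(max_epochs=5)
trainer3.fit(sentiment_classifier,
             DataLoader(train_dataset, batch_size=16),
             DataLoader(val_dataset, batch_size=16),
             ckpt_path='logs/sentiment_classifier/version_0/checkpoints/last.ckpt')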