Validation and test do not work

Hello,

My validation seems to behave strangely, with output such as:

Epoch 0:   5%|▍         | 932/18645 [03:54<1:14:26,  3.97it/s, loss=3.530, v_num=2jq6, val_loss=2.84, rouge1=13.6, rougeL=13.5]
Epoch 0:   7%|▋         | 1221/18645 [05:07<1:13:05,  3.97it/s, loss=2.911, v_num=2jq6, val_loss=2.84, rouge1=13.6, rougeL=13.5]
Validating: 0it [00:00, ?it/s]
Epoch 0:   7%|▋         | 1222/18645 [05:07<1:13:08,  3.97it/s, loss=2.911, v_num=2jq6, val_loss=2.84, rouge1=13.6, rougeL=13.5]
Epoch 0:   7%|▋         | 1223/18645 [05:08<1:13:08,  3.97it/s, loss=2.911, v_num=2jq6, val_loss=2.84, rouge1=13.6, rougeL=13.5]

and testing does not output any loss:
Testing: 100%|██████████| 6428/6428 [25:12<00:00, 5.23it/s]
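
For reference, a minimal standalone check along these lines (model is the LightningModule instance; _generative_step is shown further down) is how I look at whether a loss even comes back for a single test batch:

import torch

# Minimal check: run one test batch through _generative_step and inspect the
# returned loss, outside the Trainer loop.
model.eval()
with torch.no_grad():
    batch = next(iter(model.test_dataloader()))
    metrics = model._generative_step(batch)
print(metrics["val_loss"])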

I checked the DataLoaders for both, and the datasets are loaded as expected (roughly via the check below).
The code is written following this git repo.
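
Hook names here are the standard LightningModule ones, and the batch keys match the code further down:

# Rough sanity check on the dataloaders: pull one batch from each split
# and print the tensor shapes that the steps below expect.
for name, dl in [("val", model.val_dataloader()), ("test", model.test_dataloader())]:
    batch = next(iter(dl))
    print(name, {k: tuple(v.shape) for k, v in batch.items() if hasattr(v, "shape")})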
Here is the code used for the validation and test steps:

def validation_step(self, batch, batch_idx):
    return self._generative_step(batch)

def test_step(self, batch, batch_idx):
    metrics = self._generative_step(batch)
    metrics = {'test_loss': metrics['val_loss']}
    return metrics

def validation_epoch_end(self, outputs):

    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}

    rouge_results = self.rouge_metric.compute()
    rouge_dict = self.parse_score(rouge_results)

    tensorboard_logs.update(rouge1=rouge_dict['rouge1'], rougeL=rouge_dict['rougeL'])

    ## Clear out the lists for next epoch
    self.target_gen = []
    self.prediction_gen = []
    return {"avg_val_loss": avg_loss,
            "rouge1": rouge_results['rouge1'],
            "rougeL": rouge_results['rougeL'],
            "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

def test_epoch_end(self, outputs):
    metrics = self.validation_epoch_end(outputs)
    metrics = {"avg_test_loss": metrics['avg_val_loss"'],
            "test_rouge1": metrics['rouge1'],
            "test_rougeL": metrics['rougeL'],
            "log": metrics["log"], 'progress_bar': metrics['progress_bar']}
    return metrics

def _generative_step(self, batch):

    t0 = time.time()

    generated_ids = self.model.generate(
        batch["source_ids"],
        attention_mask=batch["source_mask"],
        use_cache=True,
        decoder_attention_mask=batch['target_mask'],
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )
    preds = self.ids_to_clean_text(generated_ids)
    target = self.ids_to_clean_text(batch["target_ids"])

    gen_time = (time.time() - t0) / batch["source_ids"].shape[0]

    loss = self._step(batch)
    base_metrics = {'val_loss': loss}
    #         rouge: Dict = self.calc_generative_metrics(preds, target)
    summ_len = np.mean(self.lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target)
    self.rouge_metric.add_batch(preds, target)
    # rouge_results = self.rouge_metric.compute()
    # rouge_dict = self.parse_score(rouge_results)
    # base_metrics.update(rouge1=rouge_dict['rouge1'], rougeL=rouge_dict['rougeL'])

    return base_metrics

def _step(self, batch):
    labels = batch["target_ids"]
    # Replace padding token ids with -100 so they are ignored by the loss
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

def ids_to_clean_text(self, generated_ids):
    gen_text = self.tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return self.lmap(str.strip, gen_text)
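
For completeness, self.rouge_metric is assumed to be the ROUGE metric from the Hugging Face datasets library; in isolation, the add_batch / compute pairing used above looks roughly like this:

from datasets import load_metric

# Assumption: rouge_metric is the Hugging Face datasets ROUGE metric.
rouge_metric = load_metric("rouge")
rouge_metric.add_batch(predictions=["a generated summary"],
                       references=["a reference summary"])
results = rouge_metric.compute()
# Each entry is an AggregateScore with low/mid/high Score tuples.
print(results["rouge1"].mid.fmeasure)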

Any indication of what is going wrong?
Thanks!

Hello, my apologies for the late reply. We are slowly moving to deprecate this forum in favor of the built-in GitHub version… Could we kindly ask you to recreate your question there: Lightning Discussions