ValueError: too many values to unpack (expected 3)

For study purposes, I am trying to create a simple fine-tuning example using T5 and Lightning:

import pandas as pd

df = pd.DataFrame({
    "text": ["O Brasil é um país localizado na América do Sul.",
              "A capital do Brasil é Brasília.",
              "A população do Brasil é de mais de 210 milhões de pessoas."],
    "question": ["Qual é o país localizado na América do Sul?",
                  "Qual é a capital do Brasil?",
                  "Qual é a população do Brasil?"]
})

from transformers import T5ForConditionalGeneration
import torch

model = T5ForConditionalGeneration.from_pretrained("t5-base")

from pytorch_lightning import LightningModule, Trainer

class T5FineTuner(LightningModule):

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-base")

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):

        input_ids, attention_mask, target_ids = batch
        output = self.model(input_ids, attention_mask)
        output = output.logits
        loss = self.loss(output, target_ids)

        self.log("loss", loss, on_step=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)
        return optimizer

trainer = Trainer(max_epochs=5, gpus=1)
trainer.fit(model=T5FineTuner(), train_dataloaders=df)

But I got this error:

ValueError: too many values to unpack (expected 3)

How can I fix that?

You can do something like this by using Hugging Face datasets:

import pandas as pd
from torch.utils.data import DataLoader
from datasets import Dataset as hfds

df = pd.DataFrame({
    "text": ["O Brasil é um país localizado na América do Sul.",
              "A capital do Brasil é Brasília.",
              "A população do Brasil é de mais de 210 milhões de pessoas."],
    "question": ["Qual é o país localizado na América do Sul?",
                  "Qual é a capital do Brasil?",
                  "Qual é a população do Brasil?"]})
df["target_ids"] = 0


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

def tokenize(qa_pair):
    # more preprocessing if you want
    return tokenizer(qa_pair["text"], qa_pair["question"], padding="max_length")

data = hfds.from_pandas(df)
data = data.map(tokenize, batched=True)
data = data.remove_columns(["text", "question"]).with_format("torch")
dataloader = DataLoader(data, batch_size=2)
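# with_format("torch") means each batch from this dataloader is a dict of tensors keyed by column name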

and then:


from transformers import T5ForConditionalGeneration
import torch

model = T5ForConditionalGeneration.from_pretrained("t5-base")

from lightning import LightningModule, Trainer

class T5FineTuner(LightningModule):

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-base")

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        # batch is now:
        print(batch)
        
        input_ids, attention_mask, target_ids = batch.get("input_ids"), \
                                                batch.get("attention_mask"), \
                                                batch.get("target_ids")
        output = self.model(input_ids, attention_mask)
        output = output.logits
        loss = self.loss(output, target_ids)

        self.log("loss", loss, on_step=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)
        return optimizer

trainer = Trainer(max_epochs=5, accelerator="gpu", devices=1)

As I do not have time to check the required input format of the T5 model, you can probably find it on the Hugging Face forum.
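
For reference, here is a minimal sketch of the training inputs T5ForConditionalGeneration usually expects (just an illustration, not verified against your setup): passing labels lets the model build the decoder inputs internally and return a loss.

from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# encode one context/question pair
enc = tokenizer("O Brasil é um país localizado na América do Sul.", return_tensors="pt")
tgt = tokenizer("Qual é o país localizado na América do Sul?", return_tensors="pt")

# with labels supplied, the model derives decoder_input_ids itself and returns a loss
out = model(input_ids=enc.input_ids,
            attention_mask=enc.attention_mask,
            labels=tgt.input_ids)
print(out.loss)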

@Vela-zz I tried this code:

import pandas as pd, torch
from torch.utils.data import DataLoader
from datasets import Dataset as hfds
from transformers import T5ForConditionalGeneration
import lightning.pytorch as pl

df = pd.DataFrame({
    "text": ["O Brasil é um país localizado na América do Sul.",
              "A capital do Brasil é Brasília.",
              "A população do Brasil é de mais de 210 milhões de pessoas."],
    "question": ["Qual é o país localizado na América do Sul?",
                  "Qual é a capital do Brasil?",
                  "Qual é a população do Brasil?"]})
df["target_ids"] = 0


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

def tokenize(qa_pair):
    # more preprocessing if you want
    return tokenizer(qa_pair["text"], qa_pair["question"], padding="max_length")

data = hfds.from_pandas(df)
data = data.map(tokenize, batched=True)
data = data.remove_columns(["text", "question"]).with_format("torch")
dataloader = DataLoader(data, batch_size=2)

class T5FineTuner(pl.LightningModule):

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-base")

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        # batch is now:
        print(batch)
        
        input_ids, attention_mask, target_ids = batch.get("input_ids"), \
                                                batch.get("attention_mask"), \
                                                batch.get("target_ids")
        output = self.model(input_ids, attention_mask)
        output = output.logits
        loss = self.loss(output, target_ids)

        self.log("loss", loss, on_step=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)
        return optimizer
    
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath="outputs\pl",  # Specify the directory where checkpoints will be saved
    filename="model-{epoch:02d}-{val_loss:.2f}",  # Naming convention for checkpoints
    monitor="val_loss",  # Metric to monitor for saving the best checkpoint
    mode="min",  # Mode for monitoring ("min" for loss, "max" for accuracy, etc.)
)

trainer = pl.Trainer(max_epochs=5, accelerator="gpu", devices=1, callbacks=[checkpoint_callback])
trainer.fit(T5FineTuner(), dataloader)

and got:

ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

That error happens because T5 is an encoder-decoder model: its forward pass needs either labels or explicit decoder_input_ids in addition to the encoder inputs. As I do not know the exact target you want to train on, a runnable version of the code may look like this:

import pandas as pd
from torch.utils.data import DataLoader
from datasets import Dataset as hfds

df = pd.DataFrame({
    "text": ["O Brasil é um país localizado na América do Sul.",
              "A capital do Brasil é Brasília.",
              "A população do Brasil é de mais de 210 milhões de pessoas."],
    "question": ["Qual é o país localizado na América do Sul?",
                  "Qual é a capital do Brasil?",
                  "Qual é a população do Brasil?"]})

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

def tokenize(qa_pair):
    # more preprocessing if you want
    input_tokens_dict = tokenizer(qa_pair['text'],return_tensors="pt")
    output_token_dict = tokenizer(qa_pair['question'],return_tensors="pt")
    # not familiar with T5ForConditionalGeneration
    # copied from https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb, you may check it yourself
    input_ids, input_atten = input_tokens_dict.input_ids, input_tokens_dict.attention_mask
    output_ids, output_atten = output_token_dict.input_ids, output_token_dict.attention_mask
    output_ids = output_ids[:, :-1].contiguous()
    output_atten = output_atten[:, :-1].contiguous()

    labels = output_ids.clone()
    labels[output_ids == tokenizer.pad_token_id] = -100
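    # -100 is the label index that the model's cross-entropy loss ignores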
    
    return {
        "input_ids":input_ids.squeeze(0), "input_atten":input_atten.squeeze(0),
        "output_ids":output_ids.squeeze(0), "output_atten":output_atten.squeeze(0),
        "label":labels.squeeze(0)
    }

data = hfds.from_pandas(df)
data = data.map(tokenize)
data = data.remove_columns(["text", "question"]).with_format("torch")
dataloader = DataLoader(data, batch_size=1)
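# batch_size=1 because the tokenized sequences are not padded to a common length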


from transformers import T5ForConditionalGeneration
import torch

model = T5ForConditionalGeneration.from_pretrained("t5-base")

from lightning import LightningModule, Trainer

class T5FineTuner(LightningModule):

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-base")

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        input_ids, input_attention_mask, output_ids, output_attention_mask = batch.get("input_ids"), \
                                                batch.get("input_atten"), \
                                                batch.get("output_ids"), \
                                                batch.get("output_atten")
        labels = batch.get("label")
        output = self.model(
            input_ids  = input_ids, 
            attention_mask  = input_attention_mask,
            decoder_input_ids = output_ids,
            decoder_attention_mask = output_attention_mask,
            labels=labels
            )
        
        loss = output[0]
        self.log("loss", loss, on_step=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4)
        return optimizer

trainer = Trainer(max_epochs=5, accelerator="gpu", devices=1)
trainer.fit(model=T5FineTuner(), train_dataloaders=dataloader)

When you want to train an LLM, I personally recommend just using Hugging Face's own packages, because they are already designed to be easily reused.
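
For instance, here is a rough sketch of the same toy task with the transformers Seq2SeqTrainer (reusing the df defined above; the output_dir and the preprocess helper are just illustrative names, and I have not run this exact snippet):

from transformers import (AutoTokenizer, T5ForConditionalGeneration,
                          DataCollatorForSeq2Seq, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments)
from datasets import Dataset as hfds

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

def preprocess(example):
    # tokenize the context as the input and the question as the target
    enc = tokenizer(example["text"], truncation=True)
    enc["labels"] = tokenizer(example["question"], truncation=True)["input_ids"]
    return enc

data = hfds.from_pandas(df).map(preprocess, remove_columns=["text", "question"])

args = Seq2SeqTrainingArguments(output_dir="outputs/hf",
                                num_train_epochs=5,
                                per_device_train_batch_size=2)

trainer = Seq2SeqTrainer(model=model,
                         args=args,
                         train_dataset=data,
                         # pads inputs and labels per batch (labels padded with -100)
                         data_collator=DataCollatorForSeq2Seq(tokenizer, model=model))
trainer.train()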
