Hparams missing/not saved in checkpoints (self.save_hyperparameters() was called)

Hi,

I would like to access hparams of a trained model via MyLightningModule.load_from_checkpoint(my_ckpt_path). But I realized that the ckpt only has: dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers']). There is no hparams.

I did call self.save_hyperparameters(logger=False) in the init of the LightningModule:

class BetaVAE(pl.LightningModule): 
    # Only the constructor is shown in this excerpt.
    def __init__(self,
                 in_channels: int,
                 latent_dim: int,
                 hidden_dims: List = None,
                 **kwargs) -> None:
        """Initialise the module and record its constructor arguments.

        The arguments (including any extra ``**kwargs``) are stored in
        ``self.hparams`` by the ``save_hyperparameters`` call below.
        """
        super().__init__()
        # logger=False: keep the hyperparameters on the module without
        # forwarding them to the experiment logger.
        self.save_hyperparameters(logger=False)

After reading carefully through lightning posts online about saving hparams, I still could not figure out what went wrong in my code.

I am using:
pytorch 1.13.0
pytorch-cuda 11.6
pytorch-lightning 1.8.1

Any hints or suggestions are highly appreciated!

Hey @JXuann

In order to help you, we would need to look at your code. There is not enough information here. I don’t see anything wrong with what you have shared and the hyperparameters get correctly saved in our tests. You can verify it with a simple example:


import os

import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer


class RandomDataset(Dataset):
    """In-memory dataset of random vectors generated once at construction.

    Holds ``length`` samples; each sample is a 1-D tensor with ``size``
    entries drawn by ``torch.randn``.
    """

    def __init__(self, size, length):
        # One row per sample, one column per feature.
        self.data = torch.randn(length, size)
        self.len = length

    def __len__(self):
        """Return the number of samples requested at construction."""
        return self.len

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        sample = self.data[index]
        return sample


class BoringModel(LightningModule):
    """Minimal LightningModule used to check that hyperparameters are saved.

    The constructor arguments are not used by the network itself; they exist
    only so that ``save_hyperparameters`` has something to record.
    """

    def __init__(self, in_channels: int, latent_dim: int, hidden_dims=None):
        super().__init__()
        # Must run inside __init__ so the constructor arguments are captured.
        self.save_hyperparameters(logger=False)
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        """Apply the single linear layer to ``x``."""
        out = self.layer(x)
        return out

    def training_step(self, batch, batch_idx):
        """Use the summed model output as a dummy loss, log it and return it."""
        prediction = self(batch)
        loss = prediction.sum()
        self.log("train_loss", loss)
        return dict(loss=loss)

    def configure_optimizers(self):
        """Return a plain SGD optimizer over the linear layer's parameters."""
        layer_params = self.layer.parameters()
        return torch.optim.SGD(layer_params, lr=0.1)


def run():
    """Fit BoringModel for one short epoch and print the best checkpoint's keys."""
    dataset = RandomDataset(32, 64)
    train_data = DataLoader(dataset, batch_size=2)
    model = BoringModel(2, 3)

    # Keep the run as small as possible: one batch per stage, one epoch.
    trainer_options = dict(
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        limit_test_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        enable_model_summary=False,
    )
    trainer = Trainer(**trainer_options)
    trainer.fit(model, train_dataloaders=train_data)

    # Inspect the top-level entries of the checkpoint written during fit.
    best_path = trainer.checkpoint_callback.best_model_path
    checkpoint = torch.load(best_path)
    print(checkpoint.keys())


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()

Can you access self.hparams in your model? Is it empty?

Hi @awaelchli

Thanks for your quick reply.

I could see hparams by running your code:
dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers', 'hparams_name', 'hyper_parameters'])

But in my code, if I load the checkpoint right after training, I still could not see hparams among dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'])

My run.py:

# Command-line interface: location of the YAML config file and the name of
# the Weights & Biases run.
parser = argparse.ArgumentParser(description='Generic runner for VAE models')
parser.add_argument('--config', '-c',
                    dest="filename",
                    metavar='FILE',
                    help='path to the config file',
                    default='configs/vae.yaml')
# The run name is joined onto the logging directory path later in the script;
# a missing value (None) would only fail there with a TypeError, so require
# it up front and let argparse report the problem clearly.
parser.add_argument("--wandb_name",
                    dest="run_name",
                    required=True,
                    help="name of wandb run")
args = parser.parse_args()

# Load the experiment configuration from the YAML file given on the command
# line. A malformed file is reported and the error re-raised: carrying on
# would leave `config` unbound and fail later with an unrelated NameError.
with open(args.filename, 'r') as file:
    try:
        config = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        print(exc)
        raise

# Per-run output directory: <save_dir>/<run name>. It is reused below for the
# checkpoints and the profiler output.
wandb_log_dir = Path(config['logging_params']['save_dir']) / args.run_name
wandb_logger = WandbLogger(save_dir=wandb_log_dir,
                           name=args.run_name,
                           project="beta-vae"
                           )

# Seed from the configured manual seed before anything is constructed.
seed_everything(config['exp_params']['manual_seed'], True)

# Look up the model class by its configured name and build it from the whole
# 'model_params' section (the 'name' entry is passed along as a keyword too).
model = vae_models[config['model_params']['name']](**config['model_params'])
# The experiment module wraps the model; it is the module handed to the Trainer.
experiment = VAEXperiment(model,
                          config['data_params']['train_batch_size'], 
                          config['exp_params']
                          ) 
# Merge three config sections into one kwargs dict; on duplicate keys the
# later section wins.
ds_params = {**config["data_params"],
             **config["spec_params"],
             **config["predict_params"]}
data = VAEDataset(
    config["data_params"]["data_dir"],
    # NOTE(review): assumes trainer_params['gpus'] is a sized value such as a
    # list — confirm against the config schema.
    pin_memory=len(config['trainer_params']['gpus']) != 0,
    **ds_params
    )
data.setup()

# Keep the single best checkpoint according to val_loss, plus the last one.
checkpoint_callback = ModelCheckpoint(save_top_k=1, 
                                      dirpath =os.path.join(wandb_log_dir, "checkpoints"),
                                      filename='{epoch}-{val_loss:.2f}',
                                      monitor="val_loss",
                                      save_last=True
                                      )
# Override the filename template used for the "last" checkpoint.
checkpoint_callback.CHECKPOINT_NAME_LAST = "last_ckpt-{epoch}-{val_loss:.2f}"
profiler = AdvancedProfiler(dirpath=wandb_log_dir, filename=f"{args.run_name}")

# Resume only when resuming is enabled in the config AND the checkpoint file exists.
if config['train_params']['resume'] and Path(config['train_params']['ckpt_path']).exists():
    resume_ckpt = config['train_params']['ckpt_path']
else:
    resume_ckpt = None

# Remaining Trainer options come straight from the 'trainer_params' section.
runner = Trainer(logger=wandb_logger,
                 callbacks=[
                     LearningRateMonitor(), 
                     checkpoint_callback,
                     EarlyStopping(monitor="val_loss", mode="min", patience=config['train_params']['patience'])
                 ],
                 profiler=profiler,
                 **config['trainer_params'])

print("\n")
print(f"======= Training {config['model_params']['name']} =======")
runner.fit(experiment, datamodule=data, ckpt_path=resume_ckpt)

print("\n")
print(f"======= Testing the best model {config['model_params']['name']} =======")

# Evaluate using the best checkpoint recorded during fit.
runner.test(
    ckpt_path="best", 
    datamodule=data 
)

# Show which top-level entries the best checkpoint contains.
print(torch.load(runner.checkpoint_callback.best_model_path).keys())

wandb.finish()

Thanks in advance for looking into it. Please let me know if you need other parts of my code.

The self.hparams are fine (printed out from init):

"beta":              1
"gamma":             10.0
"hidden_dims":       None
"in_channels":       1
"latent_dim":        128
"loss_type":         B
"max_capacity":      25
"name":              BetaVAE
"snr_weighting":     True

In case it helps, at runner.test(ckpt_path="best", datamodule=data ) ,

    def test_step(self, batch, batch_idx, optimizer_idx = 0):
        """run the test step on the best model before terminating the whole workflow"""
        print("In test step, model.hparams are: ", self.model.hparams)
        real_img, labels, norm_snr = batch 
        self.curr_device = self.device 

        results = self.forward(real_img)  
        test_loss = self.model.loss_function(*results,
                                            M_N=1.0,  # real_img.shape[0]/ self.num_test_imgs,
                                            optimizer_idx=optimizer_idx,
                                            batch_idx=batch_idx)

        self.log_dict({f"val_{key}": val.item() for key, val in test_loss.items()}, on_step=False, on_epoch=True,
                      sync_dist=True, batch_size=real_img.shape[0]) 

I could still access

model.hparams are:  
"Capacity_max_iter": 10000
"beta":              1
"gamma":             10.0
"hidden_dims":       [512, 256, 128, 64, 32]
"in_channels":       512
"latent_dim":        128
"loss_type":         B
"max_capacity":      25
"name":              BetaVAE
"snr_weighting":     True

But in the next step, load_from_checkpoint, the hparams are no longer in the dict. I guess something went wrong with the checkpoint.

Hey @awaelchli

I finally figured out what went wrong in my code:
(My code was built upon this repo: GitHub - AntixK/PyTorch-VAE: A Collection of Variational Autoencoders (VAE) in PyTorch.)

My training_step, validation_step, test_step etc. are not inside the model class but reside in:

class VAEXperiment(pl.LightningModule):
    """Experiment wrapper holding a VAE model and its training parameters.

    Only the constructor is shown in this excerpt.
    """

    def __init__(self,
                 vae_model: BaseVAE,
                 train_batch_size: int,
                 params: dict) -> None:
        """Store the wrapped model and the experiment settings.

        Args:
            vae_model: The model to wrap; kept as ``self.model``.
            train_batch_size: Kept as ``self.train_batch_size``.
            params: Experiment parameters; kept as ``self.params``. The
                optional ``'retain_first_backpass'`` entry sets
                ``self.hold_graph`` (default ``False``).
        """
        super(VAEXperiment, self).__init__()

        self.model = vae_model
        self.train_batch_size = train_batch_size
        self.params = params
        self.curr_device = None
        self.hold_graph = False
        try:
            self.hold_graph = self.params['retain_first_backpass']
        except (LookupError, TypeError):
            # Option absent, or params is not subscriptable (e.g. None): keep
            # the default. Unlike a bare `except`, this does not swallow
            # KeyboardInterrupt or SystemExit.
            pass

When I call self.save_hyperparameters(logger=False) inside VAEXperiment, the hparams are saved as expected:

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers', 'hparams_name', 'hyper_parameters'])

'hyper_parameters':
{'train_batch_size': 32, 'params': {'LR': 1e-05, 'weight_decay': 0.0, 'scheduler_gamma': 0.95, 'kld_weight': 0.0006, 'manual_seed': 1265}}

My next question is: is there a way to save hparams from the model class in the ckpt?

Many thanks in advance!

I’m sorry, I don’t understand your follow-up question. What do you mean by “from the model class”? Could you show an example of how you would expect to call it?

Hi @awaelchli ,

Basically, I have two LightningModules: one defines the model and the other one defines the training/experiment steps. Only the hparams of the latter were saved to the ckpt.

Here is my model class, which is also a LightningModule. (But the hparams from this are not saved to ckpt.)

class BetaVAE(pl.LightningModule): 
    # Model definition only: the method bodies are elided ("xxx") in this
    # excerpt, and no training/validation/test steps are defined here.
    def __init__(self,
                 in_channels: int,
                 latent_dim: int,
                 hidden_dims: List = None,
                 **kwargs) -> None:
        super().__init__()
        # Records the constructor arguments (including extra **kwargs) in
        # self.hparams; logger=False keeps them from being sent to the logger.
        self.save_hyperparameters(logger=False)

    # Placeholder bodies below: only the signatures are meaningful here.
    def encode(self, input: Tensor) -> List[Tensor]:
        xxx
    def decode(self, z: Tensor) -> Tensor:
        xxx
    def reparameterize(self, mu: Tensor, logvar: Tensor) -> Tensor:
        xxx
    def forward(self, input: Tensor, **kwargs) -> Tensor:
        xxx    
    def loss_function(self,
                      *args,
                      **kwargs) -> dict:
        xxx
    def sample(self,
               num_samples:int,
               current_device: int, **kwargs) -> Tensor:
        xxx
    def generate(self, x: Tensor, **kwargs) -> Tensor:
        xxx

Currently the hparams of class VAEXperiment(pl.LightningModule) (described in the previous post) are saved to the ckpt. I guess it is because training_step, validation_step and test_step are under this class.

I would like the hparams of the class BetaVAE (i.e. in_channels, latent_dim, hidden_dims) to also be saved to the hparams of the ckpt. Is there a way of adding them to the dict?

I hope I explained myself in an understandable way. Otherwise, please do ask again. Many thanks!

Hi @awaelchli ,

I finally got it by adding the hparams of one LightningModule to the hparams dict of the other one!

class VAEXperiment(pl.LightningModule):
    # Only the start of the constructor is shown in this excerpt.

    def __init__(self,
                 vae_model: BaseVAE,
                 train_batch_size: int,
                 params: dict) -> None:
        super(VAEXperiment, self).__init__()
        # Record this module's own constructor arguments in self.hparams.
        self.save_hyperparameters(logger=False)

        self.model = vae_model
        # Copy the wrapped model's hyperparameters into this module's hparams
        # so that they are stored alongside this module's own entries.
        self.hparams.update(self.model.hparams) # <-------my solution

Before we close this thread, please help me understand saving hparams in the ckpt better:
In my case there are two LightningModules, and apparently only the hparams of one were saved automatically to the ckpt, even though both call self.save_hyperparameters() in init.

My question is: how is it decided in the background which LightningModule’s hparams are saved to ckpt? Many thanks!
(I attempted to look into your github code but I must admit that I did not figure it out there.)