Hi @awaelchli
Thanks for your quick reply.
I can indeed see hparams when I run your code:
dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers', 'hparams_name', 'hyper_parameters'])
But in my own code, if I load the checkpoint right after training, I still cannot see hparams; the keys are only:
dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'])
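For reference, my understanding is that the hyper_parameters entry is only written to the checkpoint when the LightningModule calls self.save_hyperparameters() in its __init__. A minimal sketch of that pattern, using placeholder names (MinimalModule, latent_dim, lr) rather than my actual VAEXperiment, would be:

import torch
import pytorch_lightning as pl

class MinimalModule(pl.LightningModule):
    # Placeholder module for illustration only, not my real VAEXperiment.
    def __init__(self, latent_dim: int = 16, lr: float = 1e-3):
        super().__init__()
        # As far as I understand, this call is what makes Lightning store
        # the __init__ arguments under the 'hyper_parameters' key of the .ckpt file.
        self.save_hyperparameters()
        self.layer = torch.nn.Linear(latent_dim, latent_dim)

    def configure_optimizers(self):
        # save_hyperparameters() also exposes the arguments via self.hparams
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)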
My run.py:
parser = argparse.ArgumentParser(description='Generic runner for VAE models')
parser.add_argument('--config', '-c',
                    dest="filename",
                    metavar='FILE',
                    help='path to the config file',
                    default='configs/vae.yaml')
parser.add_argument("--wandb_name",
                    dest="run_name",
                    help="name of wandb run")
args = parser.parse_args()

with open(args.filename, 'r') as file:
    try:
        config = yaml.safe_load(file)
    except yaml.YAMLError as exc:
        print(exc)

wandb_log_dir = Path(config['logging_params']['save_dir']) / args.run_name
wandb_logger = WandbLogger(save_dir=wandb_log_dir,
                           name=args.run_name,
                           project="beta-vae")

seed_everything(config['exp_params']['manual_seed'], True)

model = vae_models[config['model_params']['name']](**config['model_params'])
experiment = VAEXperiment(model,
                          config['data_params']['train_batch_size'],
                          config['exp_params'])

ds_params = {**config["data_params"],
             **config["spec_params"],
             **config["predict_params"]}
data = VAEDataset(
    config["data_params"]["data_dir"],
    pin_memory=len(config['trainer_params']['gpus']) != 0,
    **ds_params
)
data.setup()

checkpoint_callback = ModelCheckpoint(save_top_k=1,
                                      dirpath=os.path.join(wandb_log_dir, "checkpoints"),
                                      filename='{epoch}-{val_loss:.2f}',
                                      monitor="val_loss",
                                      save_last=True)
checkpoint_callback.CHECKPOINT_NAME_LAST = "last_ckpt-{epoch}-{val_loss:.2f}"

profiler = AdvancedProfiler(dirpath=wandb_log_dir, filename=f"{args.run_name}")

if config['train_params']['resume'] and Path(config['train_params']['ckpt_path']).exists():
    resume_ckpt = config['train_params']['ckpt_path']
else:
    resume_ckpt = None

runner = Trainer(logger=wandb_logger,
                 callbacks=[
                     LearningRateMonitor(),
                     checkpoint_callback,
                     EarlyStopping(monitor="val_loss", mode="min",
                                   patience=config['train_params']['patience'])
                 ],
                 profiler=profiler,
                 **config['trainer_params'])

print("\n")
print(f"======= Training {config['model_params']['name']} =======")
runner.fit(experiment, datamodule=data, ckpt_path=resume_ckpt)

print("\n")
print(f"======= Testing the best model {config['model_params']['name']} =======")
runner.test(
    ckpt_path="best",
    datamodule=data
)

print(torch.load(runner.checkpoint_callback.best_model_path).keys())

wandb.finish()
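In case it helps to reproduce, the same check can also be done outside of run.py on the file written to disk; the path below is only illustrative and mirrors the dirpath used above:

import torch

# illustrative path; the real file name follows CHECKPOINT_NAME_LAST above
ckpt = torch.load("path/to/wandb_log_dir/checkpoints/last_ckpt-....ckpt", map_location="cpu")
print(ckpt.keys())
# 'hyper_parameters' is currently not among the keys, so this prints None
print(ckpt.get("hyper_parameters"))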
Thanks in advance for looking into it. Please let me know if you need other parts of my code.