Hello community,
I’m currently working on fine-tuning the AnyDoor model by adding LoRA layers, inspired by a technique I found in this post. I’ve successfully integrated LoRA layers into specific parts of the model, but when I start the training process, PyTorch’s autograd raises a RuntimeError ("One of the differentiated Tensors does not require grad"), which is related to tensor differentiation.
Below is the relevant section of my code where I define the LoRA layers and attempt to substitute the original model layers with these:
# Debugging aid: anomaly detection is switched on while tracking down the backward-pass error.
torch.autograd.set_detect_anomaly(True)
class LoRALayer(torch.nn.Module):
    """Low-rank adapter that computes ``alpha * (x @ W_a @ W_b)``.

    ``W_a`` has shape ``(in_dim, rank)`` and is sampled from a standard
    normal scaled by ``1 / sqrt(rank)``. ``W_b`` has shape
    ``(rank, out_dim)`` and is initialised to zeros, so the product
    ``W_a @ W_b`` is zero right after construction.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.alpha = alpha
        # Standard deviation used for the random init of the first factor.
        init_scale = 1 / torch.sqrt(torch.tensor(rank).float())
        first_factor = torch.randn(in_dim, rank) * init_scale
        second_factor = torch.zeros(rank, out_dim)
        # Registration order (W_a, then W_b) is kept as in the original.
        self.W_a = torch.nn.Parameter(first_factor)
        self.W_b = torch.nn.Parameter(second_factor)

    def forward(self, x):
        """Return the scaled low-rank projection of ``x``."""
        projected = x @ self.W_a
        return self.alpha * (projected @ self.W_b)
class LinearWithLoRA(torch.nn.Module):
    """Wrap a linear layer and add a LoRA branch to its output.

    The adapter dimensions are read from ``linear.in_features`` and
    ``linear.out_features``; ``rank`` and ``alpha`` are forwarded to
    ``LoRALayer`` unchanged.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        in_dim, out_dim = linear.in_features, linear.out_features
        self.linear = linear
        self.lora = LoRALayer(in_dim, out_dim, rank, alpha)

    def forward(self, x):
        """Return ``linear(x) + lora(x)``."""
        base_out = self.linear(x)
        lora_out = self.lora(x)
        return base_out + lora_out
# Runtime toggles
save_memory = False  # when True, sliced attention is enabled just below

disable_verbosity()
if save_memory:
    enable_sliced_attention()

# Configs
resume_path = ".ckpt/epoch=1-step=8687_ft.ckpt"  # checkpoint loaded into the model before training
learning_rate = 1e-5
batch_size = 1
accumulate_grad_batches = 1
n_gpus = 1
logger_freq = 1000  # handed to ImageLogger as batch_frequency
sd_locked = False
only_mid_control = False
# Build and load the model on CPU first; PyTorch Lightning moves it to the GPU(s) afterwards.
model = create_model("./configs/anydoor.yaml").cpu()
model.load_state_dict(load_state_dict(resume_path, location="cpu"))

# Copy the training options onto the model object.
model.learning_rate = learning_rate
model.sd_locked = sd_locked
model.only_mid_control = only_mid_control

# Freeze every parameter except those whose name contains the output-block path.
# NOTE(review): the traceback in this post fails inside torch.autograd.grad, called from the
# custom backward in ldm/modules/diffusionmodules/util.py. Presumably that checkpoint
# differentiates with respect to module parameters, in which case any frozen parameter
# reaching it would raise "does not require grad" -- confirm against util.py.
for name, param in model.named_parameters():
    param.requires_grad = "model.diffusion_model.output_blocks" in name
# LoRA hyper-parameters.
lora_r = 8
lora_alpha = 16
# NOTE: lora_dropout is defined but never used in the visible code
# (LoRALayer / LinearWithLoRA take no dropout argument).
lora_dropout = 0.05

# Factory that wraps a Linear layer with a LoRA branch using the settings above.
assign_lora = partial(LinearWithLoRA, rank=lora_r, alpha=lora_alpha)

for block in model.model.diffusion_model.output_blocks:
    for layer in block:
        # Some Linear layers where LoRA is applied. Both raise the error.
        if isinstance(layer, ResBlock):
            # emb_layers is a Sequential containing Linear layers.
            emb_layers = layer.emb_layers
            # The inner loop uses its own variable name: reusing `layer` here
            # would rebind the outer loop variable, so the SpatialTransformer
            # check below would inspect a sub-layer instead of the block member.
            for idx, sub_layer in enumerate(emb_layers):
                if isinstance(sub_layer, torch.nn.Linear):
                    # Replace the Linear layer in place with its LoRA-wrapped version.
                    emb_layers[idx] = assign_lora(sub_layer)
        if isinstance(layer, SpatialTransformer):
            layer.proj_in = assign_lora(layer.proj_in)
# Count the elements of every parameter that still requires gradients.
trainable_count = 0
for weight in model.parameters():
    if weight.requires_grad:
        trainable_count += weight.numel()
print("trainable parameters: ", trainable_count)

# Dump each parameter name with its requires_grad flag, one per line.
with open("model_parameters.txt", "w") as param_dump:
    param_dump.writelines(
        f"{param_name}: {tensor.requires_grad}\n"
        for param_name, tensor in model.named_parameters()
    )

# Dump the printed module structure (after the LoRA substitution) for inspection.
with open("lora_model.txt", "w") as arch_dump:
    print(model, file=arch_dump)
# Datasets
DConf = OmegaConf.load("./configs/datasets.yaml")
dataset = VitonHDDataset(**DConf.Train.VitonHD)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
)

# Trainer: image-logging callback plus single-node DDP in 16-bit precision.
logger = ImageLogger(batch_frequency=logger_freq)
trainer_options = dict(
    gpus=n_gpus,
    strategy="ddp",
    precision=16,
    accelerator="gpu",
    callbacks=[logger],
    progress_bar_refresh_rate=1,
    accumulate_grad_batches=accumulate_grad_batches,
)
trainer = pl.Trainer(**trainer_options)

# Train
trainer.fit(model, dataloader)
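I’ve made sure to freeze the parameters of the original model so that gradients are only needed for the newly added LoRA layers (note that in the code above, the parameters under model.diffusion_model.output_blocks are also left trainable). However, as soon as training starts, I encounter the following error: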
self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
File "/opt/conda/envs/anydoor/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 91, in backward
model.backward(closure_loss, optimizer, *args, **kwargs)
File "/opt/conda/envs/anydoor/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1444, in backward
loss.backward(*args, **kwargs)
File "/opt/conda/envs/anydoor/lib/python3.8/site-packages/torch/_tensor.py", line 487, in backward
torch.autograd.backward(
File "/opt/conda/envs/anydoor/lib/python3.8/site-packages/torch/autograd/__init__.py", line 200, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/opt/conda/envs/anydoor/lib/python3.8/site-packages/torch/autograd/function.py", line 274, in apply
return user_fn(self, *args)
File "/home/ubuntu/mnt/myData/AnyDoor/ldm/modules/diffusionmodules/util.py", line 142, in backward
input_grads = torch.autograd.grad(
File "/opt/conda/envs/anydoor/lib/python3.8/site-packages/torch/autograd/__init__.py", line 303, in grad
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: One of the differentiated Tensors does not require grad
This error is raised when I call trainer.fit(model, dataloader) using PyTorch Lightning’s Trainer class.
I’ve already tried enabling torch.autograd.set_detect_anomaly(True) to pinpoint the issue, but the additional information provided hasn’t led me to a clear solution. The error seems to indicate a problem with tensor differentiation, possibly suggesting that a tensor involved in the computation does not have its requires_grad property set correctly. However, I’m not directly manipulating tensors’ requires_grad property except for the initial parameter freezing and subsequent modification to incorporate LoRA layers.
Has anyone encountered a similar issue or can offer insights into what might be causing this error? I’m particularly interested in understanding how to correctly integrate custom layers like LoRA into existing models without disrupting the autograd mechanism.
Any help or pointers would be greatly appreciated!