I am using a Unet model for semantic segmentation - I have a custom dataset of images and their masks both in .png format. I have looked in the online forums and tried stuff, but not much works? Any suggestions in how to resolve the error or improve the code would be helpful.
model.eval()
with torch.no_grad():
for xb, yb in val_dl:
yb_pred = model(xb.to(device))
# yb_pred = yb_pred["out"].cpu()
print(yb_pred.shape)
yb_pred = torch.argmax(yb_pred,axis = 1)
break
print(yb_pred.shape)
criteron = nn.CrossEntropyLoss(reduction = 'sum')
opt = optim.Adam(model.parameters(), lr = 3e-4)
def loss_batch(loss_func, output, target, opt = None):
loss = loss_func(output, target)
if opt is not None:
opt.zero_grad()
loss.backward()
opt.step()
return loss.item(), None
lr_scheduler = ReduceLROnPlateau(opt, mode = 'min', factor = 0.5, patience= 20, verbose = 1)
def get_lr(opt):
for param_group in opt.param_groups:
return param_group['lr']
current_lr = get_lr(opt)
print('current_lr = {}'.format(current_lr))
def loss_epoch(model, loss_func, dataset_dl, sanity_check = False, opt = None):
running_loss = 0.0
len_data = len(dataset_dl.dataset)
for xb, yb in dataset_dl:
xb = xb.to(device)
yb = yb.to(device)
# xb = torch.tensor(xbh, requires_grad=True)
output = model(xb)
loss_b, metric_b = loss_batch(loss_func, output, yb, opt)
running_loss += loss_b
if sanity_check is True:
break
loss = running_loss/float(len_data)
return loss, None
def train_val(model, params):
num_epochs = params["num_epochs"]
loss_func = params["loss_func"]
opt = params["optimizer"]
train_dl = params["train_dl"]
val_dl = params["val_dl"]
sanity_check = params["sanity_check"]
lr_scheduler = params["lr_scheduler"]
path2weights = params["path2weights"]
loss_history = {"train": [],
"val": []}
best_model_wts = copy.deepcopy(model.state_dict())
best_loss = float('inf')
for epoch in range(num_epochs):
current_lr = get_lr(opt)
print('Epoch {}/{}, current_lr = {}'.format(epoch, num_epochs - 1, current_lr))
with torch.enable_grad():
model.train()
train_loss, _ = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
loss_history["train"].append(train_loss)
model.eval()
with torch.no_grad():
val_loss, _ = loss_epoch(model, loss_func, val_dl, sanity_check, opt)
loss_history["val"].append(val_loss)
if val_loss < best_loss:
best_loss = val_loss
best_model_wts = copy.deepcopy(model.state_dict())
torch.save(model.state_dict(), path2weights)
print("copied best model weights!!")
lr_scheduler.step(val_loss)
if current_lr != get_lr(opt):
print("Loading best model weights!!")
model.load_state_dict(best_model_wts)
print("train Loss: %.6f" %(train_loss))
print("val_loss: %.6f" %(val_loss))
print("-"*20)
model.load_state_dict(best_model_wts)
return model, loss_history, metric_history
path2models = "./models/"
if not os.path.exists(path2models):
os.mkdir(path2models)
param_train = {
"num_epochs": 10,
"loss_func": criteron,
"optimizer": opt,
"train_dl": train_dl,
"val_dl": val_dl,
"sanity_check": False,
"lr_scheduler": lr_scheduler,
"path2weights": path2models + "weights.pt"
model, loss_hist, _ = train_val(model, param_train)
The error message looks like:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
And here is the traceback:
File "<ipython-input-108-1ef24c0b1593>", line 10, in <module>
model, loss_hist, _ = train_val(model, param_train)
File "<ipython-input-106-53830bafab8b>", line 27, in train_val
val_loss, _ = loss_epoch(model, loss_func, val_dl, sanity_check, opt)
File "<ipython-input-104-5fc229145602>", line 13, in loss_epoch
loss_b, metric_b = loss_batch(loss_func, output, yb, opt)
File "<ipython-input-100-68322a002c04>", line 6, in loss_batch
loss.backward()
File "C:\Users\W540\anaconda3\lib\site-packages\torch\tensor.py", line 198, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "C:\Users\W540\anaconda3\lib\site-packages\torch\autograd\__init__.py", line 100, in backward
allow_unreachable=True) # allow_unreachable flag
I am not sure which variable to set as require_grad = True or where I should enable grad...