I am trying to train a ResNet model in PyTorch using the ImageNet example from the GitHub repository.
Here is what my train method looks like (it is almost identical to the one in the example):
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch over ``train_loader``.

    For every batch this performs a forward pass, computes the loss with
    ``criterion``, back-propagates, and steps ``optimizer``. Running
    timing / loss / top-1 statistics are kept in ``AverageMeter`` objects
    and printed every ``args.print_freq`` batches.

    Args:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to train; switched to train mode here.
        criterion: callable returning the loss for ``(output, target)``.
        optimizer: optimizer whose ``zero_grad``/``step`` are called per batch.
        epoch: current epoch index, used only for logging.

    Relies on ``AverageMeter``, ``accuracy``, ``get_args`` and the ``cuda``
    flag, all of which are defined outside this function.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    args = get_args()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        print(i)

        # data loading time
        data_time.update(time.time() - end)

        if cuda:
            # `async` is a reserved word since Python 3.7, and PyTorch 0.4.0
            # renamed this keyword argument to `non_blocking`.
            target = target.cuda(non_blocking=True)
            images = images.cuda()
        # Since PyTorch 0.4.0 tensors track gradients themselves, so the
        # torch.autograd.Variable wrappers are no longer needed.

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(prec1.item(), images.size(0))
        # top5.update(prec5.item(), images.size(0))

        # compute gradient and do optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print to console and write logs to tensorboard
        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))
            # Global step index for the (currently disabled) tensorboard writer.
            niter = epoch * len(train_loader) + i
            # writer.add_scalar('Train/Loss', losses.val, niter)
            # writer.add_scalar('Train/Prec@1', top1.val, niter)
            # writer.add_scalar('Train/Prec@5', top5.val, niter)
System information: GPU: Nvidia Titan Xp; Memory: 32 GB
PyTorch: 0.4.0
When I run this code, training starts with epoch 0 and prints:
Epoch: [0][0/108] Time 5.644 (5.644) Data 1.929 (1.929) Loss 6.9052 (6.9052) Prec@1 0.000 (0.000)
And then the remote server automatically disconnects. It happened five times.
And this is the data loader:
# Training data: images are read from the 'train' folder, randomly cropped
# to 224 and randomly flipped, converted to tensors, then normalized.
traindir = 'train'
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])
train_dataset = datasets.ImageFolder(traindir, train_transform)
# Shuffled loader; batch size and worker count come from the parsed args.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.num_workers,
    pin_memory=cuda,
)
# Validation data: images are read from the 'valid' folder, resized to 256,
# center-cropped to 224, converted to tensors, then normalized (no shuffling).
valdir = 'valid'
valid_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
valid_dataset = datasets.ImageFolder(valdir, valid_transform)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.num_workers,
    pin_memory=cuda,
)
# NOTE(review): this is an excerpt from inside a function whose header is not
# shown (it uses a bare `return`), and the paste has lost its indentation.
# Evaluation-only mode: run a single validation pass and exit without training.
if args.evaluate:
validate(valid_loader, model, criterion, epoch=0)
return
# Main loop: for each epoch, train once and then validate once.
for epoch in range(args.start_epoch, args.epochs):
# adjust_learning_rate is defined elsewhere; presumably it applies the
# learning-rate schedule for this epoch -- confirm against its definition.
adjust_learning_rate(optimizer, epoch)
# train for epoch
train(train_loader, model, criterion, optimizer, epoch)
# evaluate on valid
prec1 = validate(valid_loader, model, criterion, epoch)
# remember best prec1 and save checkpoint
# best_prec1 is read before being assigned here, so it must be initialised
# by code outside this excerpt.
is_best = prec1 > best_prec1
best_prec1 = max(prec1, best_prec1)
# The stored 'epoch' is epoch + 1; model and optimizer state are saved
# together with the best precision so far.
save_checkpoint({
'epoch': epoch + 1,
'arch': args.arch,
'state_dict': model.state_dict(),
'best_prec1': best_prec1,
'optimizer': optimizer.state_dict()
}, is_best)
With these parameters for the loader:
args.num_workers = 4
args.batch_size = 32
pin_memory = torch.cuda.is_available()
Is there something wrong in my approach?