System hangs after first epoch training in PyTorch
So, I was trying to train a ResNet model in PyTorch using the ImageNet example from the GitHub repository.

Here's what my train method looks like (it is almost identical to the one in the example):

def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    args = get_args()

    # switch to train mode
    model.train()

    end = time.time()

    for i, (input, target) in enumerate(train_loader):
        print(i)
        # data loading time
        data_time.update(time.time() - end)

        if cuda:
            target = target.cuda(async=True)
            input_var = torch.autograd.Variable(input).cuda()
        else:
            input_var = torch.autograd.Variable(input)

        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))
        # top5.update(prec5.item(), input.size(0))

        # compute gradient and do optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print to console and write logs to tensorboard
        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'.format(
                epoch, i, len(train_loader), batch_time=batch_time,
                data_time=data_time, loss=losses, top1=top1, top5=top5))
            niter = epoch * len(train_loader) + i
            # writer.add_scalar('Train/Loss', losses.val, niter)
            # writer.add_scalar('Train/Prec@1', top1.val, niter)
            # writer.add_scalar('Train/Prec@5', top5.val, niter)
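
For reference, AverageMeter and accuracy are helper utilities from the same ImageNet example; a minimal sketch of AverageMeter (assuming the standard pytorch/examples version) looks like this:

class AverageMeter(object):
    # Tracks the current value, running sum, count and running average.
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

accuracy(output, target, topk=(1, 5)) returns the top-1 and top-5 precision of the batch as percentages.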

System Information: GPU: Nvidia Titan Xp, Memory: 32 GB

PyTorch: 0.4.0

When I run this code, training starts with epoch 0:

Epoch: [0][0/108]   Time 5.644 (5.644)  Data 1.929 (1.929)  Loss 6.9052 (6.9052)    Prec@1 0.000 (0.000)

And then the remote server automatically disconnects. This has happened five times.

And this is how the data loaders are set up:

    # Load the data --> TRAIN
    traindir = 'train'
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(traindir, transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
        pin_memory=cuda
    )

    # Load the data --> Validation
    valdir = 'valid'
    valid_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers,
        pin_memory=cuda
    )

    if args.evaluate:
        validate(valid_loader, model, criterion, epoch=0)
        return

    # Start
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on valid
        prec1 = validate(valid_loader, model, criterion, epoch)

        # remember best prec1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict()
        }, is_best)

With these params for the loader:

args.num_workers = 4
args.batch_size = 32
pin_memory = torch.cuda.is_available()

Is there something wrong with my approach?

Paresis answered 18/6, 2018 at 14:35

Seems like a bug in PyTorch's DataLoader.

Try args.num_workers = 0.
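
With num_workers > 0 the DataLoader spawns worker subprocesses that hand batches back to the main process; if one of those workers gets stuck (for example on shared memory), training can stall right after the first batch. Setting num_workers=0 keeps data loading in the main process. A sketch, reusing the names from the question:

# load data in the main process; no worker subprocesses that could hang
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size, shuffle=True,
    num_workers=0,
    pin_memory=cuda
)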

Lodgment answered 23/6, 2020 at 14:53
Comment from Selenodont: where did you get this from?
