How to solve ' CUDA out of memory. Tried to allocate xxx MiB' in pytorch?
Asked Answered
D

2

11

I am trying to train a CNN in PyTorch, but I am running into a problem. The RuntimeError:

RuntimeError: CUDA out of memory. Tried to allocate 512.00 MiB (GPU 0; 2.00 GiB total capacity; 584.97 MiB already allocated; 13.81 MiB free; 590.00 MiB reserved in total by PyTorch)

This is my code:

import os
import numpy as np
import cv2
import torch as t
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader,Dataset
import time
import matplotlib.pyplot as plt
# Jupyter/IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Root of the food-11 dataset (Lee ML 2020 HW3); each split is a subfolder.
root_path='C:/Users/60960/Desktop/recet-task/course_LeeML20/course_LeeML20-datasets/hw3/food-11'
training_path=root_path+'/training'
testing_path=root_path+'/testing'
validation_path=root_path+'/validation'
def readfile(path, has_label):
    """Load every image under *path* as 128x128x3 BGR uint8 arrays.

    Filenames are read in sorted order. When ``has_label`` is True the
    class index is parsed from the ``<label>_`` filename prefix and the
    function returns ``(images, labels)``; otherwise it returns images only.
    """
    files = sorted(os.listdir(path))
    imgs = np.zeros((len(files), 128, 128, 3), dtype=np.uint8)
    labels = np.zeros((len(files),), dtype=np.uint8)
    for idx, name in enumerate(files):
        # NOTE(review): cv2.imread returns None for unreadable files — assumes
        # the directory contains only valid images.
        raw = cv2.imread(path + '/' + name)
        imgs[idx, :, :] = cv2.resize(raw, (128, 128))
        if has_label:
            labels[idx] = int(name.split('_')[0])
    return (imgs, labels) if has_label else imgs
def show_img(img_from_cv2):
    """Display an OpenCV image with matplotlib (BGR channels swapped to RGB)."""
    b, g, r = cv2.split(img_from_cv2)
    plt.imshow(cv2.merge([r, g, b]))
    plt.show()
# Load all three splits into RAM as raw uint8 arrays.
x_train,y_train=readfile(training_path,True)
x_val,y_val=readfile(validation_path,True)
x_test=readfile(testing_path,False)
# Training-time augmentation; ToTensor converts HWC uint8 to CHW float in [0,1].
train_transform=transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])
# Evaluation transform: no augmentation, just tensor conversion.
test_transform=transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor()
])
class ImgDataset(Dataset):
    """Dataset over image arrays with optional integer labels and transform.

    ``__getitem__`` returns ``(image, label)`` for labelled data and just
    ``image`` otherwise; the transform (if any) is applied to the image.
    """

    def __init__(self, x, y=None, transform=None):
        self.x = x
        # Labels become LongTensors, as required by CrossEntropyLoss.
        self.y = None if y is None else t.LongTensor(y)
        self.transform = transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        sample = self.x[idx]
        if self.transform is not None:
            sample = self.transform(sample)
        if self.y is None:
            return sample
        return sample, self.y[idx]
# NOTE(review): batch_size=128 with this model is a likely contributor to the
# CUDA OOM on a 2 GiB GPU — try 32 or lower.
batch_size=128
train_set=ImgDataset(x_train,y_train,transform=train_transform)
val_set=ImgDataset(x_val,y_val,transform=test_transform)
train_loader=DataLoader(train_set,batch_size=batch_size,shuffle=True)
val_loader=DataLoader(val_set,batch_size=batch_size,shuffle=False)
class Classifier(nn.Module):
    """VGG-style CNN for 11-class food classification.

    Five conv blocks reduce a 3x128x128 input to 512x4x4; a three-layer
    MLP head maps the flattened features to 11 logits.
    """

    def __init__(self):
        super(Classifier, self).__init__()

        def conv_block(c_in, c_out):
            # Conv -> BatchNorm -> ReLU -> 2x2 max-pool (halves spatial size).
            return [
                nn.Conv2d(c_in, c_out, 3, 1, 1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2, 2, 0),
            ]

        widths = [(3, 64), (64, 128), (128, 256), (256, 512), (512, 512)]
        layers = []
        for c_in, c_out in widths:
            layers += conv_block(c_in, c_out)
        self.cnn = nn.Sequential(*layers)

        self.fc = nn.Sequential(
            nn.Linear(512 * 4 * 4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 11),
        )

    def forward(self, x):
        features = self.cnn(x)
        flat = features.view(features.size(0), -1)
        return self.fc(flat)
# Train on the training split; report train/val accuracy and loss per epoch.
# The OOM in the question is raised at model(data[0].cuda()) below: the forward
# pass's activations do not fit in 2 GiB alongside the model parameters.
model=Classifier().cuda()
loss_fn=nn.CrossEntropyLoss()
optim=t.optim.Adam(model.parameters(),lr=0.001)
epochs=30
for epoch in range(epochs):
    epoch_start_time=time.time()
    train_acc=0.0
    train_loss=0.0
    val_acc=0.0
    val_loss=0.0
    model.train()
    for i,data in enumerate(train_loader):
        optim.zero_grad()
        train_pred=model(data[0].cuda())
        batch_loss=loss_fn(train_pred,data[1].cuda())
        batch_loss.backward()
        optim.step()
        # .numpy()/.item() detach from the autograd graph, so per-batch
        # tensors can be freed instead of accumulating GPU memory.
        train_acc+=np.sum(np.argmax(train_pred.cpu().data.numpy(),axis=1)==data[1].numpy())
        train_loss+=batch_loss.item()
    model.eval()
    # no_grad: skip building the autograd graph during validation.
    with t.no_grad():
        for i,data in enumerate(val_loader):
            val_pred=model(data[0].cuda())
            batch_loss=loss_fn(val_pred,data[1].cuda())
            val_acc+=np.sum(np.argmax(val_pred.cpu().data.numpy(),axis=1)==data[1].numpy())
            val_loss+=batch_loss.item()
        print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % (epoch + 1, epochs, time.time()-epoch_start_time,train_acc/train_set.__len__(), train_loss/train_set.__len__(), val_acc/val_set.__len__(), val_loss/val_set.__len__()))
# Retrain a fresh model on train+validation combined (final-model step).
x_train_val=np.concatenate((x_train,x_val),axis=0)
y_train_val=np.concatenate((y_train,y_val),axis=0)
# BUG FIX: the labels argument was x_train_val (the images) instead of
# y_train_val, so the dataset would have been built with bogus labels.
train_val_set=ImgDataset(x_train_val,y_train_val,train_transform)
train_val_loader=DataLoader(train_val_set,batch_size=batch_size,shuffle=True)
model_final=Classifier().cuda()
# BUG FIX: torch.nn has no CrossEntropy class; the loss is CrossEntropyLoss
# (matching the first training loop above).
loss_fn=nn.CrossEntropyLoss()
optim=t.optim.Adam(model_final.parameters(),lr=0.001)
epochs=30
for epoch in range(epochs):
    epoch_start_time=time.time()
    train_acc=0.0
    train_loss=0.0
    model_final.train()
    for i,data in enumerate(train_val_loader):
        optim.zero_grad()
        train_pred=model_final(data[0].cuda())
        batch_loss=loss_fn(train_pred,data[1].cuda())
        batch_loss.backward()
        optim.step()
        # Detach metrics from the graph so GPU memory is released per batch.
        train_acc+=np.sum(np.argmax(train_pred.cpu().data.numpy(),axis=1)==data[1].numpy())
        train_loss+=batch_loss.item()
    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f' % (epoch + 1, epochs, time.time()-epoch_start_time,train_acc/train_val_set.__len__(), train_loss/train_val_set.__len__()))
# Predict labels for the unlabelled test split and write a submission CSV.
test_set=ImgDataset(x_test,transform=test_transform)
test_loader=DataLoader(test_set,batch_size=batch_size,shuffle=False)
model_final.eval()
prediction=[]
with t.no_grad():
    for i,data in enumerate(test_loader):
        test_pred=model_final(data.cuda())
        # argmax over the 11 logits gives the predicted class per image.
        test_label=np.argmax(test_pred.cpu().data.numpy(),axis=1)
        for y in test_label:
            prediction.append(y)
with open('predict.csv','w') as f:
    f.write('Id,Category\n')
    for i,y in enumerate(prediction):
        # BUG FIX: the format string was '{},{}\n,' — the comma after the
        # newline put a stray ',' at the start of every subsequent row,
        # corrupting the CSV.
        f.write('{},{}\n'.format(i,y))

The PyTorch version is 1.4.0 and the OpenCV version is 4.2.0.
The training dataset consists of pictures like these: training set

The error happens at this line:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-1-770be67177f4> in <module>
    119     for i,data in enumerate(train_loader):
    120         optim.zero_grad()
--> 121         train_pred=model(data[0].cuda())
    122         batch_loss=loss_fn(train_pred,data[1].cuda())
    123         batch_loss.backward()

I have already installed: some information.
GPU utilization is low,close to zero: GPU utilization.
Error message says:

RuntimeError: CUDA out of memory. Tried to allocate 512.00 MiB.

So I want to know how to allocate more memory.
What's more, I have tried to reduce the batch size to 1, but this doesn't work. HELP!!!

Dole answered 15/4, 2020 at 17:31 Comment(3)
It means you don't have enough GPU RAM to hold your model in memory. What type of GPU do you have?Erogenous
My GPU's information is here: i.loli.net/2020/04/16/1i8whHmfkxV3S9p.pngDole
Your GPU only has 2GB of GPU RAM which is simply not enough to train modern deep 2d conv nets. To reduce the memory footprint I would advise reducing the number of channels in your linear layers since these tend to take a lot of memory.Erogenous
U
6

Before reducing the batch size, check the status of GPU memory :slight_smile:

nvidia-smi

Then check which process is eating up the memory choose PID and kill :boom: that process with

sudo kill -9 PID

or

sudo fuser -v /dev/nvidia*

sudo kill -9 PID

Upgrowth answered 23/1, 2021 at 6:8 Comment(0)
G
5

Try reducing your batch_size (ex. 32). This can happen because your GPU memory can't hold all your images for a single epoch.

Godfry answered 16/4, 2020 at 4:58 Comment(6)
I have tried to reduce the batch size to 1, but this doesn't work.Dole
hmm you can reduce the number of convolution layer and the kernel size. but if you still wanna try this, you can try run your model on google colabGodfry
Any other suggestions? I've made my model ridiculously small and it still failsTremulant
@RylanSchaeffer what is your GPU?Godfry
I figured out the issue. Reducing the batch size didn't help. The problem was that my custom dataloaders weren't releasing memory due to holding torch gradients.Tremulant
@SuryaMahadi I'm running into this problem on Google Colab lol, I think the failure is in the software somewhere and not the hardware because my instance is powerfulPanzer

© 2022 - 2024 — McMap. All rights reserved.