When fine-tuning with the Hugging Face Trainer, training runs fine but it fails during evaluation; even reducing eval_accumulation_steps to 1 did not help.
I followed the procedure in this question: Why is evaluation set draining the memory in pytorch hugging face? It did not work for me.
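If I understood that answer correctly, the idea is to shrink the logits before the Trainer gathers them during evaluation, on top of setting eval_accumulation_steps. A rough sketch of that idea (shrink_logits is an illustrative name of mine, not something from my actual script):

import torch

def shrink_logits(logits, labels):
    # keep only the predicted token ids so the full (batch, seq_len, vocab_size)
    # logits tensor is not accumulated on the GPU during evaluation
    return torch.argmax(logits, dim=-1)

# then pass it to the Trainer: transformers.Trainer(..., preprocess_logits_for_metrics=shrink_logits)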
When I remove the eval_dataset from the Trainer (and the evaluation settings from TrainingArguments), training runs fine. But when I add them back as in the code below, it runs out of memory right after the 10th training step, i.e. exactly when evaluation kicks in.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
    'bigscience/bloom-1b1',
    load_in_8bit=True,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-1b1')
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=8,  # rank of the LoRA update matrices
    lora_alpha=32,  # LoRA scaling factor
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"  # causal LM here; use SEQ_2_SEQ_LM for seq2seq models
)
model = get_peft_model(model, config)
import transformers
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["validation"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=60,
        learning_rate=2e-4,
        evaluation_strategy='steps',
        eval_accumulation_steps=1,
        eval_steps=10,
        seed=42,
        report_to="wandb",
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)
    # data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
trainer.train()
Error:

OutOfMemoryError                          Traceback (most recent call last)
in <cell line: 26>()
     24 )
     25 model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
---> 26 trainer.train()
     27
     28 wandb.finish()

17 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   3027     if size_average is not None or reduce is not None:
   3028         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3029     return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
   3030
   3031

OutOfMemoryError: CUDA out of memory. Tried to allocate 5.56 GiB (GPU 0; 14.75 GiB total capacity; 12.58 GiB already allocated; 840.81 MiB free; 12.86 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
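For completeness, this is the variant that trains without running out of memory for me: the same setup with eval_dataset and the evaluation-related arguments removed, everything else unchanged.

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_datasets['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=60,
        learning_rate=2e-4,
        seed=42,
        report_to="wandb",
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)
)
trainer.train()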