How does one create a PyTorch DataLoader with a custom Hugging Face dataset without errors?

Currently my custom dataset yields None entries inside the DataLoader, but NOT when I iterate the plain dataset. Only once I wrap it in a PyTorch DataLoader does it fail.

The code is in a Colab notebook, but I'll put it here in case the Colab dies someday:

pip install datasets
pip install torch
pip install transformers

then run

token = None
batch_size = 10
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
  tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)

# -- Get batch from dataset
from datasets import load_dataset
# path, name = 'brando/debug1_af', 'debug1_af'
path, name = 'brando/debug0_af', 'debug0_af'
remove_columns = []
dataset = load_dataset(path, name, streaming=True, split="train", token=token).with_format("torch")
print(f'{dataset=}')
batch = dataset.take(batch_size)
# print(f'{next(iter(batch))=}')

# - Prepare functions to tokenize the batch
def preprocess(examples):  # tokenize the raw text column(s), selected by name, from the dataset
    return tokenizer(examples["link"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
def map(batch):  # apply preprocess to every example in the batch (the batch is itself a dataset)
    return batch.map(preprocess, batched=True, remove_columns=remove_columns)
tokenized_batch = map(batch)
# print(f'{next(iter(tokenized_batch))=}')

from torch.utils.data import Dataset, DataLoader, SequentialSampler
dataset = tokenized_batch
print(f'{type(dataset)=}')
print(f'{dataset.__class__=}')
print(f'{isinstance(dataset, Dataset)=}')
# for i, d in enumerate(dataset):
#     assert isinstance(d, dict)
#     # dd = dataset[i]
#     # assert isinstance(dd, dict)
loader_opts = {}
classifier_opts = {} 
# data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
#                         num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=SequentialSampler(range(512))  )
data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
                    num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=None)
print(f'{iter(data_loader)=}')
print(f'{next(iter(data_loader))=}')
print('Done\a')

error:

dataset=<datasets.iterable_dataset.IterableDataset object at 0x7e42c2f21d20>
type(dataset)=<class 'datasets.iterable_dataset.IterableDataset'>
dataset.__class__=<class 'datasets.iterable_dataset.IterableDataset'>
isinstance(dataset, Dataset)=True
iter(data_loader)=<torch.utils.data.dataloader._SingleProcessDataLoaderIter object at 0x7e42c2f21660>
/usr/local/lib/python3.10/dist-packages/datasets/formatting/torch_formatter.py:68: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    126         try:
--> 127             return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
    128         except TypeError:

... 9 frames hidden ...
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in <dictcomp>(.0)
    126         try:
--> 127             return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
    128         except TypeError:

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    149 
--> 150     raise TypeError(default_collate_err_msg_format.format(elem_type))
    151 

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-6-1153c5915bd8> in <cell line: 49>()
     47                     num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=None)
     48 print(f'{iter(data_loader)=}')
---> 49 print(f'{next(iter(data_loader))=}')
     50 print('Done\a')

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    631                 # TODO(https://github.com/pytorch/pytorch/issues/76750)
    632                 self._reset()  # type: ignore[call-arg]
--> 633             data = self._next_data()
    634             self._num_yielded += 1
    635             if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    675     def _next_data(self):
    676         index = self._next_index()  # may raise StopIteration
--> 677         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    678         if self._pin_memory:
    679             data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     40         else:
     41             data = next(self.dataset_iter)
---> 42         return self.collate_fn(data)
     43 
     44 

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
    263             >>> default_collate(batch)  # Handle `CustomType` automatically
    264     """
--> 265     return collate(batch, collate_fn_map=default_collate_fn_map)

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    128         except TypeError:
    129             # The mapping type may not support `__init__(iterable)`.
--> 130             return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
    131     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
    132         return elem_type(*(collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in <dictcomp>(.0)
    128         except TypeError:
    129             # The mapping type may not support `__init__(iterable)`.
--> 130             return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
    131     elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
    132         return elem_type(*(collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
    148                 return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
    149 
--> 150     raise TypeError(default_collate_err_msg_format.format(elem_type))
    151 
    152 

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

Why is this error happening?

I've done all the checks, e.g., made sure the returned items are dicts, and I even stepped through PyTorch's code in detail with pdb.
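
For reference, here is a quick way to see which fields come back as None before the DataLoader ever touches them (a minimal sketch, reusing tokenized_batch from the code above):

# Peek at the first example the DataLoader would collate and report which
# fields are None -- these are the values default_collate chokes on.
first = next(iter(tokenized_batch))
for key, value in first.items():
    print(key, type(value))
none_keys = [k for k, v in first.items() if v is None]
print(f'{none_keys=}')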

Herzberg answered 10/8, 2023 at 1:24
Comment (Herzberg): a small modification to this should work: https://mcmap.net/q/673093/-how-does-one-create-a-pytoch-data-loader-using-an-interleaved-hugging-face-dataset

Without using a custom collate_fn: remove the non-tensor columns so that PyTorch's default collate works.

def get_tokenized_dataset_to_work_with_pytorch_dataloader_by_removing_columns_without_tensors():
    """
    Remove the columns that are not tensors, and then it works with pytorch dataloader.

    ref so: https://mcmap.net/q/673092/-how-does-one-create-a-pytorch-data-loader-with-a-custom-hugging-face-data-set-without-having-errors
    """
    batch_size = 10
    from pathlib import Path  # for expanding the HF token path below
    token = open(Path('~/data/hf_token.txt').expanduser()).read().strip()

    # -- AF now
    from datasets import load_dataset
    import torch
    from transformers import GPT2Tokenizer, GPT2LMHeadModel
    
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
    device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
    probe_network = probe_network.to(device)

    # -- Get batch from dataset
    from datasets import load_dataset
    path, name = 'brando/debug1_af', 'debug1_af'
    dataset = load_dataset(path, name, streaming=True, split="train", token=token).with_format(type="torch")
    print(f'{dataset.column_names=}')
    batch = dataset.take(1)
    def preprocess_formalize(examples): 
        """ link,formal statement,generated informal statement,solvable by sledgehammer,keep or not,informalization correct """
        informal_statement = examples["generated informal statement"]
        formal_statement = examples["formal statement"]
        text = f'informal statement {informal_statement} formal statement {formal_statement}'
        return tokenizer(text, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    column_names = next(iter(batch)).keys()
    print(f'{column_names=}')

    # - Prepare functions to tokenize batch
    preprocess = preprocess_formalize
    remove_columns = list(column_names)  # remove everything except the tokenized fields in the dict
    print(f'{remove_columns=}')
    def map(batch):  # apply preprocess to batch to all examples in batch represented as a dataset
        return batch.map(preprocess, batched=True, remove_columns=remove_columns)
    tokenized_batch = map(batch)

    # -- Get data loader
    from torch.utils.data import DataLoader, Dataset
    data_loader = DataLoader(tokenized_batch, shuffle=False, batch_size=8, num_workers=0, drop_last=False)
    print(f'{next(iter(data_loader))=}')
    print('Done!\a')
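
As a sanity check that the removal worked, the first element coming out of the pipeline should contain only the tokenizer outputs (a minimal sketch, reusing tokenized_batch and data_loader from the function above):

# After remove_columns, only the tokenizer's outputs survive, so
# default_collate has nothing non-tensor left to trip over.
print(f'{next(iter(tokenized_batch)).keys()=}')        # expect e.g. input_ids, attention_mask
print(f'{next(iter(data_loader))["input_ids"].shape=}')  # batched tensor built by default_collate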

Otherwise, write your own collate_fn, e.g.:

def demo_how_to_use_collate_fn_with_pytorch_dataloader():
    """
    so: https://mcmap.net/q/673092/-how-does-one-create-a-pytorch-data-loader-with-a-custom-hugging-face-data-set-without-having-errors
    """
    batch_size = 512
    from pathlib import Path  # for expanding the HF token path below
    token = open(Path('~/data/hf_token.txt').expanduser()).read().strip()

    # -- AF now
    from datasets import load_dataset
    import torch
    from transformers import GPT2Tokenizer, GPT2LMHeadModel
    
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
    device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
    probe_network = probe_network.to(device)

    # -- Get batch from dataset
    from datasets import load_dataset
    path, name = 'brando/debug1_af', 'debug1_af'
    dataset = load_dataset(path, name, streaming=True, split="train", token=token).with_format(type="torch")
    batch = dataset.take(512)
    # column_names = next(iter(batch)).keys()
    # print(f'{column_names=}')
    
    # -- Get data loader
    from torch.utils.data import DataLoader, Dataset

    def collate_tokenize(data):
        text_batch = [f'informal statement {example["generated informal statement"]} formal statement {example["formal statement"]}' for example in data]
        tokenized = tokenizer(text_batch, padding='longest', max_length=128, truncation=True, return_tensors='pt')
        return tokenized
    data_loader = DataLoader(batch, shuffle=False, batch_size=8, num_workers=0, drop_last=False, collate_fn=collate_tokenize)
    batch = next(iter(data_loader))
    print(f'{batch=}')

    data_loader = DataLoader(dataset, shuffle=False, batch_size=8, num_workers=0, drop_last=False, collate_fn=collate_tokenize)
    batch = next(iter(data_loader))
    print(f'{batch=}')
    print('Done!\a')
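
Either way, the loader yields plain dicts of tensors, so passing a batch through the probe network is straightforward (a hedged sketch, reusing probe_network, device, and data_loader from above; for a proper LM loss you would normally also set the labels of pad positions to -100):

batch = next(iter(data_loader))
batch = {k: v.to(device) for k, v in batch.items()}      # move every tensor to the model's device
out = probe_network(**batch, labels=batch['input_ids'])  # GPT-2 returns the LM loss when labels are given
print(f'{out.loss=}')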
Herzberg answered 10/8, 2023 at 22:14
