Currently my custom dataset produces None values in the PyTorch DataLoader, but NOT in the plain dataset: iterating the dataset itself works fine, but it fails as soon as I wrap it in a DataLoader.
The code is in Colab, but I'll put it here in case the Colab link dies someday:
pip install datasets
pip install torch
pip install transformers
then run
token = None
batch_size = 10
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)
# -- Get batch from dataset
# path, name = 'brando/debug1_af', 'debug1_af'
path, name = 'brando/debug0_af', 'debug0_af'
remove_columns = []
dataset = load_dataset(path, name, streaming=True, split="train", token=token).with_format("torch")
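# streaming=True gives an IterableDataset; with_format("torch") converts each yielded value to a torch tensor lazily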
print(f'{dataset=}')
batch = dataset.take(batch_size)
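# take(batch_size) returns another (smaller) IterableDataset with the first batch_size examples, not a list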
# print(f'{next(iter(batch))=}')
# - Prepare functions to tokenize batch
def preprocess(examples):  # tokenize the raw text, using the column names of this specific dataset ("link" here)
    return tokenizer(examples["link"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
def map(batch):  # apply preprocess to every example in the batch (the batch is itself a dataset)
    return batch.map(preprocess, batched=True, remove_columns=remove_columns)
tokenized_batch = map(batch)
# print(f'{next(iter(tokenized_batch))=}')
from torch.utils.data import Dataset, DataLoader, SequentialSampler
dataset = tokenized_batch
print(f'{type(dataset)=}')
print(f'{dataset.__class__=}')
print(f'{isinstance(dataset, Dataset)=}')
# for i, d in enumerate(dataset):
#     assert isinstance(d, dict)
#     # dd = dataset[i]
#     # assert isinstance(dd, dict)
loader_opts = {}
classifier_opts = {}
# data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
# num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=SequentialSampler(range(512)) )
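# with an IterableDataset the DataLoader simply iterates it (the sampler must stay None) and applies default_collate to each batch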
data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
                         num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=None)
print(f'{iter(data_loader)=}')
print(f'{next(iter(data_loader))=}')
print('Done\a')
error:
dataset=<datasets.iterable_dataset.IterableDataset object at 0x7e42c2f21d20>
type(dataset)=<class 'datasets.iterable_dataset.IterableDataset'>
dataset.__class__=<class 'datasets.iterable_dataset.IterableDataset'>
isinstance(dataset, Dataset)=True
iter(data_loader)=<torch.utils.data.dataloader._SingleProcessDataLoaderIter object at 0x7e42c2f21660>
/usr/local/lib/python3.10/dist-packages/datasets/formatting/torch_formatter.py:68: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
126 try:
--> 127 return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
128 except TypeError:
9 frames
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in <dictcomp>(.0)
126 try:
--> 127 return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
128 except TypeError:
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
149
--> 150 raise TypeError(default_collate_err_msg_format.format(elem_type))
151
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-6-1153c5915bd8> in <cell line: 49>()
47 num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=None)
48 print(f'{iter(data_loader)=}')
---> 49 print(f'{next(iter(data_loader))=}')
50 print('Done\a')
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in __next__(self)
631 # TODO(https://github.com/pytorch/pytorch/issues/76750)
632 self._reset() # type: ignore[call-arg]
--> 633 data = self._next_data()
634 self._num_yielded += 1
635 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
675 def _next_data(self):
676 index = self._next_index() # may raise StopIteration
--> 677 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
678 if self._pin_memory:
679 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
40 else:
41 data = next(self.dataset_iter)
---> 42 return self.collate_fn(data)
43
44
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
263 >>> default_collate(batch) # Handle `CustomType` automatically
264 """
--> 265 return collate(batch, collate_fn_map=default_collate_fn_map)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
128 except TypeError:
129 # The mapping type may not support `__init__(iterable)`.
--> 130 return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
131 elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
132 return elem_type(*(collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in <dictcomp>(.0)
128 except TypeError:
129 # The mapping type may not support `__init__(iterable)`.
--> 130 return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
131 elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
132 return elem_type(*(collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
148 return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
149
--> 150 raise TypeError(default_collate_err_msg_format.format(elem_type))
151
152
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>
Why is this error happening?
I've done all the obvious checks, e.g., made sure the items returned by the dataset are dicts, and I even went into detailed debugging mode with pdb inside PyTorch's collate code.
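For reference, this is the kind of check I mean (a minimal sketch, reusing tokenized_batch and batch_size from the code above): iterate the tokenized streaming dataset directly and print the type of every field, so any None would stand out. Iterated this way, every item comes back as a dict:

for i, example in enumerate(tokenized_batch):
    assert isinstance(example, dict), f'expected a dict, got {type(example)}'
    print({k: type(v) for k, v in example.items()})  # show the type of every field so a None would be visible
    if i + 1 >= batch_size:
        break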