I am trying to fine-tune an LLM.
My code so far:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
AutoTokenizer,
AutoConfig,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
# Load the dataset, identified by its Hugging Face Hub repository id
# ("<owner>/<dataset name>"), and bind the result to `dataset`.
# NOTE(review): the ValueError reported for this call is raised inside
# fsspec's glob_translate, reached from datasets' data-file pattern
# resolution, not by this line itself. That presumably means the installed
# `datasets` release is too old for the installed `fsspec`; upgrading
# `datasets` (or pinning an older `fsspec`) is the likely fix -- TODO confirm.
dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')
# Bare expression: as the last statement of a notebook cell it displays the
# loaded object.
dataset
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In [12], line 2
1 # load dataset
----> 2 dataset = load_dataset('TokenBender/code_instructions_122k_alpaca_style')
3 dataset
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1664, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)
1661 ignore_verifications = ignore_verifications or save_infos
1663 # Create a dataset builder
-> 1664 builder_instance = load_dataset_builder(
1665 path=path,
1666 name=name,
1667 data_dir=data_dir,
1668 data_files=data_files,
1669 cache_dir=cache_dir,
1670 features=features,
1671 download_config=download_config,
1672 download_mode=download_mode,
1673 revision=revision,
1674 use_auth_token=use_auth_token,
1675 **config_kwargs,
1676 )
1678 # Return iterable dataset in case of streaming
1679 if streaming:
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1490, in load_dataset_builder(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)
1488 download_config = download_config.copy() if download_config else DownloadConfig()
1489 download_config.use_auth_token = use_auth_token
-> 1490 dataset_module = dataset_module_factory(
1491 path,
1492 revision=revision,
1493 download_config=download_config,
1494 download_mode=download_mode,
1495 data_dir=data_dir,
1496 data_files=data_files,
1497 )
1499 # Get dataset builder class from the processing script
1500 builder_cls = import_main_class(dataset_module.module_path)
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1242, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1237 if isinstance(e1, FileNotFoundError):
1238 raise FileNotFoundError(
1239 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
1240 f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
1241 ) from None
-> 1242 raise e1 from None
1243 else:
1244 raise FileNotFoundError(
1245 f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory."
1246 )
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:1223, in dataset_module_factory(path, revision, download_config, download_mode, force_local_path, dynamic_modules_path, data_dir, data_files, **download_kwargs)
1215 return HubDatasetModuleFactoryWithScript(
1216 path,
1217 revision=revision,
(...)
1220 dynamic_modules_path=dynamic_modules_path,
1221 ).get_module()
1222 else:
-> 1223 return HubDatasetModuleFactoryWithoutScript(
1224 path,
1225 revision=revision,
1226 data_dir=data_dir,
1227 data_files=data_files,
1228 download_config=download_config,
1229 download_mode=download_mode,
1230 ).get_module()
1231 except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
1232 try:
File /usr/local/lib/python3.9/dist-packages/datasets/load.py:846, in HubDatasetModuleFactoryWithoutScript.get_module(self)
836 token = self.download_config.use_auth_token
837 hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
838 self.name,
839 revision=self.revision,
840 token=token,
841 timeout=100.0,
842 )
843 patterns = (
844 sanitize_patterns(self.data_files)
845 if self.data_files is not None
--> 846 else get_patterns_in_dataset_repository(hfh_dataset_info)
847 )
848 data_files = DataFilesDict.from_hf_repo(
849 patterns,
850 dataset_info=hfh_dataset_info,
851 allowed_extensions=ALL_ALLOWED_EXTENSIONS,
852 )
853 infered_module_names = {
854 key: infer_module_for_data_files(data_files_list, use_auth_token=self.download_config.use_auth_token)
855 for key, data_files_list in data_files.items()
856 }
File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:471, in get_patterns_in_dataset_repository(dataset_info)
469 resolver = partial(_resolve_single_pattern_in_dataset_repository, dataset_info)
470 try:
--> 471 return _get_data_files_patterns(resolver)
472 except FileNotFoundError:
473 raise FileNotFoundError(
474 f"The dataset repository at '{dataset_info.id}' doesn't contain any data file."
475 ) from None
File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:99, in _get_data_files_patterns(pattern_resolver)
97 try:
98 for pattern in patterns:
---> 99 data_files = pattern_resolver(pattern)
100 if len(data_files) > 0:
101 non_empty_splits.append(split)
File /usr/local/lib/python3.9/dist-packages/datasets/data_files.py:303, in _resolve_single_pattern_in_dataset_repository(dataset_info, pattern, allowed_extensions)
301 data_files_ignore = FILES_TO_IGNORE
302 fs = HfFileSystem(repo_info=dataset_info)
--> 303 glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
304 matched_paths = [
305 filepath
306 for filepath in glob_iter
307 if filepath.name not in data_files_ignore and not filepath.name.startswith(".")
308 ]
309 if allowed_extensions is not None:
File /usr/local/lib/python3.9/dist-packages/fsspec/spec.py:606, in AbstractFileSystem.glob(self, path, maxdepth, **kwargs)
602 depth = None
604 allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
--> 606 pattern = glob_translate(path + ("/" if ends_with_sep else ""))
607 pattern = re.compile(pattern)
609 out = {
610 p: info
611 for p, info in sorted(allpaths.items())
(...)
618 )
619 }
File /usr/local/lib/python3.9/dist-packages/fsspec/utils.py:734, in glob_translate(pat)
732 continue
733 elif "**" in part:
--> 734 raise ValueError(
735 "Invalid pattern: '**' can only be an entire path component"
736 )
737 if part:
738 results.extend(_translate(part, f"{not_sep}*", not_sep))
ValueError: Invalid pattern: '**' can only be an entire path component
I searched online, and the closest thing I found is this issue: https://github.com/coala/coala/issues/401
but I could not understand their solution. Can anyone help me understand how to fix the error I am facing? Thanks.
My library versions:
- peft : '0.6.0'
- torch : '2.1.2+cu121'
- datasets : '2.1.0'
- transformers : '4.21.3'