| import inspect |
| import re |
| from typing import Dict, List, Tuple |
|
|
| from huggingface_hub.utils import insecure_hashlib |
|
|
| from .arrow import arrow |
| from .audiofolder import audiofolder |
| from .cache import cache |
| from .csv import csv |
| from .hdf5 import hdf5 |
| from .imagefolder import imagefolder |
| from .json import json |
| from .pandas import pandas |
| from .parquet import parquet |
| from .pdffolder import pdffolder |
| from .sql import sql |
| from .text import text |
| from .videofolder import videofolder |
| from .webdataset import webdataset |
| from .xml import xml |
|
|
|
|
def _hash_python_lines(lines: list[str]) -> str:
    """Return the SHA-256 hex digest of ``lines`` with ``#`` comments removed.

    For every line, everything from the first ``#`` to the end of the line is
    dropped (this also applies to a ``#`` that sits inside a string literal).
    Lines that end up empty are discarded; whitespace-only lines are kept.
    The remaining lines are joined with ``"\\n"``, encoded as UTF-8 and hashed.

    Args:
        lines: Source lines, without trailing newlines.

    Returns:
        The hexadecimal SHA-256 digest of the filtered source text.
    """
    without_comments = (re.sub(r"#.*", "", raw_line) for raw_line in lines)
    source_text = "\n".join(kept for kept in without_comments if kept)
    return insecure_hashlib.sha256(source_text.encode("utf-8")).hexdigest()
|
|
|
|
| |
# Maps each packaged builder name to a pair:
# (importable module name, hash of that module's source code as computed by
# `_hash_python_lines`). Key order follows the tuple below.
_PACKAGED_DATASETS_MODULES = {
    _builder_name: (
        _builder_module.__name__,
        _hash_python_lines(inspect.getsource(_builder_module).splitlines()),
    )
    for _builder_name, _builder_module in (
        ("csv", csv),
        ("json", json),
        ("pandas", pandas),
        ("parquet", parquet),
        ("arrow", arrow),
        ("text", text),
        ("imagefolder", imagefolder),
        ("audiofolder", audiofolder),
        ("videofolder", videofolder),
        ("pdffolder", pdffolder),
        ("webdataset", webdataset),
        ("xml", xml),
        ("hdf5", hdf5),
    )
}
|
|
| |
# Hard-coded 64-character hex digests keyed by packaged builder name.
# NOTE(review): presumably the module hashes as they were in release 2.15,
# kept so older caches can still be located -- confirm against the callers.
_PACKAGED_DATASETS_MODULES_2_15_HASHES = dict(
    csv="eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d",
    json="8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96",
    pandas="3ac4ffc4563c796122ef66899b9485a3f1a977553e2d2a8a318c72b8cc6f2202",
    parquet="ca31c69184d9832faed373922c2acccec0b13a0bb5bbbe19371385c3ff26f1d1",
    arrow="74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137",
    text="c4a140d10f020282918b5dd1b8a49f0104729c6177f60a6b49ec2a365ec69f34",
    imagefolder="7b7ce5247a942be131d49ad4f3de5866083399a0f250901bd8dc202f8c5f7ce5",
    audiofolder="d3c1655c66c8f72e4efb5c79e952975fa6e2ce538473a6890241ddbddee9071c",
)
|
|
| |
# Maps a file extension to (packaged builder name, extra keyword arguments).
_EXTENSION_TO_MODULE: dict[str, tuple[str, dict]] = {
    ".csv": ("csv", {}),
    ".tsv": ("csv", {"sep": "\t"}),
    ".json": ("json", {}),
    ".jsonl": ("json", {}),
    ".ndjson": ("json", {}),
    ".parquet": ("parquet", {}),
    ".geoparquet": ("parquet", {}),
    ".gpq": ("parquet", {}),
    ".arrow": ("arrow", {}),
    ".txt": ("text", {}),
    ".tar": ("webdataset", {}),
    ".xml": ("xml", {}),
    ".hdf5": ("hdf5", {}),
    ".h5": ("hdf5", {}),
}
# Register the folder-based builders. For each builder, its extensions are
# added exactly as declared first, followed by their upper-cased variants;
# builders are processed in the order listed, so a later builder wins when
# two of them declare the same extension.
_EXTENSION_TO_MODULE.update(
    {
        _spelling: (_folder_module, {})
        for _folder_module, _declared in (
            ("imagefolder", imagefolder.ImageFolder.EXTENSIONS),
            ("audiofolder", audiofolder.AudioFolder.EXTENSIONS),
            ("videofolder", videofolder.VideoFolder.EXTENSIONS),
            ("pdffolder", pdffolder.PdfFolder.EXTENSIONS),
        )
        for _spelling in [*_declared, *(_e.upper() for _e in _declared)]
    }
)
|
|
| |
# Inverse view of _EXTENSION_TO_MODULE: builder name -> every extension that
# maps to it, in the order the extensions appear in _EXTENSION_TO_MODULE.
_MODULE_TO_EXTENSIONS: dict[str, list[str]] = {}
for _ext, (_module, _) in _EXTENSION_TO_MODULE.items():
    if _module not in _MODULE_TO_EXTENSIONS:
        _MODULE_TO_EXTENSIONS[_module] = []
    _MODULE_TO_EXTENSIONS[_module].append(_ext)

# Every builder additionally gets ".zip" as the last entry of its list.
for _module in _MODULE_TO_EXTENSIONS:
    _MODULE_TO_EXTENSIONS[_module].extend([".zip"])
|
|
| |
# Builder name -> metadata file names. Every builder known to
# _MODULE_TO_EXTENSIONS starts with its own empty list.
_MODULE_TO_METADATA_FILE_NAMES: Dict[str, List[str]] = {
    _builder_name: [] for _builder_name in _MODULE_TO_EXTENSIONS
}
# The four folder-based builders all point at the very same list object,
# ImageFolder.METADATA_FILENAMES.
# NOTE(review): audio/video/pdf reuse ImageFolder's attribute rather than
# their own class's -- presumably the value is shared between them; confirm.
_MODULE_TO_METADATA_FILE_NAMES.update(
    dict.fromkeys(
        ("imagefolder", "audiofolder", "videofolder", "pdffolder"),
        imagefolder.ImageFolder.METADATA_FILENAMES,
    )
)
|
|