Upload loaders.py with huggingface_hub
loaders.py CHANGED (+31 -16)
@@ -22,19 +22,18 @@ LoadFromKaggle: loads datasets from the kaggle.com community site
 LoadFromIBMCloud: loads a dataset from the IBM cloud.
 ------------------------
 """
-import importlib
 import itertools
 import os
 import tempfile
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Dict, Mapping, Optional, Sequence, Union
+from typing import Dict, List, Mapping, Optional, Sequence, Union

 import pandas as pd
 from datasets import load_dataset as hf_load_dataset
 from tqdm import tqdm

-from .dataclass import InternalField
+from .dataclass import InternalField, OptionalField
 from .logging_utils import get_logger
 from .operator import SourceOperator
 from .settings_utils import get_settings
@@ -43,20 +42,13 @@ from .stream import MultiStream, Stream
 logger = get_logger()
 settings = get_settings()

-try:
-    import ibm_boto3
-
-    ibm_boto3_available = True
-except ImportError:
-    ibm_boto3_available = False
-

 class Loader(SourceOperator):
     # The loader_limit an optional parameter used to control the maximum number of instances to load from the the source.
     # It is usually provided to the loader via the recipe (see standard.py)
     # The loader can use this value to limit the amount of data downloaded from the source
     # to reduce loading time. However, this may not always be possible, so the
-    # loader may
+    # loader may ignore this. In any case, the recipe, will limit the number of instances in the returned
     # stream, after load is complete.
     loader_limit: int = None
     streaming: bool = False
@@ -92,7 +84,24 @@ class LoadHF(Loader):
         Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     ] = None
     streaming: bool = True
+    filtering_lambda: Optional[str] = None
     _cache: dict = InternalField(default=None)
+    requirements_list: List[str] = OptionalField(default_factory=list)
+
+    def verify(self):
+        for requirement in self.requirements_list:
+            if requirement not in self._requirements_list:
+                self._requirements_list.append(requirement)
+        super().verify()
+
+    def filtered_load(self, dataset):
+        logger.info(f"\nLoading filtered by: {self.filtering_lambda};")
+        return MultiStream(
+            {
+                name: dataset[name].filter(eval(self.filtering_lambda))
+                for name in dataset
+            }
+        )

     def stream_dataset(self):
         if self._cache is None:
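Note on the new filtering_lambda field: it is a string that filtered_load() passes through eval() and applies to every split via datasets' .filter(). A minimal sketch of how a recipe might set it; the dataset path/name and the "label" field below are illustrative assumptions, not part of this commit:

    # Illustrative only: path, name and the "label" field are assumed, not taken from this commit.
    loader = LoadHF(
        path="glue",
        name="cola",
        # eval()-ed by filtered_load() and applied as dataset[name].filter(...)
        filtering_lambda="lambda instance: instance['label'] != -1",
    )

Because the string goes through eval(), it should only come from trusted recipe code.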
@@ -114,6 +123,9 @@ class LoadHF(Loader):
                             f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment vairable: UNITXT_ALLOW_UNVERIFIED_CODE."
                         ) from e

+            if self.filtering_lambda is not None:
+                dataset = self.filtered_load(dataset)
+
             if self.split is not None:
                 dataset = {self.split: dataset}

@@ -143,6 +155,10 @@
                         raise ValueError(
                             f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment vairable: UNITXT_ALLOW_UNVERIFIED_CODE."
                         ) from e
+
+            if self.filtering_lambda is not None:
+                dataset = self.filtered_load(dataset)
+
             if self.split is None:
                 for split in dataset.keys():
                     dataset[split] = dataset[split].to_iterable_dataset()
@@ -241,13 +257,10 @@ class MissingKaggleCredentialsError(ValueError):
 # TODO write how to obtain kaggle credentials
 class LoadFromKaggle(Loader):
     url: str
+    _requirements_list: List[str] = ["opendatasets"]

     def verify(self):
         super().verify()
-        if importlib.util.find_spec("opendatasets") is None:
-            raise ImportError(
-                "Please install opendatasets in order to use the LoadFromKaggle loader (using `pip install opendatasets`) "
-            )
         if not os.path.isfile("kaggle.json"):
             raise MissingKaggleCredentialsError(
                 "Please obtain kaggle credentials https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/ and save them to local ./kaggle.json file"
@@ -283,6 +296,7 @@ class LoadFromIBMCloud(Loader):
     # 3. Mapping: split -> file_names, e.g. {"test" : ["test1.json", "test2.json"], "train": ["train.json"]}
     data_files: Union[Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     caching: bool = True
+    _requirements_list: List[str] = ["ibm_boto3"]

     def _download_from_cos(self, cos, bucket_name, item_name, local_file):
         logger.info(f"Downloading {item_name} from {bucket_name} COS")
@@ -337,7 +351,6 @@ class LoadFromIBMCloud(Loader):

     def verify(self):
         super().verify()
-        assert ibm_boto3_available, "Please install ibm_boto3 in order to use the LoadFromIBMCloud loader (using `pip install ibm-cos-sdk`) "
         assert (
             self.endpoint_url is not None
         ), f"Please set the {self.endpoint_url_env} environmental variable"
@@ -351,6 +364,8 @@
         raise NotImplementedError("LoadFromKaggle cannot load with streaming.")

     def process(self):
+        import ibm_boto3
+
         cos = ibm_boto3.resource(
             "s3",
             aws_access_key_id=self.aws_access_key_id,
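Note on the deferred import: dropping the module-level try/except around ibm_boto3 (second hunk of this commit) works because the dependency is now declared in _requirements_list and imported inside process(), so it is only required when LoadFromIBMCloud actually runs. The same pattern in isolation, with a hypothetical package name:

    # Standalone sketch of the deferred-import pattern; "optional_sdk" is made up.
    class OptionalDependencyLoader:
        def process(self):
            # Imported here rather than at module import time, so the package
            # is only needed when this loader is actually executed.
            import optional_sdk

            return optional_sdk.do_work()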