# File size: 668 Bytes
# Revision: e3ab2c6
from .stream import MultiStream
from .operator import SourceOperator
from typing import Optional, Union, Sequence, Mapping
from datasets import load_dataset as hf_load_dataset
class Loader(SourceOperator):
    """Common base class for loaders.

    Adds no behavior of its own on top of ``SourceOperator``; concrete
    loaders in this module derive from it.
    """
class LoadHF(Loader):
    """Loader backed by ``datasets.load_dataset`` (imported as ``hf_load_dataset``).

    The ``path``, ``name``, ``data_dir`` and ``data_files`` fields are
    forwarded unchanged to the Hugging Face loader, which is always invoked
    with ``streaming=True``.
    """

    path: str
    name: Optional[str] = None
    data_dir: Optional[str] = None
    data_files: Optional[
        Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
    ] = None
    # Left unannotated on purpose: adding an annotation could change how a
    # dataclass-style base treats this attribute.
    cached = False

    def process(self):
        """Load the configured dataset and wrap it as a ``MultiStream``.

        Returns:
            Whatever ``MultiStream.from_iterables`` builds from the object
            returned by the Hugging Face loader.
        """
        load_kwargs = {
            "name": self.name,
            "data_dir": self.data_dir,
            "data_files": self.data_files,
            "streaming": True,
        }
        streams = hf_load_dataset(self.path, **load_kwargs)
        return MultiStream.from_iterables(streams)
|