Elron committed on
Commit
9564cbf
1 Parent(s): c6e9c8c

Upload loaders.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. loaders.py +18 -2
loaders.py CHANGED
@@ -1,12 +1,13 @@
1
  import os
2
  from tempfile import TemporaryDirectory
3
- from typing import Mapping, Optional, Sequence, Union
4
 
 
5
  from datasets import load_dataset as hf_load_dataset
6
  from tqdm import tqdm
7
 
8
  from .operator import SourceOperator
9
- from .stream import MultiStream
10
 
11
  try:
12
  import ibm_boto3
@@ -37,6 +38,21 @@ class LoadHF(Loader):
37
  return MultiStream.from_iterables(dataset)
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  class LoadFromIBMCloud(Loader):
41
  endpoint_url_env: str
42
  aws_access_key_id_env: str
 
1
  import os
2
  from tempfile import TemporaryDirectory
3
+ from typing import Dict, Mapping, Optional, Sequence, Union
4
 
5
+ import pandas as pd
6
  from datasets import load_dataset as hf_load_dataset
7
  from tqdm import tqdm
8
 
9
  from .operator import SourceOperator
10
+ from .stream import MultiStream, Stream
11
 
12
  try:
13
  import ibm_boto3
 
38
  return MultiStream.from_iterables(dataset)
39
 
40
 
41
class LoadCSV(Loader):
    """Loader that reads CSV files and exposes their rows as dictionaries.

    Each entry of ``files`` becomes one named stream in the resulting
    ``MultiStream``; the rows of that stream are produced by ``load_csv``.
    """

    # Mapping of stream name -> CSV location handed to ``pd.read_csv``.
    files: Dict[str, str]
    # Number of rows ``pd.read_csv`` returns per chunk.
    chunksize: int = 1000

    def load_csv(self, file):
        """Yield every row of ``file`` as a ``{column name: value}`` dict.

        The file is read in chunks of ``self.chunksize`` rows rather than
        being loaded in a single ``read_csv`` call.
        """
        for chunk in pd.read_csv(file, chunksize=self.chunksize):
            # ``to_dict(orient="records")`` converts column by column, so each
            # value keeps its column's dtype. ``iterrows`` would build one
            # Series per row and upcast mixed numeric columns (e.g. an int
            # column next to a float column) to a common dtype.
            yield from chunk.to_dict(orient="records")

    def process(self):
        """Return a ``MultiStream`` holding one ``Stream`` per entry in ``files``."""
        return MultiStream(
            {
                name: Stream(generator=self.load_csv, gen_kwargs={"file": file})
                for name, file in self.files.items()
            }
        )
54
+
55
+
56
  class LoadFromIBMCloud(Loader):
57
  endpoint_url_env: str
58
  aws_access_key_id_env: str