Elron committed on
Commit
99fde4e
·
1 Parent(s): 21d2506

Upload splitters.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. splitters.py +40 -2
splitters.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from dataclasses import field
2
  from typing import Dict, List, Optional
3
 
@@ -11,16 +12,24 @@ class Splitter(MultiStreamOperator):
11
  pass
12
 
13
 
14
- import random
15
-
16
  from .split_utils import (
17
  parse_random_mix_string,
18
  parse_slices_string,
19
  random_mix_streams,
 
20
  slice_streams,
21
  )
22
 
23
 
 
 
 
 
 
 
 
 
24
  class SplitRandomMix(Splitter):
25
  mix: Dict[str, str]
26
 
@@ -30,6 +39,35 @@ class SplitRandomMix(Splitter):
30
  return MultiStream.from_generators(generators, streaming=True)
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  class SliceSplit(Splitter):
34
  slices: Dict[str, str]
35
 
 
1
+ import itertools
2
  from dataclasses import field
3
  from typing import Dict, List, Optional
4
 
 
12
  pass
13
 
14
 
15
+ from .random_utils import random
 
16
  from .split_utils import (
17
  parse_random_mix_string,
18
  parse_slices_string,
19
  random_mix_streams,
20
+ rename_split,
21
  slice_streams,
22
  )
23
 
24
 
25
+ class RenameSplits(Splitter):
26
+ mapper: Dict[str, str]
27
+
28
+ def process(self, multi_stream: MultiStream) -> MultiStream:
29
+ generators = rename_split(multi_stream, self.mapper)
30
+ return MultiStream(generators)
31
+
32
+
33
  class SplitRandomMix(Splitter):
34
  mix: Dict[str, str]
35
 
 
39
  return MultiStream.from_generators(generators, streaming=True)
40
 
41
 
42
+ class SeparateSplit(Splitter):
43
+ """
44
+ Separates a split (e.g. train) into several splits (e.g. train1, train2)
45
+ sizes must indicate the size of every split except the last. If no size is given for the last split,
46
+ it includes all the examples not allocated to any split.
47
+ """
48
+
49
+ from_split: str
50
+ to_split_names: List[str]
51
+ to_split_sizes: List[int]
52
+
53
+ def verify(self):
54
+ assert (
55
+ len(self.to_split_names) == len(self.to_split_sizes)
56
+ or len(self.to_split_names) == len(self.to_split_sizes) + 1
57
+ ), f"Examples num should be specified to all or all but the last splits, instead given {len(self.to_split_names)} split names and {len(self.to_split_sizes)} split sizes. \n split names:{self.to_split_names} split sizes {self.to_split_sizes}"
58
+ return super().verify()
59
+
60
+ def process(self, multi_stream: MultiStream) -> MultiStream:
61
+ mapping = {key: {key: [(None, None)]} for key in multi_stream.keys() if key != self.from_split}
62
+ so_far = 0
63
+ for name, size in itertools.zip_longest(self.to_split_names, self.to_split_sizes):
64
+ mapping[name] = {self.from_split: [(so_far, size)]}
65
+ if size:
66
+ so_far += size
67
+ generators = slice_streams(multi_stream, mapping)
68
+ return MultiStream.from_generators(generators, streaming=True)
69
+
70
+
71
  class SliceSplit(Splitter):
72
  slices: Dict[str, str]
73