Elron committed on
Commit
68d64cc
1 Parent(s): 247f8e0

Upload splitters.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. splitters.py +23 -0
splitters.py CHANGED
@@ -29,6 +29,29 @@ class RenameSplits(Splitter):
29
 
30
 
31
  class SplitRandomMix(Splitter):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  mix: Dict[str, str]
33
 
34
  def process(self, multi_stream: MultiStream) -> MultiStream:
 
29
 
30
 
31
  class SplitRandomMix(Splitter):
32
+ """Splits a multistream into new streams (splits), whose names, source input stream, and amount of instances, are specified by arg 'mix'.
33
+
34
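+ The keys of arg 'mix' are the names of the new streams; each value is of the form 'name-of-source-stream[percentage-of-source-stream]', or several such terms joined by '+'. The bracketed amount may be a percentage (e.g. 50%) or a fraction (e.g. 0.1).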
35
+ Each input instance, of any input stream, is selected exactly once for inclusion in any of the output streams.
36
+
37
+ Examples:
38
+ When processing a multistream made of two streams whose names are 'train' and 'test', by
39
+ SplitRandomMix(mix = { "train": "train[99%]", "validation": "train[1%]", "test": "test" })
40
+ the output is a multistream, whose three streams are named 'train', 'validation', and 'test'.
41
+ Output stream 'train' is made of randomly selected 99% of the instances of input stream 'train',
42
+ output stream 'validation' is made of the remaining 1% of the instances of input 'train', and output stream 'test' is made
43
+ of the whole of input stream 'test'.
44
+
45
+ When processing the above input multistream by
46
+ SplitRandomMix(mix = { "train": "train[50%]+test[0.1]", "validation": "train[50%]+test[0.2]", "test": "test[0.7]" })
47
+ the output is a multistream, whose three streams are named 'train', 'validation', and 'test'.
48
+ Output stream 'train' is made of randomly selected 50% of the instances of input stream 'train' + randomly selected
49
+ 0.1 (i.e., 10%) of the instances of input stream 'test'.
50
+ Output stream 'validation' is made of the remaining 50% of the instances of input 'train' + randomly selected 0.2 (i.e.,
51
+ 20%) of the original instances of input 'test', that were not selected for output 'train',
52
+ and output stream 'test' is made of the remaining instances of input 'test'.
53
+ """
54
+
55
  mix: Dict[str, str]
56
 
57
  def process(self, multi_stream: MultiStream) -> MultiStream: