Elron committed on
Commit
f905982
·
1 Parent(s): c6e17c3

Upload split_utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. split_utils.py +7 -7
split_utils.py CHANGED
@@ -1,6 +1,6 @@
1
- import re
2
- import random
3
  import itertools
 
 
4
 
5
  from .generator_utils import ReusableGenerator
6
 
@@ -26,7 +26,7 @@ def parse_random_mix_string(input_str):
26
  >>> parse_random_mix_string("dale[90%]+oren[0.7]+mike")
27
  {'dale': 0.9, 'oren': 0.7, 'mike': 1.0}
28
  """
29
-
30
  if not re.fullmatch(r"(([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)\+)*([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)", input_str):
31
  raise ValueError("Invalid input format")
32
 
@@ -63,7 +63,7 @@ def parse_slices_string(input_str):
63
  >>> parse_slices_string("oren[:50]+jake[24:]+test+oren[5:10]")
64
  {'oren': [(None, 50), (5, 10)], 'jake': [(24, None)], 'test': [(None, None)]}
65
  """
66
-
67
  result_dict = {}
68
 
69
  # Split the input string into a list of sources
@@ -127,7 +127,7 @@ def slice_streams(input_streams, mapping):
127
  >>> slice_streams(old_streams, mapping)
128
  {"new_train": [1, 2, 3, 4, 5, 8, 9], "new_test": [12, 13, 14]}
129
  """
130
-
131
  new_streams = {}
132
  for new_stream, sources in mapping.items():
133
 
@@ -178,7 +178,7 @@ def build_stream_routing(mapping):
178
  # Output: {'my_old_stream1': (['my_new_stream', 'my_new_stream2'], [0.6, 0.4]),
179
  # 'my_old_stream2': (['my_new_stream', 'my_new_stream2'], [0.2, 0.8])}
180
  """
181
-
182
  stream_mapping = {}
183
 
184
  # Calculate total weight for each old stream
@@ -257,7 +257,7 @@ def random_mix_streams(input_streams, mapping):
257
  for _, item in zip(range(10), new_stream):
258
  print(item)
259
  """
260
-
261
  new_streams = {}
262
 
263
  # Build stream routing
 
 
 
1
  import itertools
2
+ import random
3
+ import re
4
 
5
  from .generator_utils import ReusableGenerator
6
 
 
26
  >>> parse_random_mix_string("dale[90%]+oren[0.7]+mike")
27
  {'dale': 0.9, 'oren': 0.7, 'mike': 1.0}
28
  """
29
+
30
  if not re.fullmatch(r"(([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)\+)*([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)", input_str):
31
  raise ValueError("Invalid input format")
32
 
 
63
  >>> parse_slices_string("oren[:50]+jake[24:]+test+oren[5:10]")
64
  {'oren': [(None, 50), (5, 10)], 'jake': [(24, None)], 'test': [(None, None)]}
65
  """
66
+
67
  result_dict = {}
68
 
69
  # Split the input string into a list of sources
 
127
  >>> slice_streams(old_streams, mapping)
128
  {"new_train": [1, 2, 3, 4, 5, 8, 9], "new_test": [12, 13, 14]}
129
  """
130
+
131
  new_streams = {}
132
  for new_stream, sources in mapping.items():
133
 
 
178
  # Output: {'my_old_stream1': (['my_new_stream', 'my_new_stream2'], [0.6, 0.4]),
179
  # 'my_old_stream2': (['my_new_stream', 'my_new_stream2'], [0.2, 0.8])}
180
  """
181
+
182
  stream_mapping = {}
183
 
184
  # Calculate total weight for each old stream
 
257
  for _, item in zip(range(10), new_stream):
258
  print(item)
259
  """
260
+
261
  new_streams = {}
262
 
263
  # Build stream routing