Upload split_utils.py with huggingface_hub
Browse files
- split_utils.py +7 -7
split_utils.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
-
import re
|
2 |
-
import random
|
3 |
import itertools
|
|
|
|
|
4 |
|
5 |
from .generator_utils import ReusableGenerator
|
6 |
|
@@ -26,7 +26,7 @@ def parse_random_mix_string(input_str):
|
|
26 |
>>> parse_random_mix_string("dale[90%]+oren[0.7]+mike")
|
27 |
{'dale': 0.9, 'oren': 0.7, 'mike': 1.0}
|
28 |
"""
|
29 |
-
|
30 |
if not re.fullmatch(r"(([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)\+)*([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)", input_str):
|
31 |
raise ValueError("Invalid input format")
|
32 |
|
@@ -63,7 +63,7 @@ def parse_slices_string(input_str):
|
|
63 |
>>> parse_slices_string("oren[:50]+jake[24:]+test+oren[5:10]")
|
64 |
{'oren': [(None, 50), (5, 10)], 'jake': [(24, None)], 'test': [(None, None)]}
|
65 |
"""
|
66 |
-
|
67 |
result_dict = {}
|
68 |
|
69 |
# Split the input string into a list of sources
|
@@ -127,7 +127,7 @@ def slice_streams(input_streams, mapping):
|
|
127 |
>>> slice_streams(old_streams, mapping)
|
128 |
{"new_train": [1, 2, 3, 4, 5, 8, 9], "new_test": [12, 13, 14]}
|
129 |
"""
|
130 |
-
|
131 |
new_streams = {}
|
132 |
for new_stream, sources in mapping.items():
|
133 |
|
@@ -178,7 +178,7 @@ def build_stream_routing(mapping):
|
|
178 |
# Output: {'my_old_stream1': (['my_new_stream', 'my_new_stream2'], [0.6, 0.4]),
|
179 |
# 'my_old_stream2': (['my_new_stream', 'my_new_stream2'], [0.2, 0.8])}
|
180 |
"""
|
181 |
-
|
182 |
stream_mapping = {}
|
183 |
|
184 |
# Calculate total weight for each old stream
|
@@ -257,7 +257,7 @@ def random_mix_streams(input_streams, mapping):
|
|
257 |
for _, item in zip(range(10), new_stream):
|
258 |
print(item)
|
259 |
"""
|
260 |
-
|
261 |
new_streams = {}
|
262 |
|
263 |
# Build stream routing
|
|
|
|
|
|
|
1 |
import itertools
|
2 |
+
import random
|
3 |
+
import re
|
4 |
|
5 |
from .generator_utils import ReusableGenerator
|
6 |
|
|
|
26 |
>>> parse_random_mix_string("dale[90%]+oren[0.7]+mike")
|
27 |
{'dale': 0.9, 'oren': 0.7, 'mike': 1.0}
|
28 |
"""
|
29 |
+
|
30 |
if not re.fullmatch(r"(([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)\+)*([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)", input_str):
|
31 |
raise ValueError("Invalid input format")
|
32 |
|
|
|
63 |
>>> parse_slices_string("oren[:50]+jake[24:]+test+oren[5:10]")
|
64 |
{'oren': [(None, 50), (5, 10)], 'jake': [(24, None)], 'test': [(None, None)]}
|
65 |
"""
|
66 |
+
|
67 |
result_dict = {}
|
68 |
|
69 |
# Split the input string into a list of sources
|
|
|
127 |
>>> slice_streams(old_streams, mapping)
|
128 |
{"new_train": [1, 2, 3, 4, 5, 8, 9], "new_test": [12, 13, 14]}
|
129 |
"""
|
130 |
+
|
131 |
new_streams = {}
|
132 |
for new_stream, sources in mapping.items():
|
133 |
|
|
|
178 |
# Output: {'my_old_stream1': (['my_new_stream', 'my_new_stream2'], [0.6, 0.4]),
|
179 |
# 'my_old_stream2': (['my_new_stream', 'my_new_stream2'], [0.2, 0.8])}
|
180 |
"""
|
181 |
+
|
182 |
stream_mapping = {}
|
183 |
|
184 |
# Calculate total weight for each old stream
|
|
|
257 |
for _, item in zip(range(10), new_stream):
|
258 |
print(item)
|
259 |
"""
|
260 |
+
|
261 |
new_streams = {}
|
262 |
|
263 |
# Build stream routing
|