Elron committed on
Commit
54b3a72
1 Parent(s): c3f01f9

Upload dataset_utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset_utils.py +58 -0
dataset_utils.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from .artifact import Artifact, UnitxtArtifactNotFoundError, fetch_artifact
4
+ from .logging_utils import get_logger
5
+ from .register import _reset_env_local_catalogs, register_all_artifacts
6
+
7
+ logger = get_logger()  # module-level logger obtained from .logging_utils
8
+
9
+ __default_recipe__ = "standard_recipe"  # fallback artifact "type" used by get_dataset_artifact when UNITXT_DEFAULT_RECIPE is unset
10
+
11
+
12
def fetch(artifact_name):
    """Look up an artifact by name, returning ``None`` when it does not exist.

    Args:
        artifact_name: Identifier handed unchanged to ``fetch_artifact``.

    Returns:
        The first element of the pair returned by ``fetch_artifact``, or
        ``None`` if the lookup raised ``UnitxtArtifactNotFoundError``.
    """
    try:
        found = fetch_artifact(artifact_name)
    except UnitxtArtifactNotFoundError:
        # A missing artifact is an expected outcome; callers test for None.
        return None
    else:
        # Only the artifact itself is of interest; the second item is dropped.
        return found[0]
18
+
19
+
20
def parse(query: str):
    """Parse a query of the form 'key1=value1,key2=value2,...' into a dictionary.

    Each comma-separated item must contain exactly one "=" with a non-blank
    key and a non-blank value. Keys and values are stored exactly as written
    (surrounding whitespace is not removed).

    Value conversion:
        * only decimal digits            -> ``int``
        * decimal digits with one "."    -> ``float``
        * anything else                  -> left as ``str``

    Args:
        query: The assignment string to parse.

    Returns:
        A dict mapping each key to its converted value.

    Raises:
        ValueError: If the query is empty/blank, or any item is not a valid
            ``key=value`` assignment.
    """
    # str.split never returns an empty list, so emptiness has to be checked
    # on the query itself for this error to be reachable.
    if not query.strip():
        raise ValueError(
            f'Illegal query: "{query}" should contain at least one assignment of the form: key1=value1,key2=value2'
        )

    result = {}
    for kv in query.split(","):
        key_val = kv.split("=")
        if (
            len(key_val) != 2
            or not key_val[0].strip()
            or not key_val[1].strip()
        ):
            raise ValueError(
                f'Illegal query: "{query}" with wrong assignment "{kv}" should be of the form: key=value.'
            )
        key, val = key_val
        # isdecimal() (unlike isdigit()) only accepts characters that int()
        # and float() can actually convert, e.g. it rejects "²".
        if val.isdecimal():
            result[key] = int(val)
        elif val.replace(".", "", 1).isdecimal():
            result[key] = float(val)
        else:
            result[key] = val

    return result
47
+
48
+
49
def get_dataset_artifact(dataset_str):
    """Resolve ``dataset_str`` to an artifact, by name or by key=value query.

    The local catalogs are reset and all artifacts registered first. The
    string is then tried as an artifact name via ``fetch``; if nothing is
    found it is parsed as a 'key1=value1,key2=value2' query and an artifact
    is built from the resulting dict.

    Args:
        dataset_str: Either an artifact name or a key=value query string.

    Returns:
        The fetched artifact, or the one built by ``Artifact.from_dict``.
    """
    _reset_env_local_catalogs()
    register_all_artifacts()

    fetched = fetch(dataset_str)
    if fetched is not None:
        return fetched

    # Not a known artifact name: interpret the string as recipe arguments.
    recipe_args = parse(dataset_str)
    if "type" not in recipe_args:
        # The environment variable overrides the module default recipe type.
        recipe_args["type"] = os.environ.get(
            "UNITXT_DEFAULT_RECIPE", __default_recipe__
        )
    return Artifact.from_dict(recipe_args)