Elron committed on
Commit
1083665
1 Parent(s): 17a636b

Upload dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset.py +26 -19
dataset.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
 
3
  import datasets
@@ -9,7 +10,6 @@ from .blocks import __file__ as _
9
  from .card import __file__ as _
10
  from .catalog import __file__ as _
11
  from .collections import __file__ as _
12
- from .common import __file__ as _
13
  from .dataclass import __file__ as _
14
  from .dict_utils import __file__ as _
15
  from .file_utils import __file__ as _
@@ -45,7 +45,7 @@ from .validate import __file__ as _
45
  from .version import __file__ as _
46
  from .version import version
47
 
48
- __default_recipe__ = "common_recipe"
49
 
50
 
51
  def fetch(artifact_name):
@@ -57,9 +57,7 @@ def fetch(artifact_name):
57
 
58
 
59
  def parse(query: str):
60
- """
61
- Parses a query of the form 'key1=value1,key2=value2,...' into a dictionary.
62
- """
63
  result = {}
64
  kvs = query.split(",")
65
  if len(kvs) == 0:
@@ -68,8 +66,14 @@ def parse(query: str):
68
  )
69
  for kv in kvs:
70
  key_val = kv.split("=")
71
- if len(key_val) != 2 or len(key_val[0].strip()) == 0 or len(key_val[1].strip()) == 0:
72
- raise ValueError('Illegal query: "{query}" with wrong assignment "{kv}" should be of the form: key=value.')
 
 
 
 
 
 
73
  key, val = key_val
74
  if val.isdigit():
75
  result[key] = int(val)
@@ -97,25 +101,23 @@ class Dataset(datasets.GeneratorBasedBuilder):
97
  """TODO: Short description of my dataset."""
98
 
99
  VERSION = datasets.Version(version)
100
- builder_configs = {}
101
 
102
  @property
103
  def generators(self):
104
  if not hasattr(self, "_generators") or self._generators is None:
105
  try:
106
- from unitxt.dataset import (
107
- get_dataset_artifact as get_dataset_artifact_installed,
108
- )
109
 
110
  unitxt_installed = True
111
  except ImportError:
112
  unitxt_installed = False
113
 
114
  if unitxt_installed:
115
- print("Loading with installed unitxt library...")
116
  dataset = get_dataset_artifact_installed(self.config.name)
117
  else:
118
- print("Loading with installed unitxt library...")
119
  dataset = get_dataset_artifact(self.config.name)
120
 
121
  self._generators = dataset()
@@ -126,13 +128,18 @@ class Dataset(datasets.GeneratorBasedBuilder):
126
  return datasets.DatasetInfo()
127
 
128
  def _split_generators(self, _):
129
- return [datasets.SplitGenerator(name=name, gen_kwargs={"split_name": name}) for name in self.generators.keys()]
 
 
 
130
 
131
  def _generate_examples(self, split_name):
132
  generator = self.generators[split_name]
133
- for i, row in enumerate(generator):
134
- yield i, row
135
 
136
- def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
137
- result = super()._download_and_prepare(dl_manager, "no_checks", **prepare_splits_kwargs)
138
- return result
 
 
 
 
1
+ import logging
2
  import os
3
 
4
  import datasets
 
10
  from .card import __file__ as _
11
  from .catalog import __file__ as _
12
  from .collections import __file__ as _
 
13
  from .dataclass import __file__ as _
14
  from .dict_utils import __file__ as _
15
  from .file_utils import __file__ as _
 
45
  from .version import __file__ as _
46
  from .version import version
47
 
48
+ __default_recipe__ = "standard_recipe"
49
 
50
 
51
  def fetch(artifact_name):
 
57
 
58
 
59
  def parse(query: str):
60
+ """Parses a query of the form 'key1=value1,key2=value2,...' into a dictionary."""
 
 
61
  result = {}
62
  kvs = query.split(",")
63
  if len(kvs) == 0:
 
66
  )
67
  for kv in kvs:
68
  key_val = kv.split("=")
69
+ if (
70
+ len(key_val) != 2
71
+ or len(key_val[0].strip()) == 0
72
+ or len(key_val[1].strip()) == 0
73
+ ):
74
+ raise ValueError(
75
+ f'Illegal query: "{query}" with wrong assignment "{kv}" should be of the form: key=value.'
76
+ )
77
  key, val = key_val
78
  if val.isdigit():
79
  result[key] = int(val)
 
101
  """TODO: Short description of my dataset."""
102
 
103
  VERSION = datasets.Version(version)
 
104
 
105
  @property
106
  def generators(self):
107
  if not hasattr(self, "_generators") or self._generators is None:
108
  try:
109
+ from unitxt.dataset import \
110
+ get_dataset_artifact as get_dataset_artifact_installed
 
111
 
112
  unitxt_installed = True
113
  except ImportError:
114
  unitxt_installed = False
115
 
116
  if unitxt_installed:
117
+ logging.info("Loading with installed unitxt library...")
118
  dataset = get_dataset_artifact_installed(self.config.name)
119
  else:
120
+ logging.info("Loading with installed unitxt library...")
121
  dataset = get_dataset_artifact(self.config.name)
122
 
123
  self._generators = dataset()
 
128
  return datasets.DatasetInfo()
129
 
130
  def _split_generators(self, _):
131
+ return [
132
+ datasets.SplitGenerator(name=name, gen_kwargs={"split_name": name})
133
+ for name in self.generators.keys()
134
+ ]
135
 
136
  def _generate_examples(self, split_name):
137
  generator = self.generators[split_name]
138
+ yield from enumerate(generator)
 
139
 
140
+ def _download_and_prepare(
141
+ self, dl_manager, verification_mode, **prepare_splits_kwargs
142
+ ):
143
+ return super()._download_and_prepare(
144
+ dl_manager, "no_checks", **prepare_splits_kwargs
145
+ )