Quentin Lhoest committed on
Commit
9242f47
·
1 Parent(s): 40a1ebe

update task taxonomy

Browse files

- `datasets` version is from the PR on github for now
- pre-loading tags are from the PR that updates all the datasets
- specify the size category yourself

build_metadata_file.py CHANGED
@@ -14,6 +14,8 @@ import yaml
14
 
15
  from apputils import new_state
16
 
 
 
17
 
18
  def metadata_from_readme(f: Path) -> Dict:
19
  with f.open() as fi:
@@ -29,7 +31,7 @@ def load_ds_datas():
29
  if drepo.exists() and drepo.is_dir():
30
  check_call(["git", "pull"], cwd=drepo)
31
  else:
32
- check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
33
  head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
34
 
35
  datasets_md = dict()
 
14
 
15
  from apputils import new_state
16
 
17
+ DATASETS_BRANCH = "tasks-alignment-with-models"
18
+
19
 
20
  def metadata_from_readme(f: Path) -> Dict:
21
  with f.open() as fi:
 
31
  if drepo.exists() and drepo.is_dir():
32
  check_call(["git", "pull"], cwd=drepo)
33
  else:
34
+ check_call(["git", "clone", "-b", DATASETS_BRANCH, "https://github.com/huggingface/datasets.git"])
35
  head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
36
 
37
  datasets_md = dict()
metadata_8418b1d4ebac7c59372c5a55556522584891ba9c.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a72566a87cb959e17e04840367969ef3a5966db12f82039ca6faea9b87da54d9
3
- size 29912341
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  pyyaml
2
- datasets==1.9.0
3
  streamlit>=0.88.0
4
  langcodes[data]
 
 
1
  pyyaml
 
2
  streamlit>=0.88.0
3
  langcodes[data]
4
+ git+https://github.com/huggingface/datasets.git@update-task-list
tagging_app.py CHANGED
@@ -73,20 +73,22 @@ def multiselect(
73
  if len(invalid_values) > 0:
74
  w.markdown("Found the following invalid values:")
75
  w.error(invalid_values)
76
- return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
77
 
78
 
79
  def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
80
  try:
81
  DatasetMetadata(**state_dict)
 
 
82
  w.markdown("✅ This is a valid tagset! 🤗")
83
  except Exception as e:
84
  w.markdown("❌ This is an invalid tagset, here are the errors in it:")
85
  w.error(e)
86
 
87
 
88
- def map_num_examples_to_size_categories(n: int) -> str:
89
- if n <= 0:
90
  size_cat = "unknown"
91
  elif n < 1000:
92
  size_cat = "n<1K"
@@ -212,8 +214,7 @@ state["task_categories"] = multiselect(
212
  "Task category",
213
  "What categories of task does the dataset support?",
214
  values=state["task_categories"],
215
- valid_set=list(known_task_ids.keys()),
216
- format_func=lambda tg: f"{tg}: {known_task_ids[tg]['description']}",
217
  )
218
  task_specifics = []
219
  for task_category in state["task_categories"]:
@@ -221,8 +222,8 @@ for task_category in state["task_categories"]:
221
  leftcol,
222
  f"Specific _{task_category}_ tasks",
223
  f"What specific tasks does the dataset support?",
224
- values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[task_category]["options"]],
225
- valid_set=known_task_ids[task_category]["options"],
226
  )
227
  if "other" in specs:
228
  other_task = leftcol.text_input(
@@ -355,14 +356,24 @@ initial_num_examples = (
355
  if initial_infos is not None
356
  else -1
357
  )
358
- initial_size_cats = map_num_examples_to_size_categories(initial_num_examples)
359
- leftcol.markdown(f"Computed size category from automatically generated dataset info to: `{initial_size_cats}`")
360
- current_size_cats = state.get("size_categories") or ["unknown"]
361
- ok, nonok = split_known(current_size_cats, known_size_categories)
 
 
 
 
 
 
 
 
 
 
 
362
  if len(nonok) > 0:
363
  leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
364
- else:
365
- state["size_categories"] = [initial_size_cats]
366
 
367
 
368
  ########################
 
73
  if len(invalid_values) > 0:
74
  w.markdown("Found the following invalid values:")
75
  w.error(invalid_values)
76
+ return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func, key=title)
77
 
78
 
79
  def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
80
  try:
81
  DatasetMetadata(**state_dict)
82
+ if not state_dict.get("pretty_name"):
83
+ raise ValueError("Please specify a non-empty Dataset name.")
84
  w.markdown("✅ This is a valid tagset! 🤗")
85
  except Exception as e:
86
  w.markdown("❌ This is an invalid tagset, here are the errors in it:")
87
  w.error(e)
88
 
89
 
90
+ def map_num_examples_to_size_category(n: int) -> str:
91
+ if n < 0:
92
  size_cat = "unknown"
93
  elif n < 1000:
94
  size_cat = "n<1K"
 
214
  "Task category",
215
  "What categories of task does the dataset support?",
216
  values=state["task_categories"],
217
+ valid_set=sorted(list(known_task_ids.keys())),
 
218
  )
219
  task_specifics = []
220
  for task_category in state["task_categories"]:
 
222
  leftcol,
223
  f"Specific _{task_category}_ tasks",
224
  f"What specific tasks does the dataset support?",
225
+ values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[task_category].get("subtasks", [])],
226
+ valid_set=known_task_ids[task_category].get("subtasks", []),
227
  )
228
  if "other" in specs:
229
  other_task = leftcol.text_input(
 
356
  if initial_infos is not None
357
  else -1
358
  )
359
+ if initial_num_examples >= 0:
360
+ initial_size_categories = [map_num_examples_to_size_category(initial_num_examples)]
361
+ else:
362
+ initial_size_categories = []
363
+ current_size_cats = multiselect(
364
+ leftcol,
365
+ f"Size category",
366
+ f"How many samples are there in the dataset?",
367
+ values=initial_size_categories,
368
+ valid_set=known_size_categories,
369
+ )
370
+ if initial_size_categories:
371
+ leftcol.markdown(f"Computed size category from automatically generated dataset info to: `{initial_size_categories}`")
372
+ prev_size_cats = state.get("size_categories") or []
373
+ ok, nonok = split_known(prev_size_cats, known_size_categories)
374
  if len(nonok) > 0:
375
  leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
376
+ state["size_categories"] = current_size_cats
 
377
 
378
 
379
  ########################