Spaces:
Running
Running
Quentin Lhoest
committed on
Commit
·
9242f47
1
Parent(s):
40a1ebe
update task taxonomy
Browse files
- `datasets` version is from the PR on GitHub for now
- pre-loading tags are from the PR that updates all the datasets
- specify the size category yourself
- build_metadata_file.py +3 -1
- metadata_8418b1d4ebac7c59372c5a55556522584891ba9c.json +0 -3
- requirements.txt +1 -1
- tagging_app.py +24 -13
build_metadata_file.py
CHANGED
@@ -14,6 +14,8 @@ import yaml
|
|
14 |
|
15 |
from apputils import new_state
|
16 |
|
|
|
|
|
17 |
|
18 |
def metadata_from_readme(f: Path) -> Dict:
|
19 |
with f.open() as fi:
|
@@ -29,7 +31,7 @@ def load_ds_datas():
|
|
29 |
if drepo.exists() and drepo.is_dir():
|
30 |
check_call(["git", "pull"], cwd=drepo)
|
31 |
else:
|
32 |
-
check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
|
33 |
head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
|
34 |
|
35 |
datasets_md = dict()
|
|
|
14 |
|
15 |
from apputils import new_state
|
16 |
|
17 |
+
DATASETS_BRANCH = "tasks-alignment-with-models"
|
18 |
+
|
19 |
|
20 |
def metadata_from_readme(f: Path) -> Dict:
|
21 |
with f.open() as fi:
|
|
|
31 |
if drepo.exists() and drepo.is_dir():
|
32 |
check_call(["git", "pull"], cwd=drepo)
|
33 |
else:
|
34 |
+
check_call(["git", "clone", "-b", DATASETS_BRANCH, "https://github.com/huggingface/datasets.git"])
|
35 |
head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)
|
36 |
|
37 |
datasets_md = dict()
|
metadata_8418b1d4ebac7c59372c5a55556522584891ba9c.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:a72566a87cb959e17e04840367969ef3a5966db12f82039ca6faea9b87da54d9
|
3 |
-
size 29912341
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
pyyaml
|
2 |
-
datasets==1.9.0
|
3 |
streamlit>=0.88.0
|
4 |
langcodes[data]
|
|
|
|
1 |
pyyaml
|
|
|
2 |
streamlit>=0.88.0
|
3 |
langcodes[data]
|
4 |
+
git+https://github.com/huggingface/datasets.git@update-task-list
|
tagging_app.py
CHANGED
@@ -73,20 +73,22 @@ def multiselect(
|
|
73 |
if len(invalid_values) > 0:
|
74 |
w.markdown("Found the following invalid values:")
|
75 |
w.error(invalid_values)
|
76 |
-
return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
|
77 |
|
78 |
|
79 |
def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
|
80 |
try:
|
81 |
DatasetMetadata(**state_dict)
|
|
|
|
|
82 |
w.markdown("✅ This is a valid tagset! 🤗")
|
83 |
except Exception as e:
|
84 |
w.markdown("❌ This is an invalid tagset, here are the errors in it:")
|
85 |
w.error(e)
|
86 |
|
87 |
|
88 |
-
def
|
89 |
-
if n
|
90 |
size_cat = "unknown"
|
91 |
elif n < 1000:
|
92 |
size_cat = "n<1K"
|
@@ -212,8 +214,7 @@ state["task_categories"] = multiselect(
|
|
212 |
"Task category",
|
213 |
"What categories of task does the dataset support?",
|
214 |
values=state["task_categories"],
|
215 |
-
valid_set=list(known_task_ids.keys()),
|
216 |
-
format_func=lambda tg: f"{tg}: {known_task_ids[tg]['description']}",
|
217 |
)
|
218 |
task_specifics = []
|
219 |
for task_category in state["task_categories"]:
|
@@ -221,8 +222,8 @@ for task_category in state["task_categories"]:
|
|
221 |
leftcol,
|
222 |
f"Specific _{task_category}_ tasks",
|
223 |
f"What specific tasks does the dataset support?",
|
224 |
-
values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[task_category]
|
225 |
-
valid_set=known_task_ids[task_category]
|
226 |
)
|
227 |
if "other" in specs:
|
228 |
other_task = leftcol.text_input(
|
@@ -355,14 +356,24 @@ initial_num_examples = (
|
|
355 |
if initial_infos is not None
|
356 |
else -1
|
357 |
)
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
362 |
if len(nonok) > 0:
|
363 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
364 |
-
|
365 |
-
state["size_categories"] = [initial_size_cats]
|
366 |
|
367 |
|
368 |
########################
|
|
|
73 |
if len(invalid_values) > 0:
|
74 |
w.markdown("Found the following invalid values:")
|
75 |
w.error(invalid_values)
|
76 |
+
return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func, key=title)
|
77 |
|
78 |
|
79 |
def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
|
80 |
try:
|
81 |
DatasetMetadata(**state_dict)
|
82 |
+
if not state_dict.get("pretty_name"):
|
83 |
+
raise ValueError("Please specify a non-empty Dataset name.")
|
84 |
w.markdown("✅ This is a valid tagset! 🤗")
|
85 |
except Exception as e:
|
86 |
w.markdown("❌ This is an invalid tagset, here are the errors in it:")
|
87 |
w.error(e)
|
88 |
|
89 |
|
90 |
+
def map_num_examples_to_size_category(n: int) -> str:
|
91 |
+
if n < 0:
|
92 |
size_cat = "unknown"
|
93 |
elif n < 1000:
|
94 |
size_cat = "n<1K"
|
|
|
214 |
"Task category",
|
215 |
"What categories of task does the dataset support?",
|
216 |
values=state["task_categories"],
|
217 |
+
valid_set=sorted(list(known_task_ids.keys())),
|
|
|
218 |
)
|
219 |
task_specifics = []
|
220 |
for task_category in state["task_categories"]:
|
|
|
222 |
leftcol,
|
223 |
f"Specific _{task_category}_ tasks",
|
224 |
f"What specific tasks does the dataset support?",
|
225 |
+
values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[task_category].get("subtasks", [])],
|
226 |
+
valid_set=known_task_ids[task_category].get("subtasks", []),
|
227 |
)
|
228 |
if "other" in specs:
|
229 |
other_task = leftcol.text_input(
|
|
|
356 |
if initial_infos is not None
|
357 |
else -1
|
358 |
)
|
359 |
+
if initial_num_examples >= 0:
|
360 |
+
initial_size_categories = [map_num_examples_to_size_category(initial_num_examples)]
|
361 |
+
else:
|
362 |
+
initial_size_categories = []
|
363 |
+
current_size_cats = multiselect(
|
364 |
+
leftcol,
|
365 |
+
f"Size category",
|
366 |
+
f"How many samples are there in the dataset?",
|
367 |
+
values=initial_size_categories,
|
368 |
+
valid_set=known_size_categories,
|
369 |
+
)
|
370 |
+
if initial_size_categories:
|
371 |
+
leftcol.markdown(f"Computed size category from automatically generated dataset info to: `{initial_size_categories}`")
|
372 |
+
prev_size_cats = state.get("size_categories") or []
|
373 |
+
ok, nonok = split_known(prev_size_cats, known_size_categories)
|
374 |
if len(nonok) > 0:
|
375 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
376 |
+
state["size_categories"] = current_size_cats
|
|
|
377 |
|
378 |
|
379 |
########################
|