theo committed on
Commit
326ad7e
•
1 Parent(s): 26742b2

better state mgmt + use validator script

Browse files
Files changed (1) hide show
  1. tagging_app.py +74 -57
tagging_app.py CHANGED
@@ -4,6 +4,7 @@ from typing import Callable, List, Tuple
4
 
5
  import streamlit as st
6
  import yaml
 
7
 
8
  st.set_page_config(
9
  page_title="HF Dataset Tagging App",
@@ -128,31 +129,52 @@ Beware that clicking pre-load will overwrite the current state!
128
  )
129
 
130
 
131
- qp = st.experimental_get_query_params()
132
- preload = qp.get("preload_dataset", list())
133
  preloaded_id = None
 
134
  did_index = 0
135
  if len(preload) == 1 and preload[0] in all_dataset_ids:
136
  preloaded_id, *_ = preload
137
- state = existing_tag_sets[preloaded_id] or new_state()
 
138
  did_index = all_dataset_ids.index(preloaded_id)
139
 
140
- did = st.sidebar.selectbox(label="Choose dataset to load tag set from", options=all_dataset_ids, index=did_index)
141
-
 
142
  leftbtn, rightbtn = st.sidebar.beta_columns(2)
143
- if leftbtn.button("pre-load tagset"):
144
- state = existing_tag_sets[did] or new_state()
145
- st.experimental_set_query_params(preload_dataset=did)
 
146
  if rightbtn.button("flush state"):
147
  state = new_state()
 
 
148
  st.experimental_set_query_params()
149
 
150
- if preloaded_id is not None:
 
 
 
 
 
 
 
 
 
 
 
151
  st.sidebar.markdown(
152
  f"""
153
- Took [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id}) as base tagset:
 
 
 
 
154
  ```yaml
155
- {yaml.dump(state)}
156
  ```
157
  """
158
  )
@@ -162,7 +184,7 @@ leftcol, _, rightcol = st.beta_columns([12, 1, 12])
162
 
163
 
164
  leftcol.markdown("### Supported tasks")
165
- task_categories = multiselect(
166
  leftcol,
167
  "Task category",
168
  "What categories of task does the dataset support?",
@@ -171,27 +193,27 @@ task_categories = multiselect(
171
  format_func=lambda tg: f"{tg}: {task_set[tg]['description']}",
172
  )
173
  task_specifics = []
174
- for tg in task_categories:
175
- task_specs = multiselect(
176
  leftcol,
177
- "Specific tasks",
178
- f"What specific *{tg}* tasks does the dataset support?",
179
- values=[ts for ts in state["task_ids"] if ts in task_set[tg]["options"]],
180
  valid_set=task_set[tg]["options"],
181
  )
182
- if "other" in task_specs:
183
  other_task = st.text_input(
184
  "You selected 'other' task. Please enter a short hyphen-separated description for the task:",
185
  value="my-task-description",
186
  )
187
  st.write(f"Registering {tg}-other-{other_task} task")
188
- task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
189
- task_specifics += task_specs
 
190
 
191
 
192
  leftcol.markdown("### Languages")
193
-
194
- multilinguality = multiselect(
195
  leftcol,
196
  "Monolingual?",
197
  "Does the dataset contain more than one language?",
@@ -200,16 +222,15 @@ multilinguality = multiselect(
200
  format_func=lambda m: f"{m} : {multilinguality_set[m]}",
201
  )
202
 
203
- if "other" in multilinguality:
204
  other_multilinguality = st.text_input(
205
  "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
206
  value="my-multilinguality",
207
  )
208
  st.write(f"Registering other-{other_multilinguality} multilinguality")
209
- multilinguality[multilinguality.index("other")] = f"other-{other_multilinguality}"
210
 
211
-
212
- languages = multiselect(
213
  leftcol,
214
  "Languages",
215
  "What languages are represented in the dataset?",
@@ -220,14 +241,14 @@ languages = multiselect(
220
 
221
 
222
  leftcol.markdown("### Dataset creators")
223
- language_creators = multiselect(
224
  leftcol,
225
  "Data origin",
226
  "Where does the text in the dataset come from?",
227
  values=state["language_creators"],
228
  valid_set=creator_set["language"],
229
  )
230
- annotations_creators = multiselect(
231
  leftcol,
232
  "Annotations origin",
233
  "Where do the annotations in the dataset come from?",
@@ -236,7 +257,7 @@ annotations_creators = multiselect(
236
  )
237
 
238
 
239
- licenses = multiselect(
240
  leftcol,
241
  "Licenses",
242
  "What licenses is the dataset under?",
@@ -244,13 +265,13 @@ licenses = multiselect(
244
  values=state["licenses"],
245
  format_func=lambda l: f"{l} : {license_set[l]}",
246
  )
247
- if "other" in licenses:
248
  other_license = st.text_input(
249
  "You selected 'other' type of license. Please enter a short hyphen-separated description:",
250
  value="my-license",
251
  )
252
  st.write(f"Registering other-{other_license} license")
253
- licenses[licenses.index("other")] = f"other-{other_license}"
254
 
255
  # link to supported datasets
256
  pre_select_ext_a = []
@@ -258,17 +279,16 @@ if "original" in state["source_datasets"]:
258
  pre_select_ext_a += ["original"]
259
  if any([p.startswith("extended") for p in state["source_datasets"]]):
260
  pre_select_ext_a += ["extended"]
261
- extended = multiselect(
262
  leftcol,
263
  "Relations to existing work",
264
  "Does the dataset contain original data and/or was it extended from other datasets?",
265
  values=pre_select_ext_a,
266
  valid_set=["original", "extended"],
267
  )
268
- source_datasets = ["original"] if "original" in extended else []
269
 
270
- # todo: show bad tags
271
- if "extended" in extended:
272
  pre_select_ext_b = [p.split("|")[1] for p in state["source_datasets"] if p.startswith("extended")]
273
  extended_sources = multiselect(
274
  leftcol,
@@ -284,43 +304,40 @@ if "extended" in extended:
284
  )
285
  st.write(f"Registering other-{other_extended_sources} dataset")
286
  extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
287
- source_datasets += [f"extended|{src}" for src in extended_sources]
288
 
289
  size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"]
290
  current_size_cats = state.get("size_categories") or ["unknown"]
291
  ok, nonok = split_known(current_size_cats, size_cats)
292
  if len(nonok) > 0:
293
  leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
294
- size_category = leftcol.selectbox(
295
- "What is the size category of the dataset?",
296
- options=size_cats,
297
- index=size_cats.index(ok[0]) if len(ok) > 0 else 0,
298
- )
 
 
299
 
300
 
301
  ########################
302
  ## Show results
303
  ########################
304
- yamlblock = yaml.dump(
305
- {
306
- "task_categories": task_categories,
307
- "task_ids": task_specifics,
308
- "multilinguality": multilinguality,
309
- "languages": languages,
310
- "language_creators": language_creators,
311
- "annotations_creators": annotations_creators,
312
- "source_datasets": source_datasets,
313
- "size_categories": size_category,
314
- "licenses": licenses,
315
- }
316
- )
317
  rightcol.markdown(
318
  f"""
319
  ### Finalized tag set
320
 
321
- Copy it into your dataset's `README.md` header! 🤗
322
 
323
  ```yaml
324
- {yamlblock}
325
- ```""",
 
326
  )
 
4
 
5
  import streamlit as st
6
  import yaml
7
+ from datasets.utils.metadata_validator import DatasetMetadata
8
 
9
  st.set_page_config(
10
  page_title="HF Dataset Tagging App",
 
129
  )
130
 
131
 
132
+ queryparams = st.experimental_get_query_params()
133
+ preload = queryparams.get("preload_dataset", list())
134
  preloaded_id = None
135
+ initial_state = None
136
  did_index = 0
137
  if len(preload) == 1 and preload[0] in all_dataset_ids:
138
  preloaded_id, *_ = preload
139
+ initial_state = existing_tag_sets.get(preloaded_id)
140
+ state = initial_state or new_state()
141
  did_index = all_dataset_ids.index(preloaded_id)
142
 
143
+ preloaded_id = st.sidebar.selectbox(
144
+ label="Choose dataset to load tag set from", options=all_dataset_ids, index=did_index
145
+ )
146
  leftbtn, rightbtn = st.sidebar.beta_columns(2)
147
+ if leftbtn.button("pre-load"):
148
+ initial_state = existing_tag_sets[preloaded_id]
149
+ state = initial_state or new_state()
150
+ st.experimental_set_query_params(preload_dataset=preloaded_id)
151
  if rightbtn.button("flush state"):
152
  state = new_state()
153
+ initial_state = None
154
+ preloaded_id = None
155
  st.experimental_set_query_params()
156
 
157
+ if preloaded_id is not None and initial_state is not None:
158
+ try:
159
+ DatasetMetadata(**initial_state)
160
+ valid = "✔️ This is a valid tagset!"
161
+ except Exception as e:
162
+ valid = f"""
163
+ πŸ™ This is an invalid tagset, here are the errors in it:
164
+ ```
165
+ {e}
166
+ ```
167
+ You're _very_ welcome to fix these issues and submit a new PR on [`datasets`](https://github.com/huggingface/datasets/)
168
+ """
169
  st.sidebar.markdown(
170
  f"""
171
+ ---
172
+ The current base tagset is [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id})
173
+ {valid}
174
+ Here is the matching yaml block:
175
+
176
  ```yaml
177
+ {yaml.dump(initial_state)}
178
  ```
179
  """
180
  )
 
184
 
185
 
186
  leftcol.markdown("### Supported tasks")
187
+ state["task_categories"] = multiselect(
188
  leftcol,
189
  "Task category",
190
  "What categories of task does the dataset support?",
 
193
  format_func=lambda tg: f"{tg}: {task_set[tg]['description']}",
194
  )
195
  task_specifics = []
196
+ for tg in state["task_categories"]:
197
+ specs = multiselect(
198
  leftcol,
199
+ f"Specific _{tg}_ tasks",
200
+ f"What specific tasks does the dataset support?",
201
+ values=[ts for ts in (state["task_ids"] or []) if ts in task_set[tg]["options"]],
202
  valid_set=task_set[tg]["options"],
203
  )
204
+ if "other" in specs:
205
  other_task = st.text_input(
206
  "You selected 'other' task. Please enter a short hyphen-separated description for the task:",
207
  value="my-task-description",
208
  )
209
  st.write(f"Registering {tg}-other-{other_task} task")
210
+ specs[specs.index("other")] = f"{tg}-other-{other_task}"
211
+ task_specifics += specs
212
+ state["task_ids"] = task_specifics
213
 
214
 
215
  leftcol.markdown("### Languages")
216
+ state["multilinguality"] = multiselect(
 
217
  leftcol,
218
  "Monolingual?",
219
  "Does the dataset contain more than one language?",
 
222
  format_func=lambda m: f"{m} : {multilinguality_set[m]}",
223
  )
224
 
225
+ if "other" in state["multilinguality"]:
226
  other_multilinguality = st.text_input(
227
  "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
228
  value="my-multilinguality",
229
  )
230
  st.write(f"Registering other-{other_multilinguality} multilinguality")
231
+ state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
232
 
233
+ state["languages"] = multiselect(
 
234
  leftcol,
235
  "Languages",
236
  "What languages are represented in the dataset?",
 
241
 
242
 
243
  leftcol.markdown("### Dataset creators")
244
+ state["language_creators"] = multiselect(
245
  leftcol,
246
  "Data origin",
247
  "Where does the text in the dataset come from?",
248
  values=state["language_creators"],
249
  valid_set=creator_set["language"],
250
  )
251
+ state["annotations_creators"] = multiselect(
252
  leftcol,
253
  "Annotations origin",
254
  "Where do the annotations in the dataset come from?",
 
257
  )
258
 
259
 
260
+ state["licenses"] = multiselect(
261
  leftcol,
262
  "Licenses",
263
  "What licenses is the dataset under?",
 
265
  values=state["licenses"],
266
  format_func=lambda l: f"{l} : {license_set[l]}",
267
  )
268
+ if "other" in state["licenses"]:
269
  other_license = st.text_input(
270
  "You selected 'other' type of license. Please enter a short hyphen-separated description:",
271
  value="my-license",
272
  )
273
  st.write(f"Registering other-{other_license} license")
274
+ state["licenses"][state["licenses"].index("other")] = f"other-{other_license}"
275
 
276
  # link to supported datasets
277
  pre_select_ext_a = []
 
279
  pre_select_ext_a += ["original"]
280
  if any([p.startswith("extended") for p in state["source_datasets"]]):
281
  pre_select_ext_a += ["extended"]
282
+ state["extended"] = multiselect(
283
  leftcol,
284
  "Relations to existing work",
285
  "Does the dataset contain original data and/or was it extended from other datasets?",
286
  values=pre_select_ext_a,
287
  valid_set=["original", "extended"],
288
  )
289
+ state["source_datasets"] = ["original"] if "original" in state["extended"] else []
290
 
291
+ if "extended" in state["extended"]:
 
292
  pre_select_ext_b = [p.split("|")[1] for p in state["source_datasets"] if p.startswith("extended")]
293
  extended_sources = multiselect(
294
  leftcol,
 
304
  )
305
  st.write(f"Registering other-{other_extended_sources} dataset")
306
  extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
307
+ state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
308
 
309
  size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"]
310
  current_size_cats = state.get("size_categories") or ["unknown"]
311
  ok, nonok = split_known(current_size_cats, size_cats)
312
  if len(nonok) > 0:
313
  leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
314
+ state["size_categories"] = [
315
+ leftcol.selectbox(
316
+ "What is the size category of the dataset?",
317
+ options=size_cats,
318
+ index=size_cats.index(ok[0]) if len(ok) > 0 else 0,
319
+ )
320
+ ]
321
 
322
 
323
  ########################
324
  ## Show results
325
  ########################
326
+ try:
327
+ DatasetMetadata(**state)
328
+ valid = "✔ Validated! Copy it into your dataset's `README.md` header! 🤗 "
329
+ except Exception as e:
330
+ valid = f"""🙁 Could not validate:
331
+ ```{e}```
332
+ """
 
 
 
 
 
 
333
  rightcol.markdown(
334
  f"""
335
  ### Finalized tag set
336
 
337
+ {valid}
338
 
339
  ```yaml
340
+ {yaml.dump(state)}
341
+ ```
342
+ """,
343
  )