theo committed
Commit c4882f0
1 Parent(s): ef36700

rely on tagsets from datasets

Files changed (1):
  1. tagging_app.py +18 -45
tagging_app.py CHANGED
@@ -5,7 +5,9 @@ from typing import Callable, Dict, List, Tuple
 import langcodes as lc
 import streamlit as st
 import yaml
-from datasets.utils.metadata import DatasetMetadata
+from datasets.utils.metadata import (DatasetMetadata, known_creators,
+                                     known_licenses, known_multilingualities,
+                                     known_size_categories, known_task_ids)
 
 st.set_page_config(
     page_title="HF Dataset Tagging App",
@@ -26,34 +28,6 @@ st.markdown(
     unsafe_allow_html=True,
 )
 
-task_set = json.load(open("task_set.json"))
-license_set = json.load(open("license_set.json"))
-
-multilinguality_set = {
-    "monolingual": "contains a single language",
-    "multilingual": "contains multiple languages",
-    "translation": "contains translated or aligned text",
-    "other": "other type of language distribution",
-}
-
-creator_set = {
-    "language": [
-        "found",
-        "crowdsourced",
-        "expert-generated",
-        "machine-generated",
-        "other",
-    ],
-    "annotations": [
-        "found",
-        "crowdsourced",
-        "expert-generated",
-        "machine-generated",
-        "no-annotation",
-        "other",
-    ],
-}
-
 ########################
 ## Helper functions
 ########################
@@ -117,7 +91,7 @@ def new_state() -> Dict[str, List]:
 
 
 def is_state_empty(state: Dict[str, List]) -> bool:
-    return sum(len(v) if v is not None else 0 for v in state.values()) > 0
+    return sum(len(v) if v is not None else 0 for v in state.values()) == 0
 
 
 state = new_state()
@@ -160,7 +134,7 @@ if leftbtn.button("pre-load"):
     initial_state = existing_tag_sets[preloaded_id]
     state = initial_state or new_state()
     st.experimental_set_query_params(preload_dataset=preloaded_id)
-if is_state_empty(state):
+if not is_state_empty(state):
     if rightbtn.button("flush state"):
         state = new_state()
         initial_state = None
@@ -195,8 +169,8 @@ state["task_categories"] = multiselect(
     "Task category",
     "What categories of task does the dataset support?",
     values=state["task_categories"],
-    valid_set=list(task_set.keys()),
-    format_func=lambda tg: f"{tg}: {task_set[tg]['description']}",
+    valid_set=list(known_task_ids.keys()),
+    format_func=lambda tg: f"{tg}: {known_task_ids[tg]['description']}",
 )
 task_specifics = []
 for tg in state["task_categories"]:
@@ -204,8 +178,8 @@ for tg in state["task_categories"]:
         leftcol,
         f"Specific _{tg}_ tasks",
         f"What specific tasks does the dataset support?",
-        values=[ts for ts in (state["task_ids"] or []) if ts in task_set[tg]["options"]],
-        valid_set=task_set[tg]["options"],
+        values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[tg]["options"]],
+        valid_set=known_task_ids[tg]["options"],
     )
     if "other" in specs:
         other_task = st.text_input(
@@ -224,8 +198,8 @@ state["multilinguality"] = multiselect(
     "Monolingual?",
     "Does the dataset contain more than one language?",
     values=state["multilinguality"],
-    valid_set=list(multilinguality_set.keys()),
-    format_func=lambda m: f"{m} : {multilinguality_set[m]}",
+    valid_set=list(known_multilingualities.keys()),
+    format_func=lambda m: f"{m} : {known_multilingualities[m]}",
 )
 
 if "other" in state["multilinguality"]:
@@ -260,14 +234,14 @@ state["language_creators"] = multiselect(
     "Data origin",
     "Where does the text in the dataset come from?",
     values=state["language_creators"],
-    valid_set=creator_set["language"],
+    valid_set=known_creators["language"],
 )
 state["annotations_creators"] = multiselect(
     leftcol,
     "Annotations origin",
     "Where do the annotations in the dataset come from?",
     values=state["annotations_creators"],
-    valid_set=creator_set["annotations"],
+    valid_set=known_creators["annotations"],
 )
 
 
@@ -275,9 +249,9 @@ state["licenses"] = multiselect(
     leftcol,
     "Licenses",
     "What licenses is the dataset under?",
-    valid_set=list(license_set.keys()),
+    valid_set=list(known_licenses.keys()),
     values=state["licenses"],
-    format_func=lambda l: f"{l} : {license_set[l]}",
+    format_func=lambda l: f"{l} : {known_licenses[l]}",
 )
 if "other" in state["licenses"]:
     other_license = st.text_input(
@@ -320,16 +294,15 @@ if "extended" in state["extended"]:
         extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
     state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
 
-size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M", ...]
 current_size_cats = state.get("size_categories") or ["unknown"]
-ok, nonok = split_known(current_size_cats, size_cats)
+ok, nonok = split_known(current_size_cats, known_size_categories)
 if len(nonok) > 0:
     leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
 state["size_categories"] = [
     leftcol.selectbox(
         "What is the size category of the dataset?",
-        options=size_cats,
-        index=size_cats.index(ok[0]) if len(ok) > 0 else 0,
+        options=known_size_categories,
+        index=known_size_categories.index(ok[0]) if len(ok) > 0 else 0,
     )
 ]
 
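Note on the new imports: below is a minimal sketch of how the tagsets pulled in above can be inspected. It assumes a `datasets` release that still exposes these names in `datasets.utils.metadata` (the commit treats them as drop-in replacements for the hardcoded dicts it deletes), and the shapes shown follow the way the app itself indexes them.

from datasets.utils.metadata import (
    known_creators,
    known_licenses,
    known_multilingualities,
    known_size_categories,
    known_task_ids,
)

# Each task category maps to a dict with a "description" and a list of
# specific task ids under "options", which is how the widgets above use it.
for category, info in known_task_ids.items():
    print(f"{category}: {info['description']}")
    print(f"  options: {info['options']}")

# The remaining tagsets mirror the structures that used to be hardcoded:
print(known_creators["language"])      # e.g. "found", "crowdsourced", ...
print(known_creators["annotations"])   # annotation-origin tags
print(list(known_licenses))            # license id -> description mapping
print(known_multilingualities)         # multilinguality tag -> description
print(known_size_categories)           # size buckets such as "n<1K"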
 
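The change to `is_state_empty` makes the helper's return value match its name (True only when every tag list is empty), and the call site is negated to match, so the visible behavior, offering "flush state" only when there is something to flush, stays the same. A small self-contained check of the predicate; the sample states below are hypothetical, not taken from the app.

from typing import Dict, List

def is_state_empty(state: Dict[str, List]) -> bool:
    # True only when no tag list holds a value; None counts as empty.
    return sum(len(v) if v is not None else 0 for v in state.values()) == 0

# Hypothetical states for illustration:
assert is_state_empty({"task_categories": [], "licenses": None})
assert not is_state_empty({"task_categories": ["text-classification"], "licenses": []})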
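`split_known`, used in the size-category hunk, is a helper that is not shown in this diff. Judging only from its call site, it partitions the stored tags into values that belong to the known set and values that do not; the version below is a hypothetical reimplementation for illustration, not the app's actual code.

from typing import List, Tuple

def split_known(values: List[str], known: List[str]) -> Tuple[List[str], List[str]]:
    # Partition `values` into tags present in `known` and tags that are not.
    ok = [v for v in values if v in known]
    nonok = [v for v in values if v not in known]
    return ok, nonok

# A stale or misspelled size tag lands in `nonok` and gets flagged in the UI:
ok, nonok = split_known(["n<1K", "10k<n<100k"], ["unknown", "n<1K", "1K<n<10K", "10K<n<100K"])
assert ok == ["n<1K"]
assert nonok == ["10k<n<100k"]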