Yacine Jernite committed on
Commit
9994065
1 Parent(s): 3578aa2

can download

Browse files
Files changed (5) hide show
  1. app.py +37 -12
  2. datacards/curation.py +128 -18
  3. datacards/gem.py +51 -2
  4. datacards/overview.py +1 -1
  5. datacards/results.py +1 -1
app.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  from datetime import datetime
2
 
3
  import datacards
@@ -60,8 +63,8 @@ def main():
60
  pages = {
61
  "Dataset at a Glance": glance_page,
62
  "Section: Dataset Overview": overview_page,
63
- "Section: Dataset in GEM": gem_page,
64
  "Section: Dataset Curation": curation_page,
 
65
  "Section: Previous Results": results_page,
66
  "Section: Considerations for Using Data": considerations_page,
67
  "Section: Broader Social Context": context_page,
@@ -78,30 +81,52 @@ def main():
78
  def glance_page():
79
  with st.expander("Dataset at a Glance", expanded=True):
80
  dataset_summary = ""
81
- dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '')}\n"
82
- dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '')}\n"
83
- dataset_summary += f"- **Dataset License**: {st.session_state.save_state.get('overview_languages_license', '')}\n"
84
- dataset_summary += f"- **Multilingual Dataset**: {st.session_state.save_state.get('overview_languages_is-multilingual', '')}\n"
85
- dataset_summary += f"- **Dataset Languages**: {', '.join(st.session_state.save_state.get('overview_languages_language-names', []))}\n"
86
- dataset_summary += f"- **Dataset Supported Task**: {st.session_state.save_state.get('overview_languages_task', '')}\n"
87
- dataset_summary += f"- **Communicative Goal**: {st.session_state.save_state.get('dataset_communicative', '')}\n"
88
- dataset_summary += f"- **Language Data Origin**: {st.session_state.save_state.get('curation_language_origin', '')}\n"
89
- dataset_summary += f"- **Annotation Data Origin**: {st.session_state.save_state.get('curation_annotation_origin', '')}\n"
90
- dataset_summary += f"- **Likelihood of PII**: {st.session_state.save_state.get('pii_likelihood', '')}\n"
91
  st.markdown(dataset_summary + "---\n")
92
  num_fields = sum([len(dct) for k in st.session_state.get("card_dict", {}) for dct in st.session_state.card_dict.get(k, {}).values()])
93
  st.markdown(f"You have currently filled out **{num_fields} of {_N_FIELDS} required fields** in the data card.")
94
  left_col, right_col = st.columns(2)
95
  with left_col:
96
  overview_summary()
97
- gem_summary()
98
  curation_summary()
 
99
  with right_col:
100
  results_summary()
101
  considerations_summary()
102
  context_summary()
103
 
104
  def review_page():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  st.write(st.session_state.get("card_dict", {}))
106
  # TODO add buttons to save and download
107
 
 
1
+ import json
2
+ import re
3
+
4
  from datetime import datetime
5
 
6
  import datacards
 
63
  pages = {
64
  "Dataset at a Glance": glance_page,
65
  "Section: Dataset Overview": overview_page,
 
66
  "Section: Dataset Curation": curation_page,
67
+ "Section: Dataset in GEM": gem_page,
68
  "Section: Previous Results": results_page,
69
  "Section: Considerations for Using Data": considerations_page,
70
  "Section: Broader Social Context": context_page,
 
81
  def glance_page():
82
  with st.expander("Dataset at a Glance", expanded=True):
83
  dataset_summary = ""
84
+ dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '*Go to `Section: Dataset Overview` to fill in*')}\n"
85
+ dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '*Go to `Section: Dataset Overview` to fill in*')}\n"
86
+ dataset_summary += f"- **Dataset License**: {st.session_state.save_state.get('overview_languages_license', '*Go to `Section: Dataset Overview` to fill in*')}\n"
87
+ dataset_summary += f"- **Multilingual Dataset**: {st.session_state.save_state.get('overview_languages_is-multilingual', '*Go to `Section: Dataset Overview` to fill in*')}\n"
88
+ dataset_summary += f"- **Dataset Languages**: {st.session_state.save_state.get('overview_languages_language-names', '*Go to `Section: Dataset Overview` to fill in*')}\n"
89
+ dataset_summary += f"- **Dataset Supported Task**: {st.session_state.save_state.get('overview_languages_task', '*Go to `Section: Dataset Overview` to fill in*')}\n"
90
+ dataset_summary += f"- **Communicative Goal**: {st.session_state.save_state.get('overview_languages_communicative', '*Go to `Section: Dataset Overview` to fill in*')}\n"
91
+ dataset_summary += f"- **Language Data Origin**: {st.session_state.save_state.get('curation_language_obtained', '*Go to `Section: Dataset Curation` to fill in*')}\n"
92
+ dataset_summary += f"- **Annotation Data Origin**: {st.session_state.save_state.get('curation_annotations_obtained', '*Go to `Section: Dataset Curation` to fill in*')}\n"
93
+ dataset_summary += f"- **Likelihood of PII**: {st.session_state.save_state.get('curation_pii_has-pii', '*Go to `Section: Dataset Curation` to fill in*')}\n"
94
  st.markdown(dataset_summary + "---\n")
95
  num_fields = sum([len(dct) for k in st.session_state.get("card_dict", {}) for dct in st.session_state.card_dict.get(k, {}).values()])
96
  st.markdown(f"You have currently filled out **{num_fields} of {_N_FIELDS} required fields** in the data card.")
97
  left_col, right_col = st.columns(2)
98
  with left_col:
99
  overview_summary()
 
100
  curation_summary()
101
+ gem_summary()
102
  with right_col:
103
  results_summary()
104
  considerations_summary()
105
  context_summary()
106
 
107
  def review_page():
108
+ dataset_name = st.text_input(
109
+ label="Enter dataset name here",
110
+ )
111
+ if dataset_name != "":
112
+ friendly_name = re.sub(
113
+ r"[^\w\s]", " ", dataset_name.lower()
114
+ ).strip().replace(" ", "_")
115
+ current_date = datetime.now().strftime(
116
+ "%m/%d/%Y, %H:%M:%S"
117
+ )
118
+ friendly_date = re.sub(
119
+ r"[^\w\s]", "_", current_date
120
+ ).replace(" ", "_").replace("__", "_").replace("-", "")
121
+ dataset_file_name = f"{friendly_name}-{friendly_date}.json"
122
+ st.download_button(
123
+ label=f"Download the Dataset Card below as {dataset_file_name}",
124
+ data=json.dumps(st.session_state.get("card_dict", {}), indent=2),
125
+ file_name=dataset_file_name,
126
+ )
127
+ else:
128
+ st.markdown("Enter dataset name above to save!")
129
+ st.markdown("---\n")
130
  st.write(st.session_state.get("card_dict", {}))
131
  # TODO add buttons to save and download
132
 
datacards/curation.py CHANGED
@@ -13,10 +13,9 @@ from .streamlit_utils import (
13
  N_FIELDS_ORIGINAL = 4
14
  N_FIELDS_LANGUAGE = 12
15
  N_FIELDS_ANNOTATIONS = 10
16
- N_FIELDS_CONSENT = 0
17
- N_FIELDS_PII = 0
18
- N_FIELDS_MAINTENANCE = 0
19
- N_FIELDS_GEM = 0
20
 
21
  N_FIELDS = (
22
  N_FIELDS_ORIGINAL
@@ -25,15 +24,9 @@ N_FIELDS = (
25
  + N_FIELDS_CONSENT
26
  + N_FIELDS_PII
27
  + N_FIELDS_MAINTENANCE
28
- + N_FIELDS_GEM
29
  )
30
 
31
 
32
- """
33
- What was the selection criteria? [Describe the process for selecting instances to include in the dataset, including any tools used.]
34
- """
35
-
36
-
37
  def curation_page():
38
  st.session_state.card_dict["curation"] = st.session_state.card_dict.get(
39
  "curation", {}
@@ -64,6 +57,7 @@ def curation_page():
64
  key_list=key_pref + ["aggregated-sources"],
65
  help="Otherwise, type N/A",
66
  )
 
67
  with st.expander("Language Data", expanded=False):
68
  key_pref = ["curation", "language"]
69
  st.session_state.card_dict["curation"]["language"] = st.session_state.card_dict[
@@ -222,27 +216,144 @@ def curation_page():
222
  help="Describe how quality was ensured in the data curation process.",
223
  )
224
 
225
-
226
  with st.expander("Consent", expanded=False):
227
  key_pref = ["curation", "consent"]
228
  st.session_state.card_dict["curation"]["consent"] = st.session_state.card_dict[
229
  "curation"
230
  ].get("consent", {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  with st.expander("Private Identifying Information (PII)", expanded=False):
232
  key_pref = ["curation", "pii"]
233
  st.session_state.card_dict["curation"]["pii"] = st.session_state.card_dict[
234
  "curation"
235
  ].get("pii", {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  with st.expander("Maintenance", expanded=False):
237
  key_pref = ["curation", "maintenance"]
238
  st.session_state.card_dict["curation"][
239
  "maintenance"
240
  ] = st.session_state.card_dict["curation"].get("maintenance", {})
241
- with st.expander("GEM Additional Curation", expanded=False):
242
- key_pref = ["curation", "gem"]
243
- st.session_state.card_dict["curation"]["gem"] = st.session_state.card_dict[
244
- "curation"
245
- ].get("gem", {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
 
248
  def curation_summary():
@@ -254,7 +365,7 @@ def curation_summary():
254
  ):
255
  completion_markdown = ""
256
  completion_markdown += (
257
- f"- **Overall competion:**\n - {total_filled} of {N_FIELDS} fields\n"
258
  )
259
  completion_markdown += f"- **Sub-section - Original Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('original', {}))} of {N_FIELDS_ORIGINAL} fields\n"
260
  completion_markdown += f"- **Sub-section - Language Data:**\n - {len(st.session_state.card_dict.get('curation', {}).get('language', {}))} of {N_FIELDS_LANGUAGE} fields\n"
@@ -262,5 +373,4 @@ def curation_summary():
262
  completion_markdown += f"- **Sub-section - Consent:**\n - {len(st.session_state.card_dict.get('curation', {}).get('consent', {}))} of {N_FIELDS_CONSENT} fields\n"
263
  completion_markdown += f"- **Sub-section - PII:**\n - {len(st.session_state.card_dict.get('curation', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
264
  completion_markdown += f"- **Sub-section - Maintenance:**\n - {len(st.session_state.card_dict.get('curation', {}).get('maintenance', {}))} of {N_FIELDS_MAINTENANCE} fields\n"
265
- completion_markdown += f"- **Sub-section - GEM Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('gem', {}))} of {N_FIELDS_GEM} fields\n"
266
  st.markdown(completion_markdown)
 
13
  N_FIELDS_ORIGINAL = 4
14
  N_FIELDS_LANGUAGE = 12
15
  N_FIELDS_ANNOTATIONS = 10
16
+ N_FIELDS_CONSENT = 4
17
+ N_FIELDS_PII = 7
18
+ N_FIELDS_MAINTENANCE = 6
 
19
 
20
  N_FIELDS = (
21
  N_FIELDS_ORIGINAL
 
24
  + N_FIELDS_CONSENT
25
  + N_FIELDS_PII
26
  + N_FIELDS_MAINTENANCE
 
27
  )
28
 
29
 
 
 
 
 
 
30
  def curation_page():
31
  st.session_state.card_dict["curation"] = st.session_state.card_dict.get(
32
  "curation", {}
 
57
  key_list=key_pref + ["aggregated-sources"],
58
  help="Otherwise, type N/A",
59
  )
60
+
61
  with st.expander("Language Data", expanded=False):
62
  key_pref = ["curation", "language"]
63
  st.session_state.card_dict["curation"]["language"] = st.session_state.card_dict[
 
216
  help="Describe how quality was ensured in the data curation process.",
217
  )
218
 
 
219
  with st.expander("Consent", expanded=False):
220
  key_pref = ["curation", "consent"]
221
  st.session_state.card_dict["curation"]["consent"] = st.session_state.card_dict[
222
  "curation"
223
  ].get("consent", {})
224
+ make_radio(
225
+ label="Was there a consent policy involved when gathering the data?",
226
+ options=["no", "yes"],
227
+ key_list=key_pref+["has-consent"],
228
+ )
229
+ if st.session_state.card_dict["curation"]["consent"]["has-consent"] == "yes":
230
+ make_text_area(
231
+ label="What was the consent policy?",
232
+ key_list=key_pref+["consent-policy"],
233
+ help="If available, provide the text that data creators were shown, else, describe the process.",
234
+ )
235
+ make_text_area(
236
+ label="What other downstream uses of the data did the original data creators and the data curators consent to?",
237
+ key_list=key_pref+["consent-other"],
238
+ )
239
+ st.session_state.card_dict["curation"]["consent"]["no-consent-justification"] = "N/A"
240
+ else:
241
+ st.session_state.card_dict["curation"]["consent"]["consent-policy"] = "N/A"
242
+ st.session_state.card_dict["curation"]["consent"]["consent-other"] = "N/A"
243
+ make_text_area(
244
+ label="If not, what is the justification for reusing the data? ",
245
+ key_list=key_pref+["no-consent-justification"],
246
+ help="Why would be a justification the data without consent of the data creators in this case?",
247
+ )
248
+
249
  with st.expander("Private Identifying Information (PII)", expanded=False):
250
  key_pref = ["curation", "pii"]
251
  st.session_state.card_dict["curation"]["pii"] = st.session_state.card_dict[
252
  "curation"
253
  ].get("pii", {})
254
+ make_radio(
255
+ label="Does the source language data likely contain Personal Identifying Information about the data creators or subjects?",
256
+ options=["yes/very likely", "likely", "unlikely", "no PII"],
257
+ key_list=key_pref+["has-pii"],
258
+ help="most datasets have some form of PII: names, addresses, emails, account names, personal beliefs, gender, etc. - select `no PII` only if sure",
259
+ )
260
+ if st.session_state.card_dict["curation"]["pii"]["has-pii"] == "no PII":
261
+ make_text_area(
262
+ label="Provide a justification for selecting `no PII` above.",
263
+ key_list=key_pref+["no-pii-justification"],
264
+ help="for example, if the text is about general knowledge without references to the author or to any persons.",
265
+ )
266
+ st.session_state.card_dict["curation"]["pii"]["pii-categories"] = []
267
+ st.session_state.card_dict["curation"]["pii"]["is-pii-identified"] = "N/A"
268
+ st.session_state.card_dict["curation"]["pii"]["pii-identified-method"] = "N/A"
269
+ st.session_state.card_dict["curation"]["pii"]["is-pii-replaced"] = "N/A"
270
+ st.session_state.card_dict["curation"]["pii"]["pii-replaced-method"] = "N/A"
271
+ else:
272
+ st.session_state.card_dict["curation"]["pii"]["no-pii-justification"] = "N/A"
273
+ pii_help_text = """
274
+ - Personally identifying general information includes names, physical and email addresses, website accounts with names or handles, dates (birth, death, etc.), full-face photographs and comparable images, URLS, and biometric identifiers (fingerprints, voice, etc.).
275
+ - Personally identifying numbers include information such as telephone numbers, fax numbers, vehicle and device identifiers and serial numbers, social security numbers and equivalent, IP addresses, medical record numbers, health plan beneficiary numbers, account numbers, certificate/license numbers, and any other uniquely identifying numbers.
276
+ - Sensitive information includes descriptions of racial or ethnic origin, political opinions, religious or philosophical beliefs, trade-union membership, genetic data, health-related data, and data concerning a person's sex life or sexual orientation.
277
+ """
278
+ make_multiselect(
279
+ label="What categories of PII are present or suspected in the data?",
280
+ options=["generic PII", "numeric PII", "sensitive information"],
281
+ key_list=key_pref+["pii-categories"],
282
+ help=pii_help_text,
283
+ )
284
+ make_radio(
285
+ label="Did the curators use any automatic/manual method to identify PII in the dataset?",
286
+ options=["no identification", "manual identification", "automatic identification", "mixed method"],
287
+ key_list=key_pref+["is-pii-identified"],
288
+ )
289
+ if st.session_state.card_dict["curation"]["pii"]["is-pii-identified"] == "no identification":
290
+ st.session_state.card_dict["curation"]["pii"]["pii-identified-method"] = "N/A"
291
+ st.session_state.card_dict["curation"]["pii"]["is-pii-replaced"] = "N/A"
292
+ st.session_state.card_dict["curation"]["pii"]["pii-replaced-method"] = "N/A"
293
+ else:
294
+ make_text_area(
295
+ label="Describe the method used to identify PII in the dataset",
296
+ key_list=key_pref+["pii-identified-method"],
297
+ )
298
+ make_radio(
299
+ label="Was the PII pseudonymized/handled somehow?",
300
+ options=["no", "yes"],
301
+ key_list=key_pref+["is-pii-replaced"],
302
+ )
303
+ if st.session_state.card_dict["curation"]["pii"]["is-pii-replaced"] == "yes":
304
+ make_text_area(
305
+ label="Describe the methods that were used to process the PII",
306
+ key_list=key_pref+["pii-replaced-method"],
307
+ )
308
+ else:
309
+ st.session_state.card_dict["curation"]["pii"]["pii-replaced-method"] = "N/A"
310
+
311
  with st.expander("Maintenance", expanded=False):
312
  key_pref = ["curation", "maintenance"]
313
  st.session_state.card_dict["curation"][
314
  "maintenance"
315
  ] = st.session_state.card_dict["curation"].get("maintenance", {})
316
+ make_radio(
317
+ label="Does the original dataset have a maintenance plan?",
318
+ options=["no", "yes"],
319
+ key_list=key_pref+["has-maintenance"],
320
+ help="this can include planned update or a commitment to removing content on request",
321
+ )
322
+ if st.session_state.card_dict["curation"]["maintenance"]["has-maintenance"] == "yes":
323
+ make_text_area(
324
+ label="Describe the original dataset's maintenance plan.",
325
+ key_list=key_pref+["description"],
326
+ )
327
+ make_text_area(
328
+ label="Provide contact information of a person responsible for the dataset maintenance",
329
+ key_list=key_pref+["contact"],
330
+ )
331
+ make_radio(
332
+ label="Does the maintenance plan include a contestation mechanism allowing individuals to request removal fo content?",
333
+ options=["no mechanism", "form submission", "contact maintainer", "other"],
334
+ key_list=key_pref+["contestation-mechanism"],
335
+ )
336
+ if st.session_state.card_dict["curation"]["maintenance"]["contestation-mechanism"] == "no mechanism":
337
+ st.session_state.card_dict["curation"]["maintenance"]["contestation-link"] = "N/A"
338
+ st.session_state.card_dict["curation"]["maintenance"]["contestation-description"] = "N/A"
339
+ elif st.session_state.card_dict["curation"]["maintenance"]["contestation-mechanism"] == "other":
340
+ st.session_state.card_dict["curation"]["maintenance"]["contestation-link"] = "N/A"
341
+ make_text_area(
342
+ label="Describe the contestation mechanism",
343
+ key_list=key_pref+["contestation-description"],
344
+ )
345
+ else:
346
+ make_text_input(
347
+ label="Provide the form link or contact information",
348
+ key_list=key_pref+["contestation-link"],
349
+ )
350
+ st.session_state.card_dict["curation"]["maintenance"]["contestation-description"] = "N/A"
351
+ else:
352
+ st.session_state.card_dict["curation"]["maintenance"]["description"] = "N/A"
353
+ st.session_state.card_dict["curation"]["maintenance"]["contact"] = "N/A"
354
+ st.session_state.card_dict["curation"]["maintenance"]["contestation-mechanism"] = "N/A"
355
+ st.session_state.card_dict["curation"]["maintenance"]["contestation-link"] = "N/A"
356
+ st.session_state.card_dict["curation"]["maintenance"]["contestation-description"] = "N/A"
357
 
358
 
359
  def curation_summary():
 
365
  ):
366
  completion_markdown = ""
367
  completion_markdown += (
368
+ f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
369
  )
370
  completion_markdown += f"- **Sub-section - Original Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('original', {}))} of {N_FIELDS_ORIGINAL} fields\n"
371
  completion_markdown += f"- **Sub-section - Language Data:**\n - {len(st.session_state.card_dict.get('curation', {}).get('language', {}))} of {N_FIELDS_LANGUAGE} fields\n"
 
373
  completion_markdown += f"- **Sub-section - Consent:**\n - {len(st.session_state.card_dict.get('curation', {}).get('consent', {}))} of {N_FIELDS_CONSENT} fields\n"
374
  completion_markdown += f"- **Sub-section - PII:**\n - {len(st.session_state.card_dict.get('curation', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
375
  completion_markdown += f"- **Sub-section - Maintenance:**\n - {len(st.session_state.card_dict.get('curation', {}).get('maintenance', {}))} of {N_FIELDS_MAINTENANCE} fields\n"
 
376
  st.markdown(completion_markdown)
datacards/gem.py CHANGED
@@ -3,14 +3,16 @@ import streamlit as st
3
  from .streamlit_utils import make_text_input
4
 
5
  from .streamlit_utils import (
 
6
  make_text_area,
7
  make_radio,
8
  )
9
 
10
  N_FIELDS_RATIONALE = 5
 
11
  N_FIELDS_STARTING = 2
12
 
13
- N_FIELDS = N_FIELDS_RATIONALE + N_FIELDS_STARTING
14
 
15
 
16
  def gem_page():
@@ -47,6 +49,51 @@ def gem_page():
47
  key_list=key_pref + ["model-ability"],
48
  help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
49
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  with st.expander("Getting Started", expanded=False):
51
  key_pref = ["gem", "starting"]
52
  st.session_state.card_dict["gem"]["starting"] = st.session_state.card_dict[
@@ -64,6 +111,7 @@ def gem_page():
64
  )
65
 
66
 
 
67
  def gem_summary():
68
  total_filled = sum(
69
  [len(dct) for dct in st.session_state.card_dict.get("gem", {}).values()]
@@ -73,8 +121,9 @@ def gem_summary():
73
  ):
74
  completion_markdown = ""
75
  completion_markdown += (
76
- f"- **Overall competion:**\n - {total_filled} of {N_FIELDS} fields\n"
77
  )
78
  completion_markdown += f"- **Sub-section - Rationale:**\n - {len(st.session_state.card_dict.get('gem', {}).get('rationale', {}))} of {N_FIELDS_RATIONALE} fields\n"
 
79
  completion_markdown += f"- **Sub-section - Getting Started:**\n - {len(st.session_state.card_dict.get('gem', {}).get('starting', {}))} of {N_FIELDS_STARTING} fields\n"
80
  st.markdown(completion_markdown)
 
3
  from .streamlit_utils import make_text_input
4
 
5
  from .streamlit_utils import (
6
+ make_multiselect,
7
  make_text_area,
8
  make_radio,
9
  )
10
 
11
  N_FIELDS_RATIONALE = 5
12
+ N_FIELDS_CURATION = 6
13
  N_FIELDS_STARTING = 2
14
 
15
+ N_FIELDS = N_FIELDS_RATIONALE + N_FIELDS_CURATION + N_FIELDS_STARTING
16
 
17
 
18
  def gem_page():
 
49
  key_list=key_pref + ["model-ability"],
50
  help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
51
  )
52
+
53
+ with st.expander("GEM Additional Curation", expanded=False):
54
+ key_pref = ["gem", "curation"]
55
+ st.session_state.card_dict["gem"]["curation"] = st.session_state.card_dict[
56
+ "gem"
57
+ ].get("curation", {})
58
+ make_radio(
59
+ label="Has the GEM version of the dataset been modified in any way (data, processing, splits) from the original curated data?",
60
+ options=["no", "yes"],
61
+ key_list=key_pref+["has-additional-curation"],
62
+ )
63
+ if st.session_state.card_dict["gem"]["curation"]["has-additional-curation"] == "yes":
64
+ make_multiselect(
65
+ label="What changes have been made to he original dataset?",
66
+ options=["data points added", "data points removed", "data points modified", "annotations added", "other"],
67
+ key_list=key_pref+["modification-types"],
68
+ )
69
+ make_text_area(
70
+ label="For each of these changes, described them in more details and provided the intended purpose of the modification",
71
+ key_list=key_pref+["modification-description"],
72
+ )
73
+ make_radio(
74
+ label="Does GEM provide additional splits to the dataset?",
75
+ options=["no", "yes"],
76
+ key_list=key_pref+["has-additional-splits"],
77
+ )
78
+ if st.session_state.card_dict["gem"]["curation"]["has-additional-splits"] == "yes":
79
+ make_text_area(
80
+ label="Describe how the new splits were created",
81
+ key_list=key_pref+["additional-splits-description"],
82
+ )
83
+ make_text_area(
84
+ label="What aspects of the model's generation capacities were the splits created to test?",
85
+ key_list=key_pref+["additional-splits-capacicites"],
86
+ )
87
+ else:
88
+ st.session_state.card_dict["gem"]["curation"]["additional-splits-description"] = "N/A"
89
+ st.session_state.card_dict["gem"]["curation"]["additional-splits-capacicites"] = "N/A"
90
+ else:
91
+ st.session_state.card_dict["gem"]["curation"]["modification-types"] = []
92
+ st.session_state.card_dict["gem"]["curation"]["modification-description"] = "N/A"
93
+ st.session_state.card_dict["gem"]["curation"]["has-additional-splits"] = "no"
94
+ st.session_state.card_dict["gem"]["curation"]["additional-splits-description"] = "N/A"
95
+ st.session_state.card_dict["gem"]["curation"]["additional-splits-capacicites"] = "N/A"
96
+
97
  with st.expander("Getting Started", expanded=False):
98
  key_pref = ["gem", "starting"]
99
  st.session_state.card_dict["gem"]["starting"] = st.session_state.card_dict[
 
111
  )
112
 
113
 
114
+
115
  def gem_summary():
116
  total_filled = sum(
117
  [len(dct) for dct in st.session_state.card_dict.get("gem", {}).values()]
 
121
  ):
122
  completion_markdown = ""
123
  completion_markdown += (
124
+ f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
125
  )
126
  completion_markdown += f"- **Sub-section - Rationale:**\n - {len(st.session_state.card_dict.get('gem', {}).get('rationale', {}))} of {N_FIELDS_RATIONALE} fields\n"
127
+ completion_markdown += f"- **Sub-section - GEM Additional Curation:**\n - {len(st.session_state.card_dict.get('gem', {}).get('curation', {}))} of {N_FIELDS_CURATION} fields\n"
128
  completion_markdown += f"- **Sub-section - Getting Started:**\n - {len(st.session_state.card_dict.get('gem', {}).get('starting', {}))} of {N_FIELDS_STARTING} fields\n"
129
  st.markdown(completion_markdown)
datacards/overview.py CHANGED
@@ -222,7 +222,7 @@ def overview_summary():
222
  ):
223
  completion_markdown = ""
224
  completion_markdown += (
225
- f"- **Overall competion:**\n - {total_filled} of {N_FIELDS} fields\n"
226
  )
227
  completion_markdown += f"- **Sub-section - Where to find:**\n - {len(st.session_state.card_dict.get('overview', {}).get('where', {}))} of {N_FIELDS_WHERE} fields\n"
228
  completion_markdown += f"- **Sub-section - Languages and Intended Use:**\n - {len(st.session_state.card_dict.get('overview', {}).get('languages', {}))} of {N_FIELDS_LANGUAGES} fields\n"
 
222
  ):
223
  completion_markdown = ""
224
  completion_markdown += (
225
+ f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
226
  )
227
  completion_markdown += f"- **Sub-section - Where to find:**\n - {len(st.session_state.card_dict.get('overview', {}).get('where', {}))} of {N_FIELDS_WHERE} fields\n"
228
  completion_markdown += f"- **Sub-section - Languages and Intended Use:**\n - {len(st.session_state.card_dict.get('overview', {}).get('languages', {}))} of {N_FIELDS_LANGUAGES} fields\n"
datacards/results.py CHANGED
@@ -82,7 +82,7 @@ def results_summary():
82
  ):
83
  completion_markdown = ""
84
  completion_markdown += (
85
- f"- **Overall competion:**\n - {total_filled} of {N_FIELDS} fields\n"
86
  )
87
  completion_markdown += f"- **Sub-section - Previous Results:**\n - {len(st.session_state.card_dict.get('results', {}).get('results', {}))} of {N_FIELDS} fields\n"
88
  st.markdown(completion_markdown)
 
82
  ):
83
  completion_markdown = ""
84
  completion_markdown += (
85
+ f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
86
  )
87
  completion_markdown += f"- **Sub-section - Previous Results:**\n - {len(st.session_state.card_dict.get('results', {}).get('results', {}))} of {N_FIELDS} fields\n"
88
  st.markdown(completion_markdown)