Yacine Jernite committed on
Commit
dd1054a
1 Parent(s): 05d58bc

considerations

Browse files
Files changed (1) hide show
  1. datacards/considerations.py +26 -25
datacards/considerations.py CHANGED
@@ -9,8 +9,8 @@ from .streamlit_utils import (
9
  )
10
 
11
  N_FIELDS_PII = 1
12
- N_FIELDS_LICENSES = 3
13
- N_FIELDS_LIMITATIONS = 4
14
 
15
  N_FIELDS = N_FIELDS_PII + N_FIELDS_LICENSES + N_FIELDS_LIMITATIONS
16
 
@@ -25,7 +25,7 @@ def considerations_page():
25
  "considerations"
26
  ].get("pii", {})
27
  make_text_area(
28
- label="Considering your answers to the PII part of the Data Curation Section, describe any potential privacy risks of using the data.",
29
  key_list=key_pref+["risks-description"],
30
  help="In terms for example of having models memorize private information of data subjects or other breaches of privacy."
31
  )
@@ -37,7 +37,7 @@ def considerations_page():
37
  ].get("licenses", {})
38
 
39
  make_multiselect(
40
- label="Are there restrictions on the dataset use?",
41
  options=[
42
  "public domain",
43
  "multiple licenses",
@@ -52,42 +52,43 @@ def considerations_page():
52
  help="Does the license restrict how the dataset can be used?",
53
  )
54
  make_multiselect(
55
- label="Are there restrictions on the underlying data?",
56
- options=["Open", "Non-Commercial", "Copyrighted", "Other"],
 
 
 
 
 
 
 
 
 
57
  key_list=key_pref + ["data-copyright"],
58
- help="Are there restructions on the underlying data?",
59
  )
60
 
61
- with st.expander("Known limitations", expanded=False):
62
  key_pref = ["considerations", "limitations"]
63
  st.session_state.card_dict["considerations"]["limitations"] = st.session_state.card_dict[
64
  "considerations"
65
  ].get("limitations", {})
66
-
67
- # TODO: Form proper language
68
-
69
  make_text_area(
70
- label="Technical limitations, annotation noise, etc.",
 
71
  key_list=key_pref + ["data-technical-limitations"],
72
  help="",
73
  )
74
-
75
  make_text_area(
76
- label="Particularly unsuited for applications",
 
77
  key_list=key_pref + ["data-unsuited-applications"],
78
- help="",
79
  )
80
-
81
  make_text_area(
82
- label="What are discouraged use cases of the dataset?",
 
83
  key_list=key_pref + ["data-discouraged-use"],
84
- help="",
85
- )
86
-
87
- make_text_area(
88
- label="Citation of work identifying these limitations",
89
- key_list=key_pref + ["data-citations-limitations"],
90
- help="",
91
  )
92
 
93
 
@@ -104,5 +105,5 @@ def considerations_summary():
104
  )
105
  completion_markdown += f"- **Sub-section - PII Risks and Liability:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
106
  completion_markdown += f"- **Sub-section - Licenses:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('licenses', {}))} of {N_FIELDS_LICENSES} fields\n"
107
- completion_markdown += f"- **Sub-section - Known limitations:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('limitations', {}))} of {N_FIELDS_LIMITATIONS} fields\n"
108
  st.markdown(completion_markdown)
 
9
  )
10
 
11
  N_FIELDS_PII = 1
12
+ N_FIELDS_LICENSES = 2
13
+ N_FIELDS_LIMITATIONS = 3
14
 
15
  N_FIELDS = N_FIELDS_PII + N_FIELDS_LICENSES + N_FIELDS_LIMITATIONS
16
 
 
25
  "considerations"
26
  ].get("pii", {})
27
  make_text_area(
28
+ label="Considering your answers to the PII part of the Data Curation Section, describe any potential privacy risks to the data subjects and creators when using the dataset.",
29
  key_list=key_pref+["risks-description"],
30
  help="In terms for example of having models memorize private information of data subjects or other breaches of privacy."
31
  )
 
37
  ].get("licenses", {})
38
 
39
  make_multiselect(
40
+ label="Based on your answers in the Intended Use part of the Data Overview Section, which of the following best describe the copyright and licensing status of the dataset?",
41
  options=[
42
  "public domain",
43
  "multiple licenses",
 
52
  help="Does the license restrict how the dataset can be used?",
53
  )
54
  make_multiselect(
55
+ label="Based on your answers in the Language part of the Data Curation Section, which of the following best describe the copyright and licensing status of the underlying language data?",
56
+ options=[
57
+ "public domain",
58
+ "multiple licenses",
59
+ "copyright - all rights reserved",
60
+ "open license - commercial use allowed",
61
+ "research use only",
62
+ "non-commercial use only",
63
+ "do not distribute",
64
+ "other",
65
+ ],
66
  key_list=key_pref + ["data-copyright"],
67
+ help="For example if the dataset uses data from Wikipedia, we are asking about the status of Wikipedia text in general.",
68
  )
69
 
70
+ with st.expander("Known Technical Limitations", expanded=False):
71
  key_pref = ["considerations", "limitations"]
72
  st.session_state.card_dict["considerations"]["limitations"] = st.session_state.card_dict[
73
  "considerations"
74
  ].get("limitations", {})
 
 
 
75
  make_text_area(
76
+ label="Describe any known technical limitations, such as spurious correlations, train/test overlap, annotation biases, or mis-annotations. " + \
77
+ "Describe them and cite the works that first identified these limitations when possible.",
78
  key_list=key_pref + ["data-technical-limitations"],
79
  help="",
80
  )
 
81
  make_text_area(
82
+ label="When using a model trained on this dataset in a setting where users or the public may interact with its predictions, what are some pitfalls to look out for? " + \
83
+ "In particular, describe some applications of the general task featured in this dataset that its curation or properties make it less suitable for.",
84
  key_list=key_pref + ["data-unsuited-applications"],
85
+ help="For example, outline language varieties or domains that the model might underperform for.",
86
  )
 
87
  make_text_area(
88
+ label="What are some discouraged use cases of a model trained to maximize the proposed metrics on this dataset? " +
89
+ "In particular, think about settings where decisions made by a model that performs reasonably well on the metric may still have strong negative consequences for users or members of the public.",
90
  key_list=key_pref + ["data-discouraged-use"],
91
+ help="For example, think about application settings where certain types of mistakes (such as missing a negation) might have a particularly strong negative impact but are not particularly singled out by the aggregated evaluation.",
 
 
 
 
 
 
92
  )
93
 
94
 
 
105
  )
106
  completion_markdown += f"- **Sub-section - PII Risks and Liability:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
107
  completion_markdown += f"- **Sub-section - Licenses:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('licenses', {}))} of {N_FIELDS_LICENSES} fields\n"
108
+ completion_markdown += f"- **Sub-section - Known Technical Limitations:**\n - {len(st.session_state.card_dict.get('considerations', {}).get('limitations', {}))} of {N_FIELDS_LIMITATIONS} fields\n"
109
  st.markdown(completion_markdown)