Spaces:

GEM
/

DatasetCardForm

Runtime error

App Files Files Community

Yacine Jernite committed on Nov 16, 2021

Commit

dd1054a

1 Parent(s): 05d58bc

considerations

Browse files

Files changed (1) hide show

datacards/considerations.py +26 -25

datacards/considerations.py CHANGED Viewed

@@ -9,8 +9,8 @@ from .streamlit_utils import (
 )
 N_FIELDS_PII = 1
-N_FIELDS_LICENSES = 3
-N_FIELDS_LIMITATIONS = 4
 N_FIELDS = N_FIELDS_PII + N_FIELDS_LICENSES + N_FIELDS_LIMITATIONS
@@ -25,7 +25,7 @@ def considerations_page():
             "considerations"
         ].get("pii", {})
         make_text_area(
-            label="Considering your answers to the PII part of the Data Curation Section, describe any potential privacy risks of using the data.",
             key_list=key_pref+["risks-description"],
             help="In terms for example of having models memorize private information of data subjects or other breaches of privacy."
         )
@@ -37,7 +37,7 @@ def considerations_page():
         ].get("licenses", {})
         make_multiselect(
-            label="Are there restrictions on the dataset use?",
             options=[
                 "public domain",
                 "multiple licenses",
@@ -52,42 +52,43 @@ def considerations_page():
             help="Does the license restrict how the dataset can be used?",
         )
         make_multiselect(
-            label="Are there restrictions on the underlying data?",
-            options=["Open", "Non-Commercial", "Copyrighted", "Other"],
             key_list=key_pref + ["data-copyright"],
-            help="Are there restructions on the underlying data?",
         )
-    with st.expander("Known limitations", expanded=False):
         key_pref = ["considerations", "limitations"]
         st.session_state.card_dict["considerations"]["limitations"] = st.session_state.card_dict[
             "considerations"
         ].get("limitations", {})
-        # TODO: Form proper language
         make_text_area(
-            label="Technical limitations, annotation noise, etc.",
             key_list=key_pref + ["data-technical-limitations"],
             help="",
         )
         make_text_area(
-            label="Particularly unsuited for applications",
             key_list=key_pref + ["data-unsuited-applications"],
-            help="",
         )
         make_text_area(
-            label="What are discouraged use cases of the dataset?",
             key_list=key_pref + ["data-discouraged-use"],
-            help="",
-        )
-        make_text_area(
-            label="Citation of work identifying these limitations",
-            key_list=key_pref + ["data-citations-limitations"],
-            help="",
         )
@@ -104,5 +105,5 @@ def considerations_summary():
         )
         completion_markdown += f"- **Sub-section - PII Risks and Liability:**\n  - {len(st.session_state.card_dict.get('considerations', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
         completion_markdown += f"- **Sub-section - Licenses:**\n  - {len(st.session_state.card_dict.get('considerations', {}).get('licenses', {}))} of {N_FIELDS_LICENSES} fields\n"
-        completion_markdown += f"- **Sub-section - Known limitations:**\n  - {len(st.session_state.card_dict.get('considerations', {}).get('limitations', {}))} of {N_FIELDS_LIMITATIONS} fields\n"
         st.markdown(completion_markdown)

 )
 N_FIELDS_PII = 1
+N_FIELDS_LICENSES = 2
+N_FIELDS_LIMITATIONS = 3
 N_FIELDS = N_FIELDS_PII + N_FIELDS_LICENSES + N_FIELDS_LIMITATIONS
             "considerations"
         ].get("pii", {})
         make_text_area(
+            label="Considering your answers to the PII part of the Data Curation Section, describe any potential privacy to the data subjects and creators risks when using the dataset.",
             key_list=key_pref+["risks-description"],
             help="In terms for example of having models memorize private information of data subjects or other breaches of privacy."
         )
         ].get("licenses", {})
         make_multiselect(
+            label="Based on your answers in the Intended Use part of the Data Overview Section, which of the following best describe the copyright and licensing status of the dataset?",
             options=[
                 "public domain",
                 "multiple licenses",
             help="Does the license restrict how the dataset can be used?",
         )
         make_multiselect(
+            label="Based on your answers in the Language part of the Data Curation Section, which of the following best describe the copyright and licensing status of the underlying language data?",
+            options=[
+                "public domain",
+                "multiple licenses",
+                "copyright - all rights reserved",
+                "open license - commercial use allowed",
+                "research use only",
+                "non-commercial use only",
+                "do not distribute",
+                "other",
+            ],
             key_list=key_pref + ["data-copyright"],
+            help="For example if the dataset uses data from Wikipedia, we are asking about the status of Wikipedia text in general.",
         )
+    with st.expander("Known Technical Limitations", expanded=False):
         key_pref = ["considerations", "limitations"]
         st.session_state.card_dict["considerations"]["limitations"] = st.session_state.card_dict[
             "considerations"
         ].get("limitations", {})
         make_text_area(
+            label="Describe any known technical limitations, such as spurrious correlations, train/test overlap, annotation biases, or mis-annotations? " + \
+            "Describe them and cite the works that first identified these limitations when possible.",
             key_list=key_pref + ["data-technical-limitations"],
             help="",
         )
         make_text_area(
+            label="When using a model trained on this dataset in a setting where users or the public may interact with its predictions, what are some pitfalls to look out for? " + \
+            "In particular, describe some applications of the general task featured in this dataset that its curation or properties make it less suitable for.",
             key_list=key_pref + ["data-unsuited-applications"],
+            help="For example, outline language varieties or domains that the model might underperform for.",
         )
         make_text_area(
+            label="What are some discouraged use cases of a model trained to maximize the proposed metrics on this dataset? " +
+            "In particular, think about settings where decisions made by a model that performs reasonably well on the metric my still have strong negative consequences for user or members of the public.",
             key_list=key_pref + ["data-discouraged-use"],
+            help="For example, think about application settings where certain types of mistakes (such as missing a negation) might have a particularly strong negative impact but are not particularly singled out by the aggregated evaluation.",
         )
         )
         completion_markdown += f"- **Sub-section - PII Risks and Liability:**\n  - {len(st.session_state.card_dict.get('considerations', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
         completion_markdown += f"- **Sub-section - Licenses:**\n  - {len(st.session_state.card_dict.get('considerations', {}).get('licenses', {}))} of {N_FIELDS_LICENSES} fields\n"
+        completion_markdown += f"- **Sub-section - Known Technical Limitations:**\n  - {len(st.session_state.card_dict.get('considerations', {}).get('limitations', {}))} of {N_FIELDS_LIMITATIONS} fields\n"
         st.markdown(completion_markdown)