# Streamlit form for the "Dataset Overview" section of a GEM data card.
import json | |
import streamlit as st | |
from os.path import join as pjoin | |
from .streamlit_utils import ( | |
make_multiselect, | |
make_selectbox, | |
make_text_area, | |
make_text_input, | |
make_radio, | |
) | |
# Number of questions per sub-section; the totals drive the completion summary.
N_FIELDS_WHERE = 9
N_FIELDS_LANGUAGES = 8
N_FIELDS_CREDIT = 5
N_FIELDS_STRUCTURE = 7
N_FIELDS = N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE

# BCP-47 language subtags, used to populate the language multiselect.
# FIX: open the resource files with context managers so the handles are
# closed deterministically (originals used bare ``json.load(open(...))``,
# which leaks the file objects).
with open(pjoin("resources", "bcp47.json"), encoding="utf-8") as _bcp47_file:
    languages_bcp47 = [
        subtag
        for subtag in json.load(_bcp47_file)["subtags"]
        if subtag["type"] == "language"
    ]

# Flat list of license identifiers for the license selectbox.
with open(pjoin("resources", "licenses.json"), encoding="utf-8") as _license_file:
    license_list = json.load(_license_file)
def overview_page():
    """Render the "Dataset Overview" questionnaire of the data card.

    Each widget writes its answer into
    ``st.session_state.card_dict["overview"][<sub-section>][<field>]``.
    Every sub-section dict is created on first visit so the nested
    assignments below never raise ``KeyError`` across Streamlit reruns.
    """
    st.session_state.card_dict["overview"] = st.session_state.card_dict.get(
        "overview", {}
    )
    # FIX: this expander's label was a copy-paste duplicate of the "where"
    # expander below; it actually holds the one-paragraph dataset summary.
    with st.expander("What is this dataset?", expanded=True):
        key_pref = ["overview", "what"]
        st.session_state.card_dict["overview"]["what"] = st.session_state.card_dict[
            "overview"
        ].get("what", {})
        make_text_area(
            label="Provide a summary of this dataset in 3-4 sentences.",
            key_list=key_pref + ["dataset"],
            help="[free text]",
        )
    with st.expander("Where to find the data and its documentation", expanded=False):
        key_pref = ["overview", "where"]
        st.session_state.card_dict["overview"]["where"] = st.session_state.card_dict[
            "overview"
        ].get("where", {})
        make_text_input(
            label="What is the webpage for the dataset (if it exists)?",
            key_list=key_pref + ["website"],
            help="[URL]",
        )
        make_text_input(
            label="What is the link to where the original dataset is hosted?",
            key_list=key_pref + ["data-url"],
            help="[URL]",
        )
        make_text_input(
            label="What is the link to the paper describing the dataset (open access preferred)?",
            key_list=key_pref + ["paper-url"],
            help="[URL]",
        )
        make_text_area(
            label="Provide the BibTex-formatted reference for the dataset. Please use the correct published version (ACL anthology, etc.) instead of google scholar created Bibtex.",
            key_list=key_pref + ["paper-bibtext"],
            help="[free text]",
        )
        make_radio(
            label="Does the dataset have an active leaderboard?",
            options=["no", "yes"],
            key_list=key_pref + ["has-leaderboard"],
            help="If no, enter N/A for the following two fields",
        )
        # Only ask the leaderboard follow-ups when one exists; otherwise
        # auto-fill them so the card still counts as complete.
        if st.session_state.card_dict["overview"]["where"]["has-leaderboard"] == "yes":
            make_text_input(
                label="Provide a link to the leaderboard.",
                key_list=key_pref + ["leaderboard-url"],
                help="[URL] or N/A",
            )
            make_text_area(
                label="Briefly describe how the leaderboard evaluates models.",
                key_list=key_pref + ["leaderboard-description"],
                help="[free text; a paragraph] or N/A",
            )
        else:
            st.session_state.card_dict["overview"]["where"]["leaderboard-url"] = "N/A"
            st.session_state.card_dict["overview"]["where"][
                "leaderboard-description"
            ] = "N/A"
        make_text_input(
            label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
            key_list=key_pref + ["contact-name"],
            help="[free text]",
        )
        make_text_input(
            label="If known, provide the email of at least one person the reader can contact for questions about the dataset.",
            key_list=key_pref + ["contact-email"],
            help="[free text]",
        )
    with st.expander("Languages and Intended Use", expanded=False):
        key_pref = ["overview", "languages"]
        st.session_state.card_dict["overview"][
            "languages"
        ] = st.session_state.card_dict["overview"].get("languages", {})
        make_radio(
            label="Is the dataset multilingual?",
            options=["no", "yes"],
            key_list=key_pref + ["is-multilingual"],
            help="More than one language present in all of the text fields",
        )
        make_multiselect(
            label="What languages/dialects are covered in the dataset?",
            key_list=key_pref + ["language-names"],
            options=[", ".join(x["description"]) for x in languages_bcp47],
            help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
        )
        make_text_area(
            label="What dialects are covered? Are there multiple dialects per language?",
            key_list=key_pref + ["language-dialects"],
            help="[free text, paragraphs] - Describe the dialect(s) as appropriate.",
        )
        make_text_area(
            label="Whose language is in the dataset?",
            key_list=key_pref + ["language-speakers"],
            help="[free text, paragraphs] - Provide locally appropriate demographic information about the language producers, if available. Use ranges where reasonable in order to protect individuals’ privacy.",
        )
        make_text_area(
            label="What is the intended use of the dataset?",
            key_list=key_pref + ["intended-use"],
            help="[free text, paragraphs] - Describe how the dataset creators describe its purpose and intended use.",
        )
        make_selectbox(
            label="What is the license of the dataset?",
            key_list=key_pref + ["license"],
            options=license_list,
            # FIX: typo "unkown" -> "unknown" in user-facing help text.
            help="select `other` if missing from list, `unknown` if not provided.",
        )
        # NOTE(review): substring test — any selected license whose name
        # contains "other" triggers the follow-up question.
        if "other" in st.session_state.card_dict["overview"]["languages"].get(
            "license", []
        ):
            make_text_input(
                label="What is the 'other' license of the dataset?",
                key_list=key_pref + ["license-other"],
                help="[free text]",
            )
        else:
            st.session_state.card_dict["overview"]["languages"]["license-other"] = "N/A"
        make_selectbox(
            label="What primary task does the dataset support?",
            key_list=key_pref + ["task"],
            options=[
                "",  # default needs to be invalid value to make sure people actually fill in
                "Content Transfer",
                "Data-to-Text",
                "Dialog Response Generation",
                "Paraphrasing",
                "Question Generation",
                "Reasoning",
                "Simplification",
                "Style Transfer",
                "Summarization",
                "Text-to-Slide",
                "Other",
            ],
            help="Select `other` if the task is not included in the list.",
        )
        if "Other" in st.session_state.card_dict["overview"]["languages"].get(
            "task", []
        ):
            make_text_input(
                label="What is the primary task?",
                key_list=key_pref + ["task-other"],
                help="[free text]",
            )
        else:
            st.session_state.card_dict["overview"]["languages"]["task-other"] = "N/A"
        make_text_area(
            label="Provide a short description of the communicative goal of a model trained for this task on this dataset.",
            key_list=key_pref + ["communicative"],
            help="[free text, a paragraph] (e.g., describe a restaurant from a structured representation of its attributes)",
        )
    with st.expander("Credit", expanded=False):
        key_pref = ["overview", "credit"]
        # BUG FIX: the original read ``st.session_state.card_dict.get("credit", {})``,
        # i.e. it looked up "credit" at the TOP level of card_dict (always absent),
        # which reset this sub-section to {} on every rerun and discarded any
        # previously entered answers. Read it from the "overview" dict like the
        # sibling sub-sections do.
        st.session_state.card_dict["overview"]["credit"] = st.session_state.card_dict[
            "overview"
        ].get("credit", {})
        make_multiselect(
            label="In what kind of organization did the dataset curation happen?",
            options=["industry", "academic", "independent", "other"],
            key_list=key_pref + ["organization-type"],
        )
        make_text_input(
            label="Name the organization(s).",
            key_list=key_pref + ["organization-names"],
            help="comma-separated",
        )
        make_text_input(
            label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
            key_list=key_pref + ["creators"],
            help="name (affiliation); comma-separated",
        )
        make_text_input(
            label="Who funded the data creation?",
            key_list=key_pref + ["funding"],
            # FIX: typo "unkown" -> "unknown" in user-facing help text.
            help="[free text] enter N/A if unknown",
        )
        make_text_input(
            label="Who contributed to the data card and adding the dataset to GEM? List the people+affiliations involved in creating this data card and who helped integrate this dataset into GEM.",
            key_list=key_pref + ["gem-added-by"],
            help="name (affiliation); comma-separated",
        )
    with st.expander("Structure", expanded=False):
        key_pref = ["overview", "structure"]
        st.session_state.card_dict["overview"][
            "structure"
        ] = st.session_state.card_dict["overview"].get("structure", {})
        data_fields_help = """
[free text; paragraphs]
- Mention their data type, and whether and how they are used as part of the generation pipeline.
- Describe each fields' attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc.
- If the datasets contain example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.
"""
        make_text_area(
            label="List and describe the fields present in the dataset.",
            key_list=key_pref + ["data-fields"],
            help=data_fields_help,
        )
        make_text_area(
            label="How was the dataset structure determined?",
            key_list=key_pref + ["structure-description"],
            help="[free text; paragraph]",
        )
        make_text_area(
            label="How were the labels chosen?",
            key_list=key_pref + ["structure-labels"],
            help="[free text; paragraph]",
        )
        make_text_area(
            label="Provide a JSON formatted example of a typical instance in the dataset.",
            key_list=key_pref + ["structure-example"],
            help="[JSON]",
        )
        make_text_area(
            label="Describe and name the splits in the dataset if there are more than one.",
            key_list=key_pref + ["structure-splits"],
            help="[free text, paragraphs] - As appropriate, provide any descriptive statistics for the features, such as size, average lengths of input and output.",
        )
        make_text_area(
            label="Describe any criteria for splitting the data, if used. If there are differences between the splits (e.g., if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.",
            key_list=key_pref + ["structure-splits-criteria"],
            help="[free text, paragraphs]",
        )
        make_text_area(
            label="What does an outlier of the dataset in terms of length/perplexity/embedding look like?",
            key_list=key_pref + ["structure-outlier"],
            help="[free text + json formatted text/file for an example]",
        )
def overview_summary():
    """Render a collapsed expander reporting how many overview fields are filled."""
    overview = st.session_state.card_dict.get("overview", {})
    total_filled = sum(len(subsection) for subsection in overview.values())
    # (display title, sub-section key, expected field count) per sub-section.
    section_specs = [
        ("Where to find", "where", N_FIELDS_WHERE),
        ("Languages and Intended Use", "languages", N_FIELDS_LANGUAGES),
        ("Credit", "credit", N_FIELDS_CREDIT),
        ("Structure", "structure", N_FIELDS_STRUCTURE),
    ]
    with st.expander(
        f"Dataset Overview Completion - {total_filled} of {N_FIELDS}", expanded=False
    ):
        report = [
            f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
        ]
        for title, key, n_fields in section_specs:
            filled = len(overview.get(key, {}))
            report.append(
                f"- **Sub-section - {title}:**\n - {filled} of {n_fields} fields\n"
            )
        st.markdown("".join(report))