import json
from os.path import join as pjoin

import streamlit as st

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

N_FIELDS_WHERE = 9
N_FIELDS_LANGUAGES = 8
N_FIELDS_CREDIT = 5
N_FIELDS_STRUCTURE = 7

N_FIELDS = N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE

# Language subtags from the BCP-47 registry, used to populate the language multiselect.
languages_bcp47 = [
    x
    for x in json.load(open(pjoin("resources", "bcp47.json"), encoding="utf-8"))[
        "subtags"
    ]
    if x["type"] == "language"
]

license_list = json.load(open(pjoin("resources", "licenses.json"), encoding="utf-8"))


def overview_page():
    """Render the Dataset Overview section of the data card form."""
    st.session_state.card_dict["overview"] = st.session_state.card_dict.get(
        "overview", {}
    )
    with st.expander("What is this dataset?", expanded=True):
        key_pref = ["overview", "what"]
        st.session_state.card_dict["overview"]["what"] = st.session_state.card_dict[
            "overview"
        ].get("what", {})
        make_text_area(
            label="Provide a summary of this dataset in 3-4 sentences.",
            key_list=key_pref + ["dataset"],
            help="[free text]",
        )
    with st.expander("Where to find the data and its documentation", expanded=False):
        key_pref = ["overview", "where"]
        st.session_state.card_dict["overview"]["where"] = st.session_state.card_dict[
            "overview"
        ].get("where", {})
        make_text_input(
            label="What is the webpage for the dataset (if it exists)?",
            key_list=key_pref + ["website"],
            help="[URL]",
        )
        make_text_input(
            label="What is the link to where the original dataset is hosted?",
            key_list=key_pref + ["data-url"],
            help="[URL]",
        )
        make_text_input(
            label="What is the link to the paper describing the dataset (open access preferred)?",
            key_list=key_pref + ["paper-url"],
            help="[URL]",
        )
        make_text_area(
            label="Provide the BibTeX-formatted reference for the dataset. Please use the correct published version (ACL Anthology, etc.) instead of a Google Scholar-generated BibTeX entry.",
            key_list=key_pref + ["paper-bibtext"],
            help="[free text]",
        )
        make_radio(
            label="Does the dataset have an active leaderboard?",
            options=["no", "yes"],
            key_list=key_pref + ["has-leaderboard"],
            help="If no, the following two fields are set to N/A automatically.",
        )
        if st.session_state.card_dict["overview"]["where"]["has-leaderboard"] == "yes":
            make_text_input(
                label="Provide a link to the leaderboard.",
                key_list=key_pref + ["leaderboard-url"],
                help="[URL] or N/A",
            )
            make_text_area(
                label="Briefly describe how the leaderboard evaluates models.",
                key_list=key_pref + ["leaderboard-description"],
                help="[free text; a paragraph] or N/A",
            )
        else:
            st.session_state.card_dict["overview"]["where"]["leaderboard-url"] = "N/A"
            st.session_state.card_dict["overview"]["where"][
                "leaderboard-description"
            ] = "N/A"
        make_text_input(
            label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
            key_list=key_pref + ["contact-name"],
            help="[free text]",
        )
        make_text_input(
            label="If known, provide the email of at least one person the reader can contact for questions about the dataset.",
            key_list=key_pref + ["contact-email"],
            help="[free text]",
        )
    with st.expander("Languages and Intended Use", expanded=False):
        key_pref = ["overview", "languages"]
        st.session_state.card_dict["overview"][
            "languages"
        ] = st.session_state.card_dict["overview"].get("languages", {})
        make_radio(
            label="Is the dataset multilingual?",
            options=["no", "yes"],
            key_list=key_pref + ["is-multilingual"],
            help="More than one language present in all of the text fields",
        )
        make_multiselect(
            label="What languages/dialects are covered in the dataset?",
            key_list=key_pref + ["language-names"],
            options=[", ".join(x["description"]) for x in languages_bcp47],
            help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
        )
        make_text_area(
            label="What dialects are covered? Are there multiple dialects per language?",
            key_list=key_pref + ["language-dialects"],
            help="[free text, paragraphs] - Describe the dialect(s) as appropriate.",
        )
        make_text_area(
            label="Whose language is in the dataset?",
            key_list=key_pref + ["language-speakers"],
            help="[free text, paragraphs] - Provide locally appropriate demographic information about the language producers, if available. Use ranges where reasonable in order to protect individuals’ privacy.",
        )
        make_text_area(
            label="What is the intended use of the dataset?",
            key_list=key_pref + ["intended-use"],
            help="[free text, paragraphs] - Describe how the dataset creators describe its purpose and intended use.",
        )
        make_selectbox(
            label="What is the license of the dataset?",
            key_list=key_pref + ["license"],
            options=license_list,
            help="Select `other` if the license is missing from the list, `unknown` if it is not provided.",
        )
        if "other" in st.session_state.card_dict["overview"]["languages"].get(
            "license", []
        ):
            make_text_input(
                label="What is the 'other' license of the dataset?",
                key_list=key_pref + ["license-other"],
                help="[free text]",
            )
        else:
            st.session_state.card_dict["overview"]["languages"]["license-other"] = "N/A"
        make_selectbox(
            label="What primary task does the dataset support?",
            key_list=key_pref + ["task"],
            options=[
                "",  # the default needs to be an invalid value to make sure people actually fill this in
                "Content Transfer",
                "Data-to-Text",
                "Dialog Response Generation",
                "Paraphrasing",
                "Question Generation",
                "Reasoning",
                "Simplification",
                "Style Transfer",
                "Summarization",
                "Text-to-Slide",
                "Other",
            ],
            help="Select `Other` if the task is not included in the list.",
        )
        if "Other" in st.session_state.card_dict["overview"]["languages"].get(
            "task", []
        ):
            make_text_input(
                label="What is the primary task?",
                key_list=key_pref + ["task-other"],
                help="[free text]",
            )
        else:
            st.session_state.card_dict["overview"]["languages"]["task-other"] = "N/A"
        make_text_area(
            label="Provide a short description of the communicative goal of a model trained for this task on this dataset.",
            key_list=key_pref + ["communicative"],
            help="[free text, a paragraph] (e.g., describe a restaurant from a structured representation of its attributes)",
        )
    with st.expander("Credit", expanded=False):
        key_pref = ["overview", "credit"]
        st.session_state.card_dict["overview"][
            "credit"
        ] = st.session_state.card_dict["overview"].get("credit", {})
        make_multiselect(
            label="In what kind of organization did the dataset curation happen?",
            options=["industry", "academic", "independent", "other"],
            key_list=key_pref + ["organization-type"],
        )
        make_text_input(
            label="Name the organization(s).",
            key_list=key_pref + ["organization-names"],
            help="comma-separated",
        )
        make_text_input(
            label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
            key_list=key_pref + ["creators"],
            help="name (affiliation); comma-separated",
        )
        make_text_input(
            label="Who funded the data creation?",
            key_list=key_pref + ["funding"],
            help="[free text] enter N/A if unknown",
        )
        make_text_input(
            label="Who contributed to the data card and to adding the dataset to GEM? List the people and affiliations involved in creating this data card and who helped integrate this dataset into GEM.",
            key_list=key_pref + ["gem-added-by"],
            help="name (affiliation); comma-separated",
        )
    with st.expander("Structure", expanded=False):
        key_pref = ["overview", "structure"]
        st.session_state.card_dict["overview"]["structure"] = st.session_state.card_dict[
            "overview"
        ].get("structure", {})
        data_fields_help = """
        [free text; paragraphs]
        - Mention their data type, and whether and how they are used as part of the generation pipeline.
        - Describe each field's attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc.
        - If the dataset contains example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.
        """
        make_text_area(
            label="List and describe the fields present in the dataset.",
            key_list=key_pref + ["data-fields"],
            help=data_fields_help,
        )
        make_text_area(
            label="How was the dataset structure determined?",
            key_list=key_pref + ["structure-description"],
            help="[free text; paragraph]",
        )
        make_text_area(
            label="How were the labels chosen?",
            key_list=key_pref + ["structure-labels"],
            help="[free text; paragraph]",
        )
        make_text_area(
            label="Provide a JSON-formatted example of a typical instance in the dataset.",
            key_list=key_pref + ["structure-example"],
            help="[JSON]",
        )
        make_text_area(
            label="Describe and name the splits in the dataset if there is more than one.",
            key_list=key_pref + ["structure-splits"],
            help="[free text, paragraphs] - As appropriate, provide any descriptive statistics for the features, such as size and average lengths of input and output.",
        )
        make_text_area(
            label="Describe any criteria for splitting the data, if used. If there are differences between the splits (e.g., if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.",
            key_list=key_pref + ["structure-splits-criteria"],
            help="[free text, paragraphs]",
        )
        make_text_area(
            label="What does an outlier of the dataset look like in terms of length/perplexity/embedding?",
            key_list=key_pref + ["structure-outlier"],
            help="[free text + JSON-formatted text/file for an example]",
        )


def overview_summary():
    """Show how many Dataset Overview fields have been filled in so far."""
    total_filled = sum(
        [len(dct) for dct in st.session_state.card_dict.get("overview", {}).values()]
    )
    with st.expander(
        f"Dataset Overview Completion - {total_filled} of {N_FIELDS}", expanded=False
    ):
        completion_markdown = ""
        completion_markdown += (
            f"- **Overall completion:**\n  - {total_filled} of {N_FIELDS} fields\n"
        )
        completion_markdown += f"- **Sub-section - Where to find:**\n  - {len(st.session_state.card_dict.get('overview', {}).get('where', {}))} of {N_FIELDS_WHERE} fields\n"
        completion_markdown += f"- **Sub-section - Languages and Intended Use:**\n  - {len(st.session_state.card_dict.get('overview', {}).get('languages', {}))} of {N_FIELDS_LANGUAGES} fields\n"
        completion_markdown += f"- **Sub-section - Credit:**\n  - {len(st.session_state.card_dict.get('overview', {}).get('credit', {}))} of {N_FIELDS_CREDIT} fields\n"
        completion_markdown += f"- **Sub-section - Structure:**\n  - {len(st.session_state.card_dict.get('overview', {}).get('structure', {}))} of {N_FIELDS_STRUCTURE} fields\n"
        st.markdown(completion_markdown)