Spaces:

GEM
/

DatasetCardForm

Running

App Files Files Community

DatasetCardForm / datacards /overview.py

Sebastian Gehrmann

Continue

b03f385 almost 3 years ago

raw

history blame

No virus

9.71 kB

	import json
	import streamlit as st

	from os.path import join as pjoin

	from .streamlit_utils import (
	make_multiselect,
	make_selectbox,
	make_text_area,
	make_text_input,
	make_radio,
	)

	# Number of questions asked in each expander of the overview page.
	N_FIELDS_WHERE = 9
	N_FIELDS_LANGUAGES = 6
	N_FIELDS_CREDIT = 3
	N_FIELDS_STRUCTURE = 7

	# Total number of questions across all overview sub-sections.
	N_FIELDS = sum(
	    (
	        N_FIELDS_WHERE,
	        N_FIELDS_LANGUAGES,
	        N_FIELDS_CREDIT,
	        N_FIELDS_STRUCTURE,
	    )
	)


	# Entries of the BCP-47 subtag registry whose "type" is "language"; each
	# entry is used later through its "description" list.
	# The file is opened in a `with` block so the handle is closed as soon as
	# the JSON has been parsed instead of being left to the garbage collector.
	with open(pjoin("resources", "bcp47.json"), encoding="utf-8") as _bcp47_file:
	    languages_bcp47 = [
	        x
	        for x in json.load(_bcp47_file)["subtags"]
	        if x["type"] == "language"
	    ]

	# Options offered by the license select box on the overview page.
	with open(pjoin("resources", "licenses.json"), encoding="utf-8") as _licenses_file:
	    license_list = json.load(_licenses_file)

	def overview_page():
	    """Render the "overview" part of the dataset card form.

	    Four collapsed expanders are drawn ("where", "languages", "credit",
	    "structure"). Every widget is created through a ``make_*`` helper and
	    is addressed by the key path ``["overview", <sub-section>, <field>]``.
	    How the helpers store the answers is defined in ``streamlit_utils``;
	    presumably they write into ``st.session_state.card_dict`` along that
	    path -- confirm there.

	    The sub-section dictionaries are created on first use and otherwise
	    left untouched, so answers already present under
	    ``card_dict["overview"]`` are kept across calls.
	    """
	    overview = st.session_state.card_dict.setdefault("overview", {})
	    with st.expander("Where to find the data and its documentation", expanded=False):
	        key_pref = ["overview", "where"]
	        overview.setdefault("where", {})
	        make_text_input(
	            label="What is the webpage for the dataset (if it exists)?",
	            key_list=key_pref + ["website"],
	            help="[URL]",
	        )
	        make_text_input(
	            label="What is the link to where the original dataset is hosted?",
	            key_list=key_pref + ["data-url"],
	            help="[URL]",
	        )
	        make_text_input(
	            label="What is the link to the paper describing the dataset (open access preferred)?",
	            key_list=key_pref + ["paper-url"],
	            help="[URL]",
	        )
	        make_text_area(
	            label="Provide the BibTex-formatted reference for the dataset.",
	            key_list=key_pref + ["paper-bibtext"],
	            help="[free text]",
	        )
	        make_radio(
	            label="Does the dataset have an active leaderboard?",
	            options=["no", "yes"],
	            key_list=key_pref + ["has-leaderboard"],
	            help="If no, enter N/A for the following two fields",
	        )
	        make_text_input(
	            label="Provide a link to the leaderboard if it exists. Otherwise, enter N/A.",
	            key_list=key_pref + ["leaderboard-url"],
	            help="[URL] or N/A",
	        )
	        make_text_area(
	            label="Briefly describe how the leaderboard evaluates models if it exists. Otherwise, enter N/A.",
	            key_list=key_pref + ["leaderboard-description"],
	            help="[free text; a paragraph] or N/A",
	        )
	        make_text_input(
	            label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
	            key_list=key_pref + ["contact-name"],
	            help="[free text]",
	        )
	        make_text_input(
	            label="If known, provide the email of at least one person the reader can contact for questions about the dataset.",
	            key_list=key_pref + ["contact-email"],
	            help="[free text]",
	        )
	    with st.expander("Languages and Intended Use", expanded=False):
	        key_pref = ["overview", "languages"]
	        overview.setdefault("languages", {})
	        make_radio(
	            label="Is the dataset multilingual?",
	            options=["no", "yes"],
	            key_list=key_pref + ["is-multilingual"],
	            help="More than one language present in all of the text fields",
	        )
	        make_multiselect(
	            label="What languages/dialects are covered in the dataset?",
	            key_list=key_pref + ["language-names"],
	            # One option per registry entry: its descriptions joined by ", ".
	            options=[
	                ", ".join(x["description"]) for x in languages_bcp47
	            ],
	            help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
	        )
	        make_text_area(
	            label="What is the intended use of the dataset?",
	            key_list=key_pref + ["intended-use"],
	            help="[free text, paragraphs]",
	        )
	        make_selectbox(
	            label="What is the license of the dataset?",
	            key_list=key_pref + ["license"],
	            options=license_list,
	            help="select `other` if missing from list, `unkown` if not provided.",
	        )
	        make_selectbox(
	            label="What primary task does the dataset support?",
	            key_list=key_pref + ["task"],
	            options=["Simplification", "Summarization", "Paraphrasing", "Dialog",
	                     "Data-to-Text", "Style Transfer", "Text-to-Slide",
	                     "Question Generation", "Reasoning", "Content Transfer"],
	            help="Select `other` if the task is not included in the list.",
	        )
	        make_text_area(
	            label="Provide a short description of the communicative goal of a model trained for this task on this dataset.",
	            key_list=key_pref + ["communicative"],
	            help="[free text, a paragraph] (e.g., describe a restaurant from a structured representation of its attributes)",
	        )
	    with st.expander("Credit", expanded=False):
	        key_pref = ["overview", "credit"]
	        # Must be looked up on the overview dict (not on card_dict itself);
	        # otherwise the stored sub-dict is replaced on every call.
	        overview.setdefault("credit", {})
	        make_text_input(
	            label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
	            key_list=key_pref + ["creators"],
	            help="name (affiliation); comma-separated",
	        )
	        make_text_input(
	            label="Who funded the data creation?",
	            key_list=key_pref + ["funding"],
	            help="[free text] enter N/A if unkown",
	        )
	        make_text_input(
	            label="Who contributed to the data card and adding the dataset to GEM? List the people+affiliations involved in creating this data card and who helped integrate this dataset into GEM.",
	            key_list=key_pref + ["gem-added-by"],
	            help="name (affiliation); comma-separated",
	        )
	    with st.expander("Structure", expanded=False):
	        key_pref = ["overview", "structure"]
	        # Same as for "credit": initialise on the overview dict.
	        overview.setdefault("structure", {})
	        # The literal's lines stay at the file's left margin so that no
	        # extra leading whitespace ends up in the help text.
	        data_fields_help = """
	[free text; paragraphs]
	- Mention their data type, and whether and how they are used as part of the generation pipeline.
	- Describe each fields' attributes, such as whether they are at the character level or word level, whether they are contiguous or not, etc.
	- If the datasets contain example IDs, state whether they have an inherent meaning, such as a mapping to other datasets or pointing to relationships between data points.
	"""
	        make_text_area(
	            label="List and describe the fields present in the dataset.",
	            key_list=key_pref + ["data-fields"],
	            help=data_fields_help,
	        )
	        make_text_area(
	            label="How was the dataset structure determined?",
	            key_list=key_pref + ["structure-description"],
	            help="[free text; paragraph]",
	        )
	        make_text_area(
	            label="How were the labels chosen?",
	            key_list=key_pref + ["structure-labels"],
	            help="[free text; paragraph]",
	        )
	        make_text_area(
	            label="Provide a JSON formatted example of a typical instance in the dataset.",
	            key_list=key_pref + ["structure-example"],
	            help="[JSON]",
	        )
	        make_text_area(
	            label="Describe and name the splits in the dataset if there are more than one.",
	            key_list=key_pref + ["structure-splits"],
	            help="[free text, paragraphs] - As appropriate, provide any descriptive statistics for the features, such as size, average lengths of input and output.",
	        )
	        make_text_area(
	            label="Describe any criteria for splitting the data, if used. If there are differences between the splits (e.g., if the training annotations are machine-generated and the dev and test ones are created by humans, or if different numbers of annotators contributed to each example), describe them here.",
	            key_list=key_pref + ["structure-splits-criteria"],
	            help="[free text, paragraphs]",
	        )
	        make_text_area(
	            label="What does an outlier of the dataset in terms of length/perplexity/embedding look like?",
	            key_list=key_pref + ["structure-outlier"],
	            help="[free text + json formatted text/file for an example]",
	        )


	def overview_summary():
	    """Show how many overview fields are present in the card dictionary.

	    The count for a sub-section is the number of keys stored under
	    ``card_dict["overview"][<sub-section>]``; the overall count sums the
	    sizes of every value stored under ``card_dict["overview"]``. The
	    result is rendered as a Markdown list inside a collapsed expander
	    whose title repeats the overall count.
	    """
	    overview = st.session_state.card_dict.get("overview", {})
	    filled_total = sum(len(section) for section in overview.values())

	    # (displayed title, key in the overview dict, expected number of fields)
	    sub_sections = (
	        ("Where to find", "where", N_FIELDS_WHERE),
	        ("Languages and Intended Use", "languages", N_FIELDS_LANGUAGES),
	        ("Credit", "credit", N_FIELDS_CREDIT),
	        ("Structure", "structure", N_FIELDS_STRUCTURE),
	    )

	    with st.expander(f"Dataset Overview Completion - {filled_total} of {N_FIELDS}", expanded=False):
	        parts = [f"- Overall competion:\n - {filled_total} of {N_FIELDS} fields\n"]
	        for title, section_key, expected in sub_sections:
	            filled = len(overview.get(section_key, {}))
	            parts.append(f"- Sub-section - {title}:\n - {filled} of {expected} fields\n")
	        st.markdown("".join(parts))