import streamlit as st

from .streamlit_utils import (
    make_multiselect,
    make_selectbox,
    make_text_area,
    make_text_input,
    make_radio,
)

N_FIELDS = 7


def results_page():
    """Render the "Previous Results" section of the data card form."""
    st.session_state.card_dict["results"] = st.session_state.card_dict.get(
        "results", {}
    )
    with st.expander("Previous Results", expanded=False):
        # All answers in this section are stored under card_dict["results"]["results"].
        key_pref = ["results", "results"]
        st.session_state.card_dict["results"]["results"] = st.session_state.card_dict[
            "results"
        ].get("results", {})
        make_text_area(
            label="What aspect of model ability can be measured with this dataset?",
            key_list=key_pref + ["model-abilities"],
            help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?",
        )
        make_multiselect(
            label="What metrics are typically used for this task?",
            key_list=key_pref + ["metrics"],
            options=[
                "BERT-Score",
                "BLEU",
                "BLEURT",
                "ChrF",
                "Entailment",
                "FeQA",
                "METEOR",
                "MoverScore",
                "QAGS",
                "ROUGE",
                "WER",
                "Other: Other Metrics",
            ],
            help="Select all metrics that are typically used when evaluating models for this task.",
        )
        # Only ask for definitions when the "Other" option is selected; otherwise
        # fill the field with "N/A" so the completion count stays consistent.
        if "Other: Other Metrics" in st.session_state.card_dict["results"][
            "results"
        ].get("metrics", []):
            make_text_area(
                label="Definitions of other metrics",
                key_list=key_pref + ["other-metrics-definitions"],
                help="If the evaluation strategies in the previous question go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
            )
        else:
            st.session_state.card_dict["results"]["results"][
                "other-metrics-definitions"
            ] = "N/A"
        make_text_area(
            label="List and describe the purpose of the metrics and evaluation methodology (including human evaluation) that the dataset creators used when introducing this task.",
            key_list=key_pref + ["original-evaluation"],
            help="If the generation task was not evaluated when this dataset was introduced, write N/A.",
        )
        make_radio(
            label="Are previous results available?",
            options=["no", "yes"],
            key_list=key_pref + ["has-previous-results"],
            help="Have papers evaluated models on this task? If no, the following two questions are filled in with N/A.",
        )
        # The follow-up questions only apply when previous results exist.
        if (
            st.session_state.card_dict["results"]["results"]["has-previous-results"]
            == "yes"
        ):
            make_text_area(
                label="What evaluation approaches have others used?",
                key_list=key_pref + ["current-evaluation"],
                help="If the current evaluation strategy diverges from the original one, describe how models are being evaluated.",
            )
            make_text_area(
                label="What are the most relevant previous results for this task/dataset?",
                key_list=key_pref + ["previous-results"],
                help="List and describe the source and performance metrics for models on this dataset.",
            )
        else:
            st.session_state.card_dict["results"]["results"]["current-evaluation"] = "N/A"
            st.session_state.card_dict["results"]["results"]["previous-results"] = "N/A"


def results_summary():
    """Show how many of this section's N_FIELDS fields have been filled in."""
    total_filled = sum(
        len(dct) for dct in st.session_state.card_dict.get("results", {}).values()
    )
    with st.expander(
        f"Previous Results Completion - {total_filled} of {N_FIELDS}", expanded=False
    ):
        completion_markdown = ""
        completion_markdown += (
            f"- **Overall completion:**\n  - {total_filled} of {N_FIELDS} fields\n"
        )
        completion_markdown += f"- **Sub-section - Previous Results:**\n  - {len(st.session_state.card_dict.get('results', {}).get('results', {}))} of {N_FIELDS} fields\n"
        st.markdown(completion_markdown)
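

# ---------------------------------------------------------------------------
# Usage sketch (assumption): this module is presumably imported by the app's
# main entry point, which calls results_page() when rendering the form and
# results_summary() when reporting completion. The entry-point module name and
# the card_dict initialization below are illustrative, not part of this file.
#
#     # app.py (hypothetical)
#     import streamlit as st
#     from .results import results_page, results_summary
#
#     if "card_dict" not in st.session_state:
#         st.session_state.card_dict = {}
#
#     results_page()      # renders the "Previous Results" expander
#     results_summary()   # renders the completion counter expander
# ---------------------------------------------------------------------------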