lewtun HF staff committed on
Commit
daea199
•
1 Parent(s): 9eeb453

Archive project

Files changed (1)
  1. app.py +548 -545
app.py CHANGED
@@ -131,6 +131,9 @@ SUPPORTED_METRICS = [
 # APP #
 #######
 st.title("Evaluation on the Hub")
+st.warning(
+    "**⚠️ This project has been archived. If you want to evaluate LLMs, checkout [this collection](https://huggingface.co/collections/clefourrier/llm-leaderboards-and-benchmarks-✨-64f99d2e11e92ca5568a7cce) of leaderboards.**"
+)
 st.markdown(
     """
     Welcome to Hugging Face's automatic model evaluator 👋!
@@ -146,548 +149,548 @@ st.markdown(
     """
 )
 
-all_datasets = [d.id for d in list_datasets()]
-query_params = st.experimental_get_query_params()
-if "first_query_params" not in st.session_state:
-    st.session_state.first_query_params = query_params
-first_query_params = st.session_state.first_query_params
-default_dataset = all_datasets[0]
-if "dataset" in first_query_params:
-    if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
-        default_dataset = first_query_params["dataset"][0]
-
-selected_dataset = st.selectbox(
-    "Select a dataset",
-    all_datasets,
-    index=all_datasets.index(default_dataset),
-    help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
-    new metadata to a dataset card.""",
-)
-st.experimental_set_query_params(**{"dataset": [selected_dataset]})
-
-# Check if selected dataset can be streamed
-is_valid_dataset = http_get(
-    path="/is-valid",
-    domain=DATASETS_PREVIEW_API,
-    params={"dataset": selected_dataset},
-).json()
-if is_valid_dataset["viewer"] is False and is_valid_dataset["preview"] is False:
-    st.error(
-        """The dataset you selected is not currently supported. Open a \
-        [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
-    )
-
-metadata = get_metadata(selected_dataset, token=HF_TOKEN)
-print(f"INFO -- Dataset metadata: {metadata}")
-if metadata is None:
-    st.warning("No evaluation metadata found. Please configure the evaluation job below.")
-
-with st.expander("Advanced configuration"):
-    # Select task
-    selected_task = st.selectbox(
-        "Select a task",
-        SUPPORTED_TASKS,
-        index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
-        help="""Don't see your favourite task here? Open a \
-        [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
-    )
-    # Select config
-    configs = get_dataset_config_names(selected_dataset)
-    selected_config = st.selectbox(
-        "Select a config",
-        configs,
-        help="""Some datasets contain several sub-datasets, known as _configurations_. \
-        Select one to evaluate your models on. \
-        See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
-        """,
-    )
-    # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
-    config_metadata = get_config_metadata(selected_config, metadata)
-    print(f"INFO -- Config metadata: {config_metadata}")
-
-    # Select splits
-    splits_resp = http_get(
-        path="/splits",
-        domain=DATASETS_PREVIEW_API,
-        params={"dataset": selected_dataset},
-    )
-    if splits_resp.status_code == 200:
-        split_names = []
-        all_splits = splits_resp.json()
-        for split in all_splits["splits"]:
-            if split["config"] == selected_config:
-                split_names.append(split["split"])
-
-        if config_metadata is not None:
-            eval_split = config_metadata["splits"].get("eval_split", None)
-        else:
-            eval_split = None
-        selected_split = st.selectbox(
-            "Select a split",
-            split_names,
-            index=split_names.index(eval_split) if eval_split is not None else 0,
-            help="Be wary when evaluating models on the `train` split.",
-        )
-
-    # Select columns
-    rows_resp = http_get(
-        path="/first-rows",
-        domain=DATASETS_PREVIEW_API,
-        params={
-            "dataset": selected_dataset,
-            "config": selected_config,
-            "split": selected_split,
-        },
-    ).json()
-    col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
-
-    st.markdown("**Map your dataset columns**")
-    st.markdown(
-        """The model evaluator uses a standardised set of column names for the input examples and labels. \
-        Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
-    )
-    col1, col2 = st.columns(2)
-
-    # TODO: find a better way to layout these items
-    # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
-    col_mapping = {}
-    if selected_task in ["binary_classification", "multi_class_classification"]:
-        with col1:
-            st.markdown("`text` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`target` column")
-        with col2:
-            text_col = st.selectbox(
-                "This column should contain the text to be classified",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
-                if config_metadata is not None
-                else 0,
-            )
-            target_col = st.selectbox(
-                "This column should contain the labels associated with the text",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
-                if config_metadata is not None
-                else 0,
-            )
-            col_mapping[text_col] = "text"
-            col_mapping[target_col] = "target"
-
-    elif selected_task == "text_zero_shot_classification":
-        with col1:
-            st.markdown("`text` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`classes` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`target` column")
-        with col2:
-            text_col = st.selectbox(
-                "This column should contain the text to be classified",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
-                if config_metadata is not None
-                else 0,
-            )
-            classes_col = st.selectbox(
-                "This column should contain the classes associated with the text",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
-                if config_metadata is not None
-                else 0,
-            )
-            target_col = st.selectbox(
-                "This column should contain the index of the correct class",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
-                if config_metadata is not None
-                else 0,
-            )
-            col_mapping[text_col] = "text"
-            col_mapping[classes_col] = "classes"
-            col_mapping[target_col] = "target"
-
-    if selected_task in ["natural_language_inference"]:
-        config_metadata = get_config_metadata(selected_config, metadata)
-        with col1:
-            st.markdown("`text1` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`text2` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`target` column")
-        with col2:
-            text1_col = st.selectbox(
-                "This column should contain the first text passage to be classified",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
-                if config_metadata is not None
-                else 0,
-            )
-            text2_col = st.selectbox(
-                "This column should contain the second text passage to be classified",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
-                if config_metadata is not None
-                else 0,
-            )
-            target_col = st.selectbox(
-                "This column should contain the labels associated with the text",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
-                if config_metadata is not None
-                else 0,
-            )
-            col_mapping[text1_col] = "text1"
-            col_mapping[text2_col] = "text2"
-            col_mapping[target_col] = "target"
-
-    elif selected_task == "entity_extraction":
-        with col1:
-            st.markdown("`tokens` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`tags` column")
-        with col2:
-            tokens_col = st.selectbox(
-                "This column should contain the array of tokens to be classified",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
-                if config_metadata is not None
-                else 0,
-            )
-            tags_col = st.selectbox(
-                "This column should contain the labels associated with each part of the text",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
-                if config_metadata is not None
-                else 0,
-            )
-            col_mapping[tokens_col] = "tokens"
-            col_mapping[tags_col] = "tags"
-
-    elif selected_task == "translation":
-        with col1:
-            st.markdown("`source` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`target` column")
-        with col2:
-            text_col = st.selectbox(
-                "This column should contain the text to be translated",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
-                if config_metadata is not None
-                else 0,
-            )
-            target_col = st.selectbox(
-                "This column should contain the target translation",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
-                if config_metadata is not None
-                else 0,
-            )
-            col_mapping[text_col] = "source"
-            col_mapping[target_col] = "target"
-
-    elif selected_task == "summarization":
-        with col1:
-            st.markdown("`text` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`target` column")
-        with col2:
-            text_col = st.selectbox(
-                "This column should contain the text to be summarized",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
-                if config_metadata is not None
-                else 0,
-            )
-            target_col = st.selectbox(
-                "This column should contain the target summary",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
-                if config_metadata is not None
-                else 0,
-            )
-            col_mapping[text_col] = "text"
-            col_mapping[target_col] = "target"
-
-    elif selected_task == "extractive_question_answering":
-        if config_metadata is not None:
-            col_mapping = config_metadata["col_mapping"]
-            # Hub YAML parser converts periods to hyphens, so we remap them here
-            col_mapping = format_col_mapping(col_mapping)
-        with col1:
-            st.markdown("`context` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`question` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`answers.text` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`answers.answer_start` column")
-        with col2:
-            context_col = st.selectbox(
-                "This column should contain the question's context",
-                col_names,
-                index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
-            )
-            question_col = st.selectbox(
-                "This column should contain the question to be answered, given the context",
-                col_names,
-                index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
-            )
-            answers_text_col = st.selectbox(
-                "This column should contain example answers to the question, extracted from the context",
-                col_names,
-                index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
-            )
-            answers_start_col = st.selectbox(
-                "This column should contain the indices in the context of the first character of each `answers.text`",
-                col_names,
-                index=col_names.index(get_key(col_mapping, "answers.answer_start"))
-                if config_metadata is not None
-                else 0,
-            )
-            col_mapping[context_col] = "context"
-            col_mapping[question_col] = "question"
-            col_mapping[answers_text_col] = "answers.text"
-            col_mapping[answers_start_col] = "answers.answer_start"
-    elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
-        with col1:
-            st.markdown("`image` column")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.text("")
-            st.markdown("`target` column")
-        with col2:
-            image_col = st.selectbox(
-                "This column should contain the images to be classified",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
-                if config_metadata is not None
-                else 0,
-            )
-            target_col = st.selectbox(
-                "This column should contain the labels associated with the images",
-                col_names,
-                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
-                if config_metadata is not None
-                else 0,
-            )
-            col_mapping[image_col] = "image"
-            col_mapping[target_col] = "target"
-
-    # Select metrics
-    st.markdown("**Select metrics**")
-    st.markdown("The following metrics will be computed")
-    html_string = " ".join(
-        [
-            '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
-            + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
-            + 'padding-left:5px;color:white">'
-            + metric
-            + "</div></div>"
-            for metric in TASK_TO_DEFAULT_METRICS[selected_task]
-        ]
-    )
-    st.markdown(html_string, unsafe_allow_html=True)
-    selected_metrics = st.multiselect(
-        "(Optional) Select additional metrics",
-        sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
-        help="""User-selected metrics will be computed with their default arguments. \
-        For example, `f1` will report results for binary labels. \
-        Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
-    )
-
-with st.form(key="form"):
-    compatible_models = get_compatible_models(selected_task, [selected_dataset])
-    selected_models = st.multiselect(
-        "Select the models you wish to evaluate",
-        compatible_models,
-        help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
-        [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
-    )
-    print("INFO -- Selected models before filter:", selected_models)
-
-    hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")
-
-    submit_button = st.form_submit_button("Evaluate models 🚀")
-
-    if submit_button:
-        if len(hf_username) == 0:
-            st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
-        elif len(selected_models) == 0:
-            st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
-        elif len(selected_models) > 10:
-            st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
-        else:
-            # Filter out previously evaluated models
-            selected_models = filter_evaluated_models(
-                selected_models,
-                selected_task,
-                selected_dataset,
-                selected_config,
-                selected_split,
-                selected_metrics,
-            )
-            print("INFO -- Selected models after filter:", selected_models)
-            if len(selected_models) > 0:
-                project_payload = {
-                    "username": AUTOTRAIN_USERNAME,
-                    "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
-                    "task": TASK_TO_ID[selected_task],
-                    "config": {
-                        "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
-                        if selected_task in AUTOTRAIN_TASK_TO_LANG
-                        else "en",
-                        "max_models": 5,
-                        "instance": {
-                            "provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
-                            "instance_type": AUTOTRAIN_MACHINE[selected_task]
-                            if selected_task in AUTOTRAIN_MACHINE.keys()
-                            else "p3",
-                            "max_runtime_seconds": 172800,
-                            "num_instances": 1,
-                            "disk_size_gb": 200,
-                        },
-                        "evaluation": {
-                            "metrics": selected_metrics,
-                            "models": selected_models,
-                            "hf_username": hf_username,
-                        },
-                    },
-                }
-                print(f"INFO -- Payload: {project_payload}")
-                project_json_resp = http_post(
-                    path="/projects/create",
-                    payload=project_payload,
-                    token=HF_TOKEN,
-                    domain=AUTOTRAIN_BACKEND_API,
-                ).json()
-                print(f"INFO -- Project creation response: {project_json_resp}")
-
-                if project_json_resp["created"]:
-                    data_payload = {
-                        "split": 4,  # use "auto" split choice in AutoTrain
-                        "col_mapping": col_mapping,
-                        "load_config": {"max_size_bytes": 0, "shuffle": False},
-                        "dataset_id": selected_dataset,
-                        "dataset_config": selected_config,
-                        "dataset_split": selected_split,
-                    }
-                    data_json_resp = http_post(
-                        path=f"/projects/{project_json_resp['id']}/data/dataset",
-                        payload=data_payload,
-                        token=HF_TOKEN,
-                        domain=AUTOTRAIN_BACKEND_API,
-                    ).json()
-                    print(f"INFO -- Dataset creation response: {data_json_resp}")
-                    if data_json_resp["download_status"] == 1:
-                        train_json_resp = http_post(
-                            path=f"/projects/{project_json_resp['id']}/data/start_processing",
-                            token=HF_TOKEN,
-                            domain=AUTOTRAIN_BACKEND_API,
-                        ).json()
-                        # For local development we process and approve projects on-the-fly
-                        if "localhost" in AUTOTRAIN_BACKEND_API:
-                            with st.spinner("⏳ Waiting for data processing to complete ..."):
-                                is_data_processing_success = False
-                                while is_data_processing_success is not True:
-                                    project_status = http_get(
-                                        path=f"/projects/{project_json_resp['id']}",
-                                        token=HF_TOKEN,
-                                        domain=AUTOTRAIN_BACKEND_API,
-                                    ).json()
-                                    if project_status["status"] == 3:
-                                        is_data_processing_success = True
-                                    time.sleep(10)
-
-                                # Approve training job
-                                train_job_resp = http_post(
-                                    path=f"/projects/{project_json_resp['id']}/start_training",
-                                    token=HF_TOKEN,
-                                    domain=AUTOTRAIN_BACKEND_API,
-                                ).json()
-                                st.success("✅ Data processing and project approval complete - go forth and evaluate!")
-                        else:
-                            # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
-                            print(f"INFO -- AutoTrain job response: {train_json_resp}")
-                        if train_json_resp["success"]:
-                            train_eval_index = {
-                                "train-eval-index": [
-                                    {
-                                        "config": selected_config,
-                                        "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
-                                        "task_id": selected_task,
-                                        "splits": {"eval_split": selected_split},
-                                        "col_mapping": col_mapping,
-                                    }
-                                ]
-                            }
-                            selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
-                            dataset_card_url = get_dataset_card_url(selected_dataset)
-                            st.success("✅ Successfully submitted evaluation job!")
-                            st.markdown(
-                                f"""
-                                Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
-
-                                * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
-                                * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
-                                * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
-                                """  # noqa
-                            )
-                            st.markdown(
-                                f"""
-                                ```yaml
-                                {selected_metadata}
-                                """
-                            )
-                            print("INFO -- Pushing evaluation job logs to the Hub")
-                            evaluation_log = {}
-                            evaluation_log["project_id"] = project_json_resp["id"]
-                            evaluation_log["autotrain_env"] = (
-                                "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
-                            )
-                            evaluation_log["payload"] = project_payload
-                            evaluation_log["project_creation_response"] = project_json_resp
-                            evaluation_log["dataset_creation_response"] = data_json_resp
-                            evaluation_log["autotrain_job_response"] = train_json_resp
-                            commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
-                        else:
-                            st.error("🙈 Oh no, there was an error submitting your evaluation job!")
-            else:
-                st.warning("⚠️ No models left to evaluate! Please select other models and try again.")
+# all_datasets = [d.id for d in list_datasets()]
+# query_params = st.experimental_get_query_params()
+# if "first_query_params" not in st.session_state:
+#     st.session_state.first_query_params = query_params
+# first_query_params = st.session_state.first_query_params
+# default_dataset = all_datasets[0]
+# if "dataset" in first_query_params:
+#     if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
+#         default_dataset = first_query_params["dataset"][0]
+
+# selected_dataset = st.selectbox(
+#     "Select a dataset",
+#     all_datasets,
+#     index=all_datasets.index(default_dataset),
+#     help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
+#     new metadata to a dataset card.""",
+# )
+# st.experimental_set_query_params(**{"dataset": [selected_dataset]})
+
+# # Check if selected dataset can be streamed
+# is_valid_dataset = http_get(
+#     path="/is-valid",
+#     domain=DATASETS_PREVIEW_API,
+#     params={"dataset": selected_dataset},
+# ).json()
+# if is_valid_dataset["viewer"] is False and is_valid_dataset["preview"] is False:
+#     st.error(
+#         """The dataset you selected is not currently supported. Open a \
+#         [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
+#     )
+
+# metadata = get_metadata(selected_dataset, token=HF_TOKEN)
+# print(f"INFO -- Dataset metadata: {metadata}")
+# if metadata is None:
+#     st.warning("No evaluation metadata found. Please configure the evaluation job below.")
+
+# with st.expander("Advanced configuration"):
+#     # Select task
+#     selected_task = st.selectbox(
+#         "Select a task",
+#         SUPPORTED_TASKS,
+#         index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
+#         help="""Don't see your favourite task here? Open a \
+#         [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
+#     )
+#     # Select config
+#     configs = get_dataset_config_names(selected_dataset)
+#     selected_config = st.selectbox(
+#         "Select a config",
+#         configs,
+#         help="""Some datasets contain several sub-datasets, known as _configurations_. \
+#         Select one to evaluate your models on. \
+#         See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
+#         """,
+#     )
+#     # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
+#     config_metadata = get_config_metadata(selected_config, metadata)
+#     print(f"INFO -- Config metadata: {config_metadata}")
+
+#     # Select splits
+#     splits_resp = http_get(
+#         path="/splits",
+#         domain=DATASETS_PREVIEW_API,
+#         params={"dataset": selected_dataset},
+#     )
+#     if splits_resp.status_code == 200:
+#         split_names = []
+#         all_splits = splits_resp.json()
+#         for split in all_splits["splits"]:
+#             if split["config"] == selected_config:
+#                 split_names.append(split["split"])
+
+#         if config_metadata is not None:
+#             eval_split = config_metadata["splits"].get("eval_split", None)
+#         else:
+#             eval_split = None
+#         selected_split = st.selectbox(
+#             "Select a split",
+#             split_names,
+#             index=split_names.index(eval_split) if eval_split is not None else 0,
+#             help="Be wary when evaluating models on the `train` split.",
+#         )
+
+#     # Select columns
+#     rows_resp = http_get(
+#         path="/first-rows",
+#         domain=DATASETS_PREVIEW_API,
+#         params={
+#             "dataset": selected_dataset,
+#             "config": selected_config,
+#             "split": selected_split,
+#         },
+#     ).json()
+#     col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
+
+#     st.markdown("**Map your dataset columns**")
+#     st.markdown(
+#         """The model evaluator uses a standardised set of column names for the input examples and labels. \
+#         Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
+#     )
+#     col1, col2 = st.columns(2)
+
+#     # TODO: find a better way to layout these items
+#     # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
+#     col_mapping = {}
+#     if selected_task in ["binary_classification", "multi_class_classification"]:
+#         with col1:
+#             st.markdown("`text` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`target` column")
+#         with col2:
+#             text_col = st.selectbox(
+#                 "This column should contain the text to be classified",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             target_col = st.selectbox(
+#                 "This column should contain the labels associated with the text",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             col_mapping[text_col] = "text"
+#             col_mapping[target_col] = "target"
+
+#     elif selected_task == "text_zero_shot_classification":
+#         with col1:
+#             st.markdown("`text` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`classes` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`target` column")
+#         with col2:
+#             text_col = st.selectbox(
+#                 "This column should contain the text to be classified",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             classes_col = st.selectbox(
+#                 "This column should contain the classes associated with the text",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             target_col = st.selectbox(
+#                 "This column should contain the index of the correct class",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             col_mapping[text_col] = "text"
+#             col_mapping[classes_col] = "classes"
+#             col_mapping[target_col] = "target"
+
+#     if selected_task in ["natural_language_inference"]:
+#         config_metadata = get_config_metadata(selected_config, metadata)
+#         with col1:
+#             st.markdown("`text1` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`text2` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`target` column")
+#         with col2:
+#             text1_col = st.selectbox(
+#                 "This column should contain the first text passage to be classified",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             text2_col = st.selectbox(
+#                 "This column should contain the second text passage to be classified",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             target_col = st.selectbox(
+#                 "This column should contain the labels associated with the text",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             col_mapping[text1_col] = "text1"
+#             col_mapping[text2_col] = "text2"
+#             col_mapping[target_col] = "target"
+
+#     elif selected_task == "entity_extraction":
+#         with col1:
+#             st.markdown("`tokens` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`tags` column")
+#         with col2:
+#             tokens_col = st.selectbox(
+#                 "This column should contain the array of tokens to be classified",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             tags_col = st.selectbox(
+#                 "This column should contain the labels associated with each part of the text",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             col_mapping[tokens_col] = "tokens"
+#             col_mapping[tags_col] = "tags"
+
+#     elif selected_task == "translation":
+#         with col1:
+#             st.markdown("`source` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`target` column")
+#         with col2:
+#             text_col = st.selectbox(
+#                 "This column should contain the text to be translated",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             target_col = st.selectbox(
+#                 "This column should contain the target translation",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             col_mapping[text_col] = "source"
+#             col_mapping[target_col] = "target"
+
+#     elif selected_task == "summarization":
+#         with col1:
+#             st.markdown("`text` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`target` column")
+#         with col2:
+#             text_col = st.selectbox(
+#                 "This column should contain the text to be summarized",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             target_col = st.selectbox(
+#                 "This column should contain the target summary",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             col_mapping[text_col] = "text"
+#             col_mapping[target_col] = "target"
+
+#     elif selected_task == "extractive_question_answering":
+#         if config_metadata is not None:
+#             col_mapping = config_metadata["col_mapping"]
+#             # Hub YAML parser converts periods to hyphens, so we remap them here
+#             col_mapping = format_col_mapping(col_mapping)
+#         with col1:
+#             st.markdown("`context` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`question` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`answers.text` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`answers.answer_start` column")
+#         with col2:
+#             context_col = st.selectbox(
+#                 "This column should contain the question's context",
+#                 col_names,
+#                 index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
+#             )
+#             question_col = st.selectbox(
+#                 "This column should contain the question to be answered, given the context",
+#                 col_names,
+#                 index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
+#             )
+#             answers_text_col = st.selectbox(
+#                 "This column should contain example answers to the question, extracted from the context",
+#                 col_names,
+#                 index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
+#             )
+#             answers_start_col = st.selectbox(
+#                 "This column should contain the indices in the context of the first character of each `answers.text`",
+#                 col_names,
+#                 index=col_names.index(get_key(col_mapping, "answers.answer_start"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             col_mapping[context_col] = "context"
+#             col_mapping[question_col] = "question"
+#             col_mapping[answers_text_col] = "answers.text"
+#             col_mapping[answers_start_col] = "answers.answer_start"
+#     elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
+#         with col1:
+#             st.markdown("`image` column")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.text("")
+#             st.markdown("`target` column")
+#         with col2:
+#             image_col = st.selectbox(
+#                 "This column should contain the images to be classified",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             target_col = st.selectbox(
+#                 "This column should contain the labels associated with the images",
+#                 col_names,
+#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+#                 if config_metadata is not None
+#                 else 0,
+#             )
+#             col_mapping[image_col] = "image"
+#             col_mapping[target_col] = "target"
+
+#     # Select metrics
+#     st.markdown("**Select metrics**")
+#     st.markdown("The following metrics will be computed")
+#     html_string = " ".join(
+#         [
+#             '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
+#             + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
+#             + 'padding-left:5px;color:white">'
+#             + metric
+#             + "</div></div>"
+#             for metric in TASK_TO_DEFAULT_METRICS[selected_task]
+#         ]
+#     )
+#     st.markdown(html_string, unsafe_allow_html=True)
+#     selected_metrics = st.multiselect(
+#         "(Optional) Select additional metrics",
+#         sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
+#         help="""User-selected metrics will be computed with their default arguments. \
+#         For example, `f1` will report results for binary labels. \
+#         Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
+#     )
+
+# with st.form(key="form"):
+#     compatible_models = get_compatible_models(selected_task, [selected_dataset])
+#     selected_models = st.multiselect(
+#         "Select the models you wish to evaluate",
+#         compatible_models,
+#         help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
+#         [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
+#     )
+#     print("INFO -- Selected models before filter:", selected_models)
+
+#     hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")
+
+#     submit_button = st.form_submit_button("Evaluate models 🚀")
+
+#     if submit_button:
+#         if len(hf_username) == 0:
+#             st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
+#         elif len(selected_models) == 0:
+#             st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
+#         elif len(selected_models) > 10:
+#             st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
+#         else:
+#             # Filter out previously evaluated models
+#             selected_models = filter_evaluated_models(
+#                 selected_models,
+#                 selected_task,
+#                 selected_dataset,
+#                 selected_config,
+#                 selected_split,
+#                 selected_metrics,
+#             )
+#             print("INFO -- Selected models after filter:", selected_models)
+#             if len(selected_models) > 0:
+#                 project_payload = {
+#                     "username": AUTOTRAIN_USERNAME,
+#                     "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
+#                     "task": TASK_TO_ID[selected_task],
+#                     "config": {
+#                         "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
+#                         if selected_task in AUTOTRAIN_TASK_TO_LANG
+#                         else "en",
+#                         "max_models": 5,
+#                         "instance": {
+#                             "provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
+#                             "instance_type": AUTOTRAIN_MACHINE[selected_task]
+#                             if selected_task in AUTOTRAIN_MACHINE.keys()
+#                             else "p3",
+#                             "max_runtime_seconds": 172800,
+#                             "num_instances": 1,
+#                             "disk_size_gb": 200,
+#                         },
+#                         "evaluation": {
+#                             "metrics": selected_metrics,
+#                             "models": selected_models,
+#                             "hf_username": hf_username,
+#                         },
+#                     },
+#                 }
+#                 print(f"INFO -- Payload: {project_payload}")
+#                 project_json_resp = http_post(
+#                     path="/projects/create",
+#                     payload=project_payload,
+#                     token=HF_TOKEN,
+#                     domain=AUTOTRAIN_BACKEND_API,
+#                 ).json()
+#                 print(f"INFO -- Project creation response: {project_json_resp}")
+
+#                 if project_json_resp["created"]:
+#                     data_payload = {
+#                         "split": 4,  # use "auto" split choice in AutoTrain
+#                         "col_mapping": col_mapping,
+#                         "load_config": {"max_size_bytes": 0, "shuffle": False},
+#                         "dataset_id": selected_dataset,
+#                         "dataset_config": selected_config,
+#                         "dataset_split": selected_split,
+#                     }
+#                     data_json_resp = http_post(
+#                         path=f"/projects/{project_json_resp['id']}/data/dataset",
+#                         payload=data_payload,
+#                         token=HF_TOKEN,
+#                         domain=AUTOTRAIN_BACKEND_API,
+#                     ).json()
+#                     print(f"INFO -- Dataset creation response: {data_json_resp}")
+#                     if data_json_resp["download_status"] == 1:
+#                         train_json_resp = http_post(
+#                             path=f"/projects/{project_json_resp['id']}/data/start_processing",
+#                             token=HF_TOKEN,
+#                             domain=AUTOTRAIN_BACKEND_API,
+#                         ).json()
+#                         # For local development we process and approve projects on-the-fly
+#                         if "localhost" in AUTOTRAIN_BACKEND_API:
+#                             with st.spinner("⏳ Waiting for data processing to complete ..."):
+#                                 is_data_processing_success = False
+#                                 while is_data_processing_success is not True:
+#                                     project_status = http_get(
+#                                         path=f"/projects/{project_json_resp['id']}",
+#                                         token=HF_TOKEN,
+#                                         domain=AUTOTRAIN_BACKEND_API,
+#                                     ).json()
+#                                     if project_status["status"] == 3:
+#                                         is_data_processing_success = True
+#                                     time.sleep(10)
+
+#                                 # Approve training job
+#                                 train_job_resp = http_post(
+#                                     path=f"/projects/{project_json_resp['id']}/start_training",
+#                                     token=HF_TOKEN,
+#                                     domain=AUTOTRAIN_BACKEND_API,
+#                                 ).json()
+#                                 st.success("✅ Data processing and project approval complete - go forth and evaluate!")
+#                         else:
+#                             # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
+#                             print(f"INFO -- AutoTrain job response: {train_json_resp}")
+#                         if train_json_resp["success"]:
+#                             train_eval_index = {
+#                                 "train-eval-index": [
+#                                     {
+#                                         "config": selected_config,
+#                                         "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
+#                                         "task_id": selected_task,
+#                                         "splits": {"eval_split": selected_split},
+#                                         "col_mapping": col_mapping,
+#                                     }
+#                                 ]
+#                             }
+#                             selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
+#                             dataset_card_url = get_dataset_card_url(selected_dataset)
+#                             st.success("✅ Successfully submitted evaluation job!")
+#                             st.markdown(
+#                                 f"""
+#                                 Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
+
+#                                 * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
+#                                 * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
+#                                 * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
+#                                 """  # noqa
+#                             )
+#                             st.markdown(
+#                                 f"""
+#                                 ```yaml
+#                                 {selected_metadata}
+#                                 """
+#                             )
+#                             print("INFO -- Pushing evaluation job logs to the Hub")
+#                             evaluation_log = {}
+#                             evaluation_log["project_id"] = project_json_resp["id"]
+#                             evaluation_log["autotrain_env"] = (
+#                                 "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
+#                             )
+#                             evaluation_log["payload"] = project_payload
+#                             evaluation_log["project_creation_response"] = project_json_resp
+#                             evaluation_log["dataset_creation_response"] = data_json_resp
+#                             evaluation_log["autotrain_job_response"] = train_json_resp
+#                             commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
+#                         else:
+#                             st.error("🙈 Oh no, there was an error submitting your evaluation job!")
+#             else:
+#                 st.warning("⚠️ No models left to evaluate! Please select other models and try again.")