import os
import time
from pathlib import Path

import pandas as pd
import streamlit as st
import yaml
from datasets import get_dataset_config_names
from dotenv import load_dotenv
from huggingface_hub import list_datasets

from evaluation import filter_evaluated_models
from utils import (
    AUTOTRAIN_TASK_TO_HUB_TASK,
    commit_evaluation_log,
    create_autotrain_project_name,
    format_col_mapping,
    get_compatible_models,
    get_config_metadata,
    get_dataset_card_url,
    get_key,
    get_metadata,
    http_get,
    http_post,
)

if Path(".env").is_file():
    load_dotenv(".env")

HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")

# Put image tasks on top
TASK_TO_ID = {
    "image_binary_classification": 17,
    "image_multi_class_classification": 18,
    "binary_classification": 1,
    "multi_class_classification": 2,
    "natural_language_inference": 22,
    "entity_extraction": 4,
    "extractive_question_answering": 5,
    "translation": 6,
    "summarization": 8,
    "text_zero_shot_classification": 23,
}

TASK_TO_DEFAULT_METRICS = {
    "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "multi_class_classification": [
        "f1",
        "precision",
        "recall",
        "accuracy",
    ],
    "natural_language_inference": ["f1", "precision", "recall", "auc", "accuracy"],
    "entity_extraction": ["precision", "recall", "f1", "accuracy"],
    "extractive_question_answering": ["f1", "exact_match"],
    "translation": ["sacrebleu"],
    "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    "image_binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "image_multi_class_classification": [
        "f1",
        "precision",
        "recall",
        "accuracy",
    ],
    "text_zero_shot_classification": ["accuracy", "loss"],
}

AUTOTRAIN_TASK_TO_LANG = {
    "translation": "en2de",
    "image_binary_classification": "unk",
    "image_multi_class_classification": "unk",
}

AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}


SUPPORTED_TASKS = list(TASK_TO_ID.keys())

# Extracted from utils.get_supported_metrics
# Hardcoded for now due to speed / caching constraints
SUPPORTED_METRICS = [
    "accuracy",
    "bertscore",
    "bleu",
    "cer",
    "chrf",
    "code_eval",
    "comet",
    "competition_math",
    "coval",
    "cuad",
    "exact_match",
    "f1",
    "frugalscore",
    "google_bleu",
    "mae",
    "mahalanobis",
    "matthews_correlation",
    "mean_iou",
    "meteor",
    "mse",
    "pearsonr",
    "perplexity",
    "precision",
    "recall",
    "roc_auc",
    "rouge",
    "sacrebleu",
    "sari",
    "seqeval",
    "spearmanr",
    "squad",
    "squad_v2",
    "ter",
    "trec_eval",
    "wer",
    "wiki_split",
    "xnli",
    "angelina-wang/directional_bias_amplification",
    "jordyvl/ece",
    "lvwerra/ai4code",
    "lvwerra/amex",
]


#######
# APP #
#######
st.title("Evaluation on the Hub")
st.warning(
    "**⚠️ This project has been archived. If you want to evaluate LLMs, checkout [this collection](https://huggingface.co/collections/clefourrier/llm-leaderboards-and-benchmarks-✨-64f99d2e11e92ca5568a7cce) of leaderboards.**"
)
st.markdown(
    """
    Welcome to Hugging Face's automatic model evaluator 👋!

    This application allows you to evaluate 🤗 Transformers
    [models](https://huggingface.co/models?library=transformers&sort=downloads)
    across a wide variety of [datasets](https://huggingface.co/datasets) on the
    Hub. Please select the dataset and configuration below. The results of your
    evaluation will be displayed on the [public
    leaderboards](https://huggingface.co/spaces/autoevaluate/leaderboards). For
    more details, check out out our [blog
    post](https://huggingface.co/blog/eval-on-the-hub).
    """
)

# all_datasets = [d.id for d in list_datasets()]
# query_params = st.experimental_get_query_params()
# if "first_query_params" not in st.session_state:
#     st.session_state.first_query_params = query_params
# first_query_params = st.session_state.first_query_params
# default_dataset = all_datasets[0]
# if "dataset" in first_query_params:
#     if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
#         default_dataset = first_query_params["dataset"][0]

# selected_dataset = st.selectbox(
#     "Select a dataset",
#     all_datasets,
#     index=all_datasets.index(default_dataset),
#     help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
#         new metadata to a dataset card.""",
# )
# st.experimental_set_query_params(**{"dataset": [selected_dataset]})

# # Check if selected dataset can be streamed
# is_valid_dataset = http_get(
#     path="/is-valid",
#     domain=DATASETS_PREVIEW_API,
#     params={"dataset": selected_dataset},
# ).json()
# if is_valid_dataset["viewer"] is False and is_valid_dataset["preview"] is False:
#     st.error(
#         """The dataset you selected is not currently supported. Open a \
#             [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
#     )

# metadata = get_metadata(selected_dataset, token=HF_TOKEN)
# print(f"INFO -- Dataset metadata: {metadata}")
# if metadata is None:
#     st.warning("No evaluation metadata found. Please configure the evaluation job below.")

# with st.expander("Advanced configuration"):
#     # Select task
#     selected_task = st.selectbox(
#         "Select a task",
#         SUPPORTED_TASKS,
#         index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
#         help="""Don't see your favourite task here? Open a \
#             [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
#     )
#     # Select config
#     configs = get_dataset_config_names(selected_dataset)
#     selected_config = st.selectbox(
#         "Select a config",
#         configs,
#         help="""Some datasets contain several sub-datasets, known as _configurations_. \
#             Select one to evaluate your models on. \
#             See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
#             """,
#     )
#     # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
#     config_metadata = get_config_metadata(selected_config, metadata)
#     print(f"INFO -- Config metadata: {config_metadata}")

#     # Select splits
#     splits_resp = http_get(
#         path="/splits",
#         domain=DATASETS_PREVIEW_API,
#         params={"dataset": selected_dataset},
#     )
#     if splits_resp.status_code == 200:
#         split_names = []
#         all_splits = splits_resp.json()
#         for split in all_splits["splits"]:
#             if split["config"] == selected_config:
#                 split_names.append(split["split"])

#         if config_metadata is not None:
#             eval_split = config_metadata["splits"].get("eval_split", None)
#         else:
#             eval_split = None
#         selected_split = st.selectbox(
#             "Select a split",
#             split_names,
#             index=split_names.index(eval_split) if eval_split is not None else 0,
#             help="Be wary when evaluating models on the `train` split.",
#         )

#     # Select columns
#     rows_resp = http_get(
#         path="/first-rows",
#         domain=DATASETS_PREVIEW_API,
#         params={
#             "dataset": selected_dataset,
#             "config": selected_config,
#             "split": selected_split,
#         },
#     ).json()
#     col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)

#     st.markdown("**Map your dataset columns**")
#     st.markdown(
#         """The model evaluator uses a standardised set of column names for the input examples and labels. \
#         Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
#     )
#     col1, col2 = st.columns(2)

#     # TODO: find a better way to layout these items
#     # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
#     col_mapping = {}
#     if selected_task in ["binary_classification", "multi_class_classification"]:
#         with col1:
#             st.markdown("`text` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`target` column")
#         with col2:
#             text_col = st.selectbox(
#                 "This column should contain the text to be classified",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             target_col = st.selectbox(
#                 "This column should contain the labels associated with the text",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             col_mapping[text_col] = "text"
#             col_mapping[target_col] = "target"

#     elif selected_task == "text_zero_shot_classification":
#         with col1:
#             st.markdown("`text` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`classes` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`target` column")
#         with col2:
#             text_col = st.selectbox(
#                 "This column should contain the text to be classified",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             classes_col = st.selectbox(
#                 "This column should contain the classes associated with the text",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             target_col = st.selectbox(
#                 "This column should contain the index of the correct class",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             col_mapping[text_col] = "text"
#             col_mapping[classes_col] = "classes"
#             col_mapping[target_col] = "target"

#     if selected_task in ["natural_language_inference"]:
#         config_metadata = get_config_metadata(selected_config, metadata)
#         with col1:
#             st.markdown("`text1` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`text2` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`target` column")
#         with col2:
#             text1_col = st.selectbox(
#                 "This column should contain the first text passage to be classified",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             text2_col = st.selectbox(
#                 "This column should contain the second text passage to be classified",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             target_col = st.selectbox(
#                 "This column should contain the labels associated with the text",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             col_mapping[text1_col] = "text1"
#             col_mapping[text2_col] = "text2"
#             col_mapping[target_col] = "target"

#     elif selected_task == "entity_extraction":
#         with col1:
#             st.markdown("`tokens` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`tags` column")
#         with col2:
#             tokens_col = st.selectbox(
#                 "This column should contain the array of tokens to be classified",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             tags_col = st.selectbox(
#                 "This column should contain the labels associated with each part of the text",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             col_mapping[tokens_col] = "tokens"
#             col_mapping[tags_col] = "tags"

#     elif selected_task == "translation":
#         with col1:
#             st.markdown("`source` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`target` column")
#         with col2:
#             text_col = st.selectbox(
#                 "This column should contain the text to be translated",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             target_col = st.selectbox(
#                 "This column should contain the target translation",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             col_mapping[text_col] = "source"
#             col_mapping[target_col] = "target"

#     elif selected_task == "summarization":
#         with col1:
#             st.markdown("`text` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`target` column")
#         with col2:
#             text_col = st.selectbox(
#                 "This column should contain the text to be summarized",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             target_col = st.selectbox(
#                 "This column should contain the target summary",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             col_mapping[text_col] = "text"
#             col_mapping[target_col] = "target"

#     elif selected_task == "extractive_question_answering":
#         if config_metadata is not None:
#             col_mapping = config_metadata["col_mapping"]
#             # Hub YAML parser converts periods to hyphens, so we remap them here
#             col_mapping = format_col_mapping(col_mapping)
#         with col1:
#             st.markdown("`context` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`question` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`answers.text` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`answers.answer_start` column")
#         with col2:
#             context_col = st.selectbox(
#                 "This column should contain the question's context",
#                 col_names,
#                 index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
#             )
#             question_col = st.selectbox(
#                 "This column should contain the question to be answered, given the context",
#                 col_names,
#                 index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
#             )
#             answers_text_col = st.selectbox(
#                 "This column should contain example answers to the question, extracted from the context",
#                 col_names,
#                 index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
#             )
#             answers_start_col = st.selectbox(
#                 "This column should contain the indices in the context of the first character of each `answers.text`",
#                 col_names,
#                 index=col_names.index(get_key(col_mapping, "answers.answer_start"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             col_mapping[context_col] = "context"
#             col_mapping[question_col] = "question"
#             col_mapping[answers_text_col] = "answers.text"
#             col_mapping[answers_start_col] = "answers.answer_start"
#     elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
#         with col1:
#             st.markdown("`image` column")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.text("")
#             st.markdown("`target` column")
#         with col2:
#             image_col = st.selectbox(
#                 "This column should contain the images to be classified",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             target_col = st.selectbox(
#                 "This column should contain the labels associated with the images",
#                 col_names,
#                 index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
#                 if config_metadata is not None
#                 else 0,
#             )
#             col_mapping[image_col] = "image"
#             col_mapping[target_col] = "target"

#     # Select metrics
#     st.markdown("**Select metrics**")
#     st.markdown("The following metrics will be computed")
#     html_string = " ".join(
#         [
#             '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
#             + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
#             + 'padding-left:5px;color:white">'
#             + metric
#             + "</div></div>"
#             for metric in TASK_TO_DEFAULT_METRICS[selected_task]
#         ]
#     )
#     st.markdown(html_string, unsafe_allow_html=True)
#     selected_metrics = st.multiselect(
#         "(Optional) Select additional metrics",
#         sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
#         help="""User-selected metrics will be computed with their default arguments. \
#             For example, `f1` will report results for binary labels. \
#             Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
#     )

# with st.form(key="form"):
#     compatible_models = get_compatible_models(selected_task, [selected_dataset])
#     selected_models = st.multiselect(
#         "Select the models you wish to evaluate",
#         compatible_models,
#         help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
#             [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
#     )
#     print("INFO -- Selected models before filter:", selected_models)

#     hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")

#     submit_button = st.form_submit_button("Evaluate models 🚀")

#     if submit_button:
#         if len(hf_username) == 0:
#             st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
#         elif len(selected_models) == 0:
#             st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
#         elif len(selected_models) > 10:
#             st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
#         else:
#             # Filter out previously evaluated models
#             selected_models = filter_evaluated_models(
#                 selected_models,
#                 selected_task,
#                 selected_dataset,
#                 selected_config,
#                 selected_split,
#                 selected_metrics,
#             )
#             print("INFO -- Selected models after filter:", selected_models)
#             if len(selected_models) > 0:
#                 project_payload = {
#                     "username": AUTOTRAIN_USERNAME,
#                     "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
#                     "task": TASK_TO_ID[selected_task],
#                     "config": {
#                         "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
#                         if selected_task in AUTOTRAIN_TASK_TO_LANG
#                         else "en",
#                         "max_models": 5,
#                         "instance": {
#                             "provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
#                             "instance_type": AUTOTRAIN_MACHINE[selected_task]
#                             if selected_task in AUTOTRAIN_MACHINE.keys()
#                             else "p3",
#                             "max_runtime_seconds": 172800,
#                             "num_instances": 1,
#                             "disk_size_gb": 200,
#                         },
#                         "evaluation": {
#                             "metrics": selected_metrics,
#                             "models": selected_models,
#                             "hf_username": hf_username,
#                         },
#                     },
#                 }
#                 print(f"INFO -- Payload: {project_payload}")
#                 project_json_resp = http_post(
#                     path="/projects/create",
#                     payload=project_payload,
#                     token=HF_TOKEN,
#                     domain=AUTOTRAIN_BACKEND_API,
#                 ).json()
#                 print(f"INFO -- Project creation response: {project_json_resp}")

#                 if project_json_resp["created"]:
#                     data_payload = {
#                         "split": 4,  # use "auto" split choice in AutoTrain
#                         "col_mapping": col_mapping,
#                         "load_config": {"max_size_bytes": 0, "shuffle": False},
#                         "dataset_id": selected_dataset,
#                         "dataset_config": selected_config,
#                         "dataset_split": selected_split,
#                     }
#                     data_json_resp = http_post(
#                         path=f"/projects/{project_json_resp['id']}/data/dataset",
#                         payload=data_payload,
#                         token=HF_TOKEN,
#                         domain=AUTOTRAIN_BACKEND_API,
#                     ).json()
#                     print(f"INFO -- Dataset creation response: {data_json_resp}")
#                     if data_json_resp["download_status"] == 1:
#                         train_json_resp = http_post(
#                             path=f"/projects/{project_json_resp['id']}/data/start_processing",
#                             token=HF_TOKEN,
#                             domain=AUTOTRAIN_BACKEND_API,
#                         ).json()
#                         # For local development we process and approve projects on-the-fly
#                         if "localhost" in AUTOTRAIN_BACKEND_API:
#                             with st.spinner("⏳ Waiting for data processing to complete ..."):
#                                 is_data_processing_success = False
#                                 while is_data_processing_success is not True:
#                                     project_status = http_get(
#                                         path=f"/projects/{project_json_resp['id']}",
#                                         token=HF_TOKEN,
#                                         domain=AUTOTRAIN_BACKEND_API,
#                                     ).json()
#                                     if project_status["status"] == 3:
#                                         is_data_processing_success = True
#                                     time.sleep(10)

#                             # Approve training job
#                             train_job_resp = http_post(
#                                 path=f"/projects/{project_json_resp['id']}/start_training",
#                                 token=HF_TOKEN,
#                                 domain=AUTOTRAIN_BACKEND_API,
#                             ).json()
#                             st.success("✅  Data processing and project approval complete - go forth and evaluate!")
#                         else:
#                             # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
#                             print(f"INFO -- AutoTrain job response: {train_json_resp}")
#                             if train_json_resp["success"]:
#                                 train_eval_index = {
#                                     "train-eval-index": [
#                                         {
#                                             "config": selected_config,
#                                             "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
#                                             "task_id": selected_task,
#                                             "splits": {"eval_split": selected_split},
#                                             "col_mapping": col_mapping,
#                                         }
#                                     ]
#                                 }
#                                 selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
#                                 dataset_card_url = get_dataset_card_url(selected_dataset)
#                                 st.success("✅ Successfully submitted evaluation job!")
#                                 st.markdown(
#                                     f"""
#                                 Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:

#                                 * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
#                                 * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
#                                 * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
#                                 """  # noqa
#                                 )
#                                 st.markdown(
#                                     f"""
#                                 ```yaml
#                                 {selected_metadata}
#                                 """
#                                 )
#                                 print("INFO -- Pushing evaluation job logs to the Hub")
#                                 evaluation_log = {}
#                                 evaluation_log["project_id"] = project_json_resp["id"]
#                                 evaluation_log["autotrain_env"] = (
#                                     "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
#                                 )
#                                 evaluation_log["payload"] = project_payload
#                                 evaluation_log["project_creation_response"] = project_json_resp
#                                 evaluation_log["dataset_creation_response"] = data_json_resp
#                                 evaluation_log["autotrain_job_response"] = train_json_resp
#                                 commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
#                             else:
#                                 st.error("🙈 Oh no, there was an error submitting your evaluation job!")
#             else:
#                 st.warning("⚠️ No models left to evaluate! Please select other models and try again.")