from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
import evaluate
import gradio as gr
import pandas as pd

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n

Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. Particularly, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was 
created to address this. \n

This Space shows how one can instantiate these custom `evaluate.Metric`s, each with their own unique methodology for averaging across labels, before `combine`-ing them into a  
HF `evaluate.CombinedEvaluations` object. From here, we can easily compute each of the metrics simultaneously using `compute`.</p>
"""


def populate_map(metric_df: pd.DataFrame, metric_set: set) -> dict:
    
    metric_map = dict()

    for key in metric_set:
        for val in metric_df.loc[metric_df["metric"] == key]["average"]:
            metric_map[key] = val
    
    return metric_map


def evaluation(predictions_df: pd.DataFrame, metrics_df: pd.DataFrame) -> str:

    metric_set = set(metrics_df["Metric"].to_list())
    metric_map = populate_map(metrics_df, metric_set)
    combined_list = []

    if "f1" in metric_set:
        f1 = FixedF1(average=metric_map["f1"])
        combined_list.append(f1)
    if "precision" in metric_set:
        precision = FixedPrecision(average=metric_map["f1"])
        combined_list.append(precision)
    if "recall" in metric_set:
        recall = FixedRecall(average=metric_map["f1"])
        combined_list.append(recall)
    
    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions_df["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions_df["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return f"Your metrics are as follows: \n {outputs}"


space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            label="Table of Predicted vs Actual Class Labels"
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels "
        )
    ],
    outputs="text",
    title=title,
    description=description,
    examples=[
        [
            pd.DataFrame(columns=["Predicted Class Label", "Actual Class Label"], data=[[0,1],[1,1],[2,2],[1,0],[0,0]]),
            pd.DataFrame(columns=["Metric", "Averaging Type"], data=[["f1", "weighted"],["precision", "micro"],["recall", "macro"]])
        ]
    ],
    cache_examples=False
).launch()