import evaluate
import gradio as gr
import pandas as pd

from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

description = """
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n
Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. \n
This Space shows how one can instantiate these custom `evaluate.Metric`s, each with its own methodology for averaging across labels, before `combine`-ing them into a \
Hugging Face `evaluate.CombinedEvaluations` object. From there, we can easily compute all of the metrics simultaneously with a single call to `compute`.
""" article = """Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \ trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.
""" def evaluation(predictions, metrics) -> str: metric_set = set(metrics["Metric"].to_list()) combined_list = [] if "f1" in metric_set: f1 = FixedF1(average=metrics.loc[metrics["Metric"] == "f1"]["Averaging Type"][0]) combined_list.append(f1) if "precision" in metric_set: precision = FixedPrecision(average=metrics.loc[metrics["Metric"] == "precision"]["Averaging Type"][0]) combined_list.append(precision) if "recall" in metric_set: recall = FixedRecall(average=metrics.loc[metrics["Metric"] == "recall"]["Averaging Type"][0]) combined_list.append(recall) combined = evaluate.combine(combined_list) df = predictions.get_dataframe() predicted = df["Predicted Label"].to_list() references = df["Actual Label"].to_list() combined.add_batch(prediction=predicted, reference=references) outputs = combined.compute() return "Your metrics are as follows: \n" + outputs # gr.Interface( # fn=show_off, # inputs=gr.Dataframe(type="array", datatype="number", row_count=5, col_count=1), # outputs="text", # title=title, # description=description, # article=article, # examples=[pd.DataFrame([1, 0, 2, 0, 1])], # cache_examples=False # ).launch() # use this to create examples # data = {'Name':['Tony', 'Steve', 'Bruce', 'Peter' ], # 'Age': [35, 70, 45, 20] } # # Creating DataFrame # df = pd.DataFrame(data) def filter_records(records, gender): return records[records["gender"] == gender] space = gr.Interface( fn=evaluation, inputs=[ gr.Dataframe( headers=["Predicted Label", "Actual Label"], datatype=["number", "number"], row_count=5, col_count=(2, "fixed"), ), gr.Dataframe( headers=["Metric", "Averaging Type"], datatype=["str", "str"], row_count=3, col_count=(2, "fixed"), ) ], outputs="textbox", title=title, description=description, article=article, cache_examples=False ).launch()