from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
import evaluate
import gradio as gr
import pandas as pd

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

description = """

As I introduce myself to the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to `evaluate.combine()` multiple metrics for multilabel text classification; in particular, one cannot `combine()` the f1, precision, and recall scores for evaluation. I encountered this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel text classification of 805 labeled medical conditions based on drug reviews. \n This Space shows how one can instantiate these custom metrics, each with its own methodology for averaging across labels, combine them into a single HF `evaluate.EvaluationModule` (or `Metric`), and compute them.
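\n For instance, with the fixed metric classes housed in this repo, the combined computation works roughly like the following sketch (the averaging strategies shown are illustrative choices, not requirements):

```python
from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
import evaluate

# each fixed metric carries its own averaging strategy across the labels
f1 = FixedF1(average="weighted")
precision = FixedPrecision(average="micro")
recall = FixedRecall(average="macro")

combined = evaluate.combine([f1, precision, recall])
combined.add_batch(predictions=[0, 1, 2, 1], references=[0, 1, 1, 1])
print(combined.compute())  # {'f1': ..., 'precision': ..., 'recall': ...}
```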

""" article = "

Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.

" def evaluation(predictions, metrics) -> str: f1 = FixedF1(average=metrics["f1"]) precision = FixedPrecision(average=metrics["precision"]) recall = FixedRecall(average=metrics["recall"]) combined = evaluate.combine([f1, recall, precision]) df = predictions.get_dataframe() predicted = df["Predicted Label"].to_list() references = df["Actual Label"].to_list() combined.add_batch(prediction=predicted, reference=references) outputs = combined.compute() return "Your metrics are as follows: \n" + outputs # gr.Interface( # fn=show_off, # inputs=gr.Dataframe(type="array", datatype="number", row_count=5, col_count=1), # outputs="text", # title=title, # description=description, # article=article, # examples=[pd.DataFrame([1, 0, 2, 0, 1])], # cache_examples=False # ).launch() # use this to create examples # data = {'Name':['Tony', 'Steve', 'Bruce', 'Peter' ], # 'Age': [35, 70, 45, 20] } # # Creating DataFrame # df = pd.DataFrame(data) def filter_records(records, gender): return records[records["gender"] == gender] space = gr.Interface( fn=evaluation, inputs=[ gr.Dataframe( headers=["Predicted Label", "Actual Label"], datatype=["number", "number"], row_count=5, col_count=(2, "fixed"), ), gr.Dataframe( headers=["Metric", "Averaging Type"], datatype=["str", "str"], row_count=3, col_count=(2, "fixed"), ) ], outputs="textbox", title=title, description=description, article=article, cache_examples=False ).launch()