from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
import evaluate
import gradio as gr
import pandas as pd
import numpy as np

# Heading shown at the top of the Gradio interface.
title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

# Markdown body shown under the title. A trailing backslash joins two source
# lines without adding anything to the string; only the explicit `\n` escapes
# and the real line breaks around the fenced example end up in the text.
description = """

As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n \
Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. Particularly, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was created to address this - follow the link to view the source! \n \
This Space shows how one can instantiate these custom `evaluate.Metric`s, each with their own unique methodology for averaging across labels, before `combine`-ing them into a HF `evaluate.CombinedEvaluations` object. From here, we can easily compute each of the metrics simultaneously using `compute`. \n \
In general, one writes the following:\n
```python
f1 = FixedF1(average=...)
precision = FixedPrecision(average=...)
recall = FixedRecall(average=...)
combined = evaluate.combine([f1, precision, recall])
combined.add_batch(predictions=..., references=...)
combined.compute()
```\n
where the `average` parameter can be different at instantiation time for each of the metrics. Acceptable values include `[None, 'micro', 'macro', 'weighted']` ( or `binary` if there exist only two labels). \n \
Try it out using the examples below! Then try picking some various averaging methods yourself!

"""


def populate_map(metric_df: pd.DataFrame, metric_set: set) -> dict:
    """Map each requested metric name to its averaging method.

    Args:
        metric_df: Table with a "Metric" column and an "Averaging Method"
            column.
        metric_set: Metric names to include in the result.

    Returns:
        A dict from metric name to averaging method. When a metric name
        appears in several rows, the last row wins.
    """
    metric_map = {}
    # One pass over the paired columns; later rows overwrite earlier ones.
    for metric, method in zip(metric_df["Metric"], metric_df["Averaging Method"]):
        if metric in metric_set:
            metric_map[metric] = method
    return metric_map


def _resolve_average(method):
    """Turn the table's literal "None" text into a real ``None``."""
    return None if method == "None" else method


def evaluation(predictions_df: pd.DataFrame, metrics_df: pd.DataFrame) -> str:
    """Compute the requested metrics for the predicted and actual labels.

    Args:
        predictions_df: Table with "Predicted Class Label" and
            "Actual Class Label" columns; every cell is passed through
            ``int``.
        metrics_df: Table with "Metric" and "Averaging Method" columns. Only
            the metric names "f1", "precision" and "recall" are acted on; any
            other name is ignored.

    Returns:
        A message string embedding the dict returned by the combined
        ``compute()`` call.
    """
    metric_set = set(metrics_df["Metric"].to_list())
    metric_map = populate_map(metrics_df, metric_set)

    # Metrics are appended in a fixed order: f1, precision, recall.
    combined_list = []
    if "f1" in metric_set:
        combined_list.append(FixedF1(average=_resolve_average(metric_map["f1"])))
    if "precision" in metric_set:
        combined_list.append(
            FixedPrecision(
                average=_resolve_average(metric_map["precision"]),
                zero_division=np.nan,
            )
        )
    if "recall" in metric_set:
        combined_list.append(
            FixedRecall(average=_resolve_average(metric_map["recall"]))
        )

    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions_df["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions_df["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return f"Your metrics are as follows: \n {outputs}"


# NOTE(review): the interface is built and launched at import time, and
# `space` is bound to whatever `.launch()` returns rather than to the
# Interface itself. Presumably intended for a script entry point; confirm
# before importing this module elsewhere.
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            label="Table of Predicted vs Actual Class Labels",
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Method"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels ",
        ),
    ],
    outputs="text",
    title=title,
    description=description,
    examples=[
        [
            pd.DataFrame(
                columns=["Predicted Class Label", "Actual Class Label"],
                data=[[0, 1], [1, 1], [2, 2], [1, 0], [0, 0]],
            ),
            pd.DataFrame(
                columns=["Metric", "Averaging Method"],
                data=[["f1", "weighted"], ["precision", "micro"], ["recall", "macro"]],
            ),
        ]
    ],
    cache_examples=False,
).launch()