import evaluate
import gradio as gr
import pandas as pd

from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

description = """
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n
Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. \n
This Space shows how one can instantiate these custom `evaluate.Metric`s, each with its own methodology for averaging across labels, before `combine`-ing them into a \
Hugging Face `evaluate.CombinedEvaluations` object. From there, we can easily compute all of the metrics simultaneously with a single call to `compute`.
""" article = """Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \ trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.
""" def evaluation(predictions, metrics) -> str: metric_set = set(metrics["Metric"].to_list()) combined_list = [] if "f1" in metric_set: f1 = FixedF1(average=metrics.loc[metrics["Metric"] == "f1"]["Averaging Type"][0]) combined_list.append(f1) if "precision" in metric_set: precision = FixedPrecision(average=metrics.loc[metrics["Metric"] == "precision"]["Averaging Type"][0]) combined_list.append(precision) if "recall" in metric_set: recall = FixedRecall(average=metrics.loc[metrics["Metric"] == "recall"]["Averaging Type"][0]) combined_list.append(recall) combined = evaluate.combine(combined_list) df = predictions.get_dataframe() predicted = df["Predicted Label"].to_list() references = df["Actual Label"].to_list() combined.add_batch(prediction=predicted, reference=references) outputs = combined.compute() return "Your metrics are as follows: \n" + outputs # gr.Interface( # fn=show_off, # inputs=gr.Dataframe(type="array", datatype="number", row_count=5, col_count=1), # outputs="text", # title=title, # description=description, # article=article, # examples=[pd.DataFrame([1, 0, 2, 0, 1])], # cache_examples=False # ).launch() # use this to create examples # data = {'Name':['Tony', 'Steve', 'Bruce', 'Peter' ], # 'Age': [35, 70, 45, 20] } # # Creating DataFrame # df = pd.DataFrame(data) def filter_records(records, gender): return records[records["gender"] == gender] space = gr.Interface( fn=evaluation, inputs=[ gr.Dataframe( headers=["Predicted Label", "Actual Label"], datatype=["number", "number"], row_count=5, col_count=(2, "fixed"), ), gr.Dataframe( headers=["Metric", "Averaging Type"], datatype=["str", "str"], row_count=3, col_count=(2, "fixed"), ) ], outputs="textbox", title=title, description=description, article=article, cache_examples=False ).launch()