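# Gradio Space demonstrating a workaround for huggingface/evaluate issue #234:
# `evaluate.combine` cannot currently combine multilabel f1 / precision / recall metrics
# that each need their own `average` argument, so FixedF1, FixedPrecision, and FixedRecall
# carry the averaging method themselves.
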
from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
import evaluate
import gradio as gr
import pandas as pd
title = "'Combine' multiple metrics with this πŸ€— Evaluate πŸͺ² Fix!"
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n
Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). It details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was \
created to address this. \n
This Space shows how one can instantiate these custom `evaluate.Metric`s, each with its own averaging method across labels, before `combine`-ing them into an \
HF `evaluate.CombinedEvaluations` object. From there, we can easily compute all of the metrics simultaneously with a single call to `compute`.</p>
"""
article = """<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"""
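
# For context, the sketch below (a rough assumption, *not* the actual fixed_f1.py from
# https://github.com/johngrahamreynolds/FixedMetricsForHF) shows how a class like FixedF1
# can subclass `evaluate.Metric` so that `average` is fixed per instance at construction
# time and therefore survives `evaluate.combine`, which otherwise cannot route different
# `average` arguments to different metrics:
#
#   import datasets
#   import evaluate
#   from sklearn.metrics import f1_score
#
#   class FixedF1(evaluate.Metric):
#       def __init__(self, average="binary", **kwargs):
#           super().__init__(**kwargs)
#           self.average = average  # stored on the instance instead of passed to compute()
#
#       def _info(self):
#           return evaluate.MetricInfo(
#               description="F1 score with a fixed, instance-level `average` argument",
#               citation="",
#               inputs_description="",
#               features=datasets.Features(
#                   {"predictions": datasets.Value("int32"), "references": datasets.Value("int32")}
#               ),
#           )
#
#       def _compute(self, predictions, references):
#           return {"f1": f1_score(references, predictions, average=self.average)}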

def evaluation(predictions, metrics) -> str:
    """Combine the requested fixed metrics, then compute them over the provided predictions."""
    metric_set = set(metrics["Metric"].to_list())
    combined_list = []

    # Instantiate each requested metric with the averaging method chosen for it
    if "f1" in metric_set:
        f1 = FixedF1(average=metrics.loc[metrics["Metric"] == "f1", "Averaging Type"].iloc[0])
        combined_list.append(f1)
    if "precision" in metric_set:
        precision = FixedPrecision(average=metrics.loc[metrics["Metric"] == "precision", "Averaging Type"].iloc[0])
        combined_list.append(precision)
    if "recall" in metric_set:
        recall = FixedRecall(average=metrics.loc[metrics["Metric"] == "recall", "Averaging Type"].iloc[0])
        combined_list.append(recall)

    # Combine the instantiated metrics into a single CombinedEvaluations object
    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    # `compute` returns a dict of metric names to scores; stringify it for the text output
    return "Your metrics are as follows: \n" + str(outputs)
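
# Build the Space: the two Dataframe inputs (predicted vs. actual class labels, and the
# metric / averaging-type configuration) are passed directly to `evaluation` above.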
space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            label="Table of Predicted vs Actual Class Labels",
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Type"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels",
        ),
    ],
    outputs="text",
    title=title,
    description=description,
    article=article,
    examples=[
        [
            [[1, 1], [1, 0], [2, 0], [1, 2], [2, 2]],
            [["f1", "weighted"], ["precision", "micro"], ["recall", "weighted"]],
        ]
    ],
    cache_examples=False,
).launch()