from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall
import evaluate
import gradio as gr
import pandas as pd
import numpy as np
title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"
description = """<p style='text-align: center'>
As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n
Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). The issue details how it is currently impossible to \
`evaluate.combine()` multiple metrics related to multilabel text classification. In particular, one cannot `combine` the `f1`, `precision`, and `recall` scores for \
evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was
created to address it; follow the link to view the source! To see each of these abstracted classes at work independently, view the 🤗 Space I've constructed for each:
[FixedF1📈](https://huggingface.co/spaces/MarioBarbeque/FixedF1), [FixedPrecision🎯](https://huggingface.co/spaces/MarioBarbeque/FixedPrecision),
[FixedRecall📉](https://huggingface.co/spaces/MarioBarbeque/FixedRecall).\n
This Space shows how one can instantiate these custom `evaluate.Metric`s, each with its own method for averaging across labels, before `combine`-ing them into an
HF `evaluate.CombinedEvaluations` object. From there, we can easily compute all of the metrics simultaneously using `compute`. \n
In general, one writes the following:\n
```python
f1 = FixedF1(average=...)
precision = FixedPrecision(average=...)
recall = FixedRecall(average=...)
combined = evaluate.combine([f1, precision, recall])
combined.add_batch(predictions=..., references=...)
combined.compute()
```\n
where the `average` parameter can differ for each metric at instantiation time. Acceptable values include `[None, 'micro', 'macro', 'weighted']` \
(or `'binary'` if there are only two labels). \n
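For instance, one might mix averaging strategies across the metrics (the values here simply mirror the pre-loaded example below):\n
```python
f1 = FixedF1(average="weighted")
precision = FixedPrecision(average="micro")
recall = FixedRecall(average="macro")
combined = evaluate.combine([f1, precision, recall])
```\n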
Try it out using the examples below! Then try picking different averaging methods yourself!
</p>
"""


def populate_map(metric_df: pd.DataFrame, metric_set: set) -> dict:
    """Map each requested metric name to the averaging method chosen for it."""
    metric_map = dict()
    for key in metric_set:
        # pull the averaging method from the row(s) whose "Metric" column matches this key
        for val in metric_df.loc[metric_df["Metric"] == key]["Averaging Method"]:
            metric_map[key] = val
    return metric_map
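
# Illustrative only (not executed at runtime): with
#     metrics_df = pd.DataFrame(columns=["Metric", "Averaging Method"],
#                               data=[["f1", "weighted"], ["recall", "macro"]])
#     metric_set = {"f1", "recall"}
# populate_map(metrics_df, metric_set) returns {"f1": "weighted", "recall": "macro"}.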


def evaluation(predictions_df: pd.DataFrame, metrics_df: pd.DataFrame) -> str:
    """Combine the user-selected metrics and compute them over the provided predictions."""
    metric_set = set(metrics_df["Metric"].to_list())
    metric_map = populate_map(metrics_df, metric_set)

    # instantiate only the requested metrics, each with its chosen averaging method
    combined_list = []
    if "f1" in metric_set:
        f1 = FixedF1(average=metric_map["f1"] if metric_map["f1"] != "None" else None)
        combined_list.append(f1)
    if "precision" in metric_set:
        precision = FixedPrecision(average=metric_map["precision"] if metric_map["precision"] != "None" else None, zero_division=np.nan)
        combined_list.append(precision)
    if "recall" in metric_set:
        recall = FixedRecall(average=metric_map["recall"] if metric_map["recall"] != "None" else None)
        combined_list.append(recall)
    combined = evaluate.combine(combined_list)

    # evaluate the combined metrics over the predicted vs actual class labels
    predicted = [int(num) for num in predictions_df["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions_df["Actual Class Label"].to_list()]
    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()
    return f"Your metrics are as follows: \n {outputs}"


space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            label="Table of Predicted vs Actual Class Labels"
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Method"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels"
        )
    ],
    outputs="text",
    title=title,
    description=description,
    examples=[
        [
            pd.DataFrame(columns=["Predicted Class Label", "Actual Class Label"], data=[[0, 1], [1, 1], [2, 2], [1, 0], [0, 0]]),
            pd.DataFrame(columns=["Metric", "Averaging Method"], data=[["f1", "weighted"], ["precision", "micro"], ["recall", "macro"]])
        ]
    ],
    cache_examples=False
).launch()