John Graham Reynolds committed
Commit • 9b29e93 • 1 Parent(s): a9c64ab
took notes from demos for another attempt
app.py
CHANGED
@@ -8,34 +8,35 @@ import pandas as pd
(Removed lines, truncated in the original side-by-side view: an earlier, shorter draft of the description and article strings, a commented-out `# outputs = predictions` line, and an earlier version of the interface definition with a `gr.Dataframe(...)` input block ending in `).launch()`.)
  title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

  description = """<p style='text-align: center'>
+ As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n

+ Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \
  'evaluate.combine()' multiple metrics related to multilabel text classification. Particularly, one cannot 'combine()' the f1, precision, and recall scores for \
  evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
  text classification of 805 labeled medical conditions based on drug reviews. \n

+ This Space shows how one can instantiate these custom metrics each with their own unique methodology for averaging across labels, combine them into a single
+ HF `evaluate.EvaluationModule` (or `Metric`), and compute them.</p>
  """

+ article = "<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
  trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"

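The `FixedF1`, `FixedPrecision`, and `FixedRecall` classes used further down are not defined in this file; they come from the repo linked in `article`. As a rough illustration of the idea the description refers to, a metric of that kind could be sketched roughly as follows, assuming the point is simply to pin the `average` strategy at construction time so that `evaluate.combine()` needs no per-metric keyword when computing. The class name `SketchedF1` and its internals are hypothetical, not the repo's actual implementation:

```python
import datasets
import evaluate
from sklearn.metrics import f1_score


class SketchedF1(evaluate.Metric):
    """F1 whose averaging strategy is fixed when the metric is instantiated."""

    def __init__(self, average="weighted", **kwargs):
        super().__init__(**kwargs)
        self.average = average  # remembered here instead of being passed to compute()

    def _info(self):
        return evaluate.MetricInfo(
            description="F1 score with a pre-set `average` argument",
            citation="",
            inputs_description="predicted and reference label ids",
            features=datasets.Features(
                {"predictions": datasets.Value("int32"), "references": datasets.Value("int32")}
            ),
        )

    def _compute(self, predictions, references):
        # sklearn does the actual work; `self.average` supplies the pinned strategy
        return {"f1": f1_score(references, predictions, average=self.average)}
```

Because the averaging choice lives on each instance, a list of such modules can be handed to `evaluate.combine()` and computed together without conflicting keyword arguments.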
+ def evaluation(predictions, metrics) -> str:

+     f1 = FixedF1(average=metrics["f1"])
+     precision = FixedPrecision(average=metrics["precision"])
+     recall = FixedRecall(average=metrics["recall"])
+     combined = evaluate.combine([f1, recall, precision])

+     df = predictions.get_dataframe()
+     predicted = df["Predicted Label"].to_list()
+     references = df["Actual Label"].to_list()

+     combined.add_batch(predictions=predicted, references=references)
+     outputs = combined.compute()

+     return "Your metrics are as follows: \n" + str(outputs)

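Note that the second `gr.Dataframe` input defined further down delivers a table with "Metric" and "Averaging Type" columns rather than a dict keyed by metric name, and with Gradio's default `type="pandas"` the first input already arrives as a `pandas.DataFrame`. A small, hypothetical adapter along these lines could bridge that gap; it is not part of this commit:

```python
import pandas as pd

def metrics_table_to_dict(metrics_df: pd.DataFrame) -> dict:
    """Turn rows such as ("f1", "weighted") into {"f1": "weighted"}."""
    return dict(zip(metrics_df["Metric"].str.lower(), metrics_df["Averaging Type"]))

# e.g. inside evaluation():  metrics = metrics_table_to_dict(metrics)
#      and, if the component passes a DataFrame directly:  df = predictions
```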

  # gr.Interface(
@@ -49,20 +50,38 @@ trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"
  # cache_examples=False
  # ).launch()

+ # use this to create examples

+ # data = {'Name':['Tony', 'Steve', 'Bruce', 'Peter' ],
+ #         'Age': [35, 70, 45, 20] }

+ # # Creating DataFrame
+ # df = pd.DataFrame(data)

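If examples are eventually added, they would need to match the two input tables defined below. A hypothetical pair of example DataFrames (values invented purely for illustration) might look like:

```python
import pandas as pd

# One example table per input component of the interface below
example_predictions = pd.DataFrame(
    {"Predicted Label": [0, 1, 2, 1, 0], "Actual Label": [0, 1, 1, 1, 0]}
)
example_metrics = pd.DataFrame(
    {"Metric": ["f1", "precision", "recall"],
     "Averaging Type": ["weighted", "micro", "macro"]}
)
# these could then be passed to gr.Interface via examples=[[example_predictions, example_metrics]]
```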
  def filter_records(records, gender):
      return records[records["gender"] == gender]

+ space = gr.Interface(
+     fn=evaluation,
+     inputs=[
          gr.Dataframe(
+             headers=["Predicted Label", "Actual Label"],
+             datatype=["number", "number"],
              row_count=5,
+             col_count=(2, "fixed"),
          ),
+         gr.Dataframe(
+             headers=["Metric", "Averaging Type"],
+             datatype=["str", "str"],
+             row_count=3,
+             col_count=(2, "fixed"),
+         )
      ],
+     outputs="textbox",
+     title=title,
+     description=description,
+     article=article,
+     cache_examples=False
  ).launch()
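For a quick check of the metric-combining fix itself, outside of the Gradio UI, the combined module can also be exercised directly, mirroring the `add_batch`/`compute` flow used in `evaluation()` above. The label values and averaging choices below are made up for illustration:

```python
# Assumes FixedF1, FixedPrecision, FixedRecall are importable from the linked repo
f1 = FixedF1(average="weighted")
precision = FixedPrecision(average="micro")
recall = FixedRecall(average="macro")

combined = evaluate.combine([f1, precision, recall])
combined.add_batch(predictions=[0, 1, 1, 2], references=[0, 1, 2, 2])
print(combined.compute())  # a single dict holding all three scores
```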
|