John Graham Reynolds committed
Commit 9b29e93 • 1 Parent(s): a9c64ab

took notes from demos for another attempt

Files changed (1)
  1. app.py +43 -24
app.py CHANGED
@@ -8,34 +8,35 @@ import pandas as pd
 title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"
 
 description = """<p style='text-align: center'>
-As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n
+As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n
 
-Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \
+Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \
 'evaluate.combine()' multiple metrics related to multilabel text classification. Particularly, one cannot 'combine()' the f1, precision, and recall scores for \
 evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \
 text classification of 805 labeled medical conditions based on drug reviews. \n
 
-Try to use \t to write some code? \t or how does that work? </p>
-
-
+This Space shows how one can instantiate these custom metrics each with their own unique methodology for averaging across labels, combine them into a single
+HF `evaluate.EvaluationModule` (or `Metric`), and compute them.</p>
 """
 
-article = "<p style='text-align: center'> Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
+article = "<p style='text-align: center'>Check out the [original repo](https://github.com/johngrahamreynolds/FixedMetricsForHF) housing this code, and a quickly \
 trained [multilabel text classification model](https://github.com/johngrahamreynolds/RoBERTa-base-DReiFT/tree/main) that makes use of it during evaluation.</p>"
 
-# def show_off(predictions: list[list]) -> str:
+def evaluation(predictions, metrics) -> str:
 
-# # f1 = FixedF1(average=weighting_map["f1"])
-# # precision = FixedPrecision(average=weighting_map["precision"])
-# # recall = FixedRecall(average=weighting_map["recall"])
+    f1 = FixedF1(average=metrics["f1"])
+    precision = FixedPrecision(average=metrics["precision"])
+    recall = FixedRecall(average=metrics["recall"])
+    combined = evaluate.combine([f1, recall, precision])
 
-# # combined = evaluate.combine([f1, recall, precision])
+    df = predictions.get_dataframe()
+    predicted = df["Predicted Label"].to_list()
+    references = df["Actual Label"].to_list()
 
-# # combined.add_batch(prediction=predictions, reference=references)
-# # outputs = combined.compute()
-# outputs = predictions
+    combined.add_batch(prediction=predicted, reference=references)
+    outputs = combined.compute()
 
-# return "Your metrics are as follows: \n" + outputs
+    return "Your metrics are as follows: \n" + outputs
 
 
 # gr.Interface(
@@ -49,20 +50,38 @@ trained [multilabel text classification model](https://github.com/johngrahamreyn
 # cache_examples=False
 # ).launch()
 
+# use this to create examples
+
+# data = {'Name':['Tony', 'Steve', 'Bruce', 'Peter' ],
+#         'Age': [35, 70, 45, 20] }
+
+# # Creating DataFrame
+# df = pd.DataFrame(data)
+
+
+
 def filter_records(records, gender):
     return records[records["gender"] == gender]
 
-demo = gr.Interface(
-    filter_records,
-    [
+space = gr.Interface(
+    fn=evaluation,
+    inputs=[
         gr.Dataframe(
-            headers=["name", "age", "gender"],
-            datatype=["str", "number", "str"],
+            headers=["Predicted Label", "Actual Label"],
+            datatype=["number", "number"],
             row_count=5,
-            col_count=(3, "fixed"),
+            col_count=(2, "fixed"),
         ),
-        gr.Dropdown(["M", "F", "O"]),
+        gr.Dataframe(
+            headers=["Metric", "Averaging Type"],
+            datatype=["str", "str"],
+            row_count=3,
+            col_count=(2, "fixed"),
+        )
     ],
-    "dataframe",
-    description="Enter gender as 'M', 'F', or 'O' for other.",
+    outputs="textbox",
+    title=title,
+    description=description,
+    article=article,
+    cache_examples=False
 ).launch()
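
For context on the change above, here is a minimal usage sketch of the pattern the new `evaluation` function builds on. It assumes the `FixedF1`, `FixedPrecision`, and `FixedRecall` classes from the FixedMetricsForHF repo linked in the description are importable (the module paths below are hypothetical stand-ins), and it uses `evaluate`'s standard `add_batch(predictions=..., references=...)` keywords.

```python
# Minimal sketch, not the Space's code verbatim: the Fixed* import paths are
# hypothetical stand-ins for the classes defined in the FixedMetricsForHF repo.
import evaluate
from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall

# Each fixed metric is instantiated with its own averaging strategy across labels.
f1 = FixedF1(average="weighted")
precision = FixedPrecision(average="micro")
recall = FixedRecall(average="macro")

# Combine the three metrics into a single evaluate module and compute them together.
combined = evaluate.combine([f1, recall, precision])
combined.add_batch(
    predictions=[0, 1, 1, 2],  # toy predicted label ids
    references=[0, 1, 2, 2],   # toy reference label ids
)
print(combined.compute())  # e.g. {'f1': ..., 'recall': ..., 'precision': ...}
```

Configuring a separate `average` per metric before combining is the behavior that, per the issue linked in the description, a plain `evaluate.combine()` of the stock f1, precision, and recall metrics does not support for multilabel classification.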