Spaces:

wadood
/

ner_evaluation_metrics

Sleeping

App Files Files Community

wadood committed on Jul 20, 2024

Commit

19d19d4

1 Parent(s): 1c461bc

added exact span eval metric

Browse files

Files changed (4) hide show

.gitignore +1 -0
app.py +11 -8
evaluation_metrics.py +84 -42
predefined_example.py +4 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ from annotated_text import annotated_text
 from annotated_text.util import get_annotated_html
 from streamlit_annotation_tools import text_labeler
-from evaluation_metrics import EVALUATION_METRICS, get_evaluation_metric
 from predefined_example import EXAMPLES
 from span_dataclass_converters import (
     get_highlight_spans_from_ner_spans,
@@ -20,12 +20,13 @@ def get_examples_attributes(selected_example):
         selected_example.gt_labels,
         selected_example.gt_spans,
         selected_example.predictions,
     )
 if __name__ == "__main__":
     st.set_page_config(layout="wide")
-    st.title("NER Evaluation Metrics Comparison")
     st.write(
         "Evaluation for the NER task requires a ground truth and a prediction that will be evaluated. The ground truth is shown below, add predictions in the next section to compare the evaluation metrics."
@@ -40,7 +41,9 @@ if __name__ == "__main__":
         format_func=lambda ex: ex.text,
     )
-    text, gt_labels, gt_spans, predictions = get_examples_attributes(selected_example)
     annotated_text(
         get_highlight_spans_from_ner_spans(
@@ -116,17 +119,17 @@ Add predictions to the list of predictions on which the evaluation metric will b
         st.write(predictions_df.to_html(escape=False), unsafe_allow_html=True)
     if st.button("Get Metrics!"):
-        for evaluation_metric_type in EVALUATION_METRICS:
-            predictions_df[evaluation_metric_type] = predictions_df.ner_spans.apply(
-                lambda ner_spans: get_evaluation_metric(
-                    metric_type=evaluation_metric_type,
                     gt_ner_span=gt_spans,
                     pred_ner_span=ner_spans,
                     text=text,
                 )
             )
         metrics_df = predictions_df.drop(["ner_spans"], axis=1)
         st.write(metrics_df.to_html(escape=False), unsafe_allow_html=True)
-        print("compared")

 from annotated_text.util import get_annotated_html
 from streamlit_annotation_tools import text_labeler
+from evaluation_metrics import EVALUATION_METRICS
 from predefined_example import EXAMPLES
 from span_dataclass_converters import (
     get_highlight_spans_from_ner_spans,
         selected_example.gt_labels,
         selected_example.gt_spans,
         selected_example.predictions,
+        selected_example.tags,
     )
 if __name__ == "__main__":
     st.set_page_config(layout="wide")
+    st.title("NER Metrics Comparison")
     st.write(
         "Evaluation for the NER task requires a ground truth and a prediction that will be evaluated. The ground truth is shown below, add predictions in the next section to compare the evaluation metrics."
         format_func=lambda ex: ex.text,
     )
+    text, gt_labels, gt_spans, predictions, tags = get_examples_attributes(
+        selected_example
+    )
     annotated_text(
         get_highlight_spans_from_ner_spans(
         st.write(predictions_df.to_html(escape=False), unsafe_allow_html=True)
     if st.button("Get Metrics!"):
+        for evaluation_metric in EVALUATION_METRICS:
+            predictions_df[evaluation_metric.name] = predictions_df.ner_spans.apply(
+                lambda ner_spans: evaluation_metric.get_evaluation_metric(
+                    # metric_type=evaluation_metric_type,
                     gt_ner_span=gt_spans,
                     pred_ner_span=ner_spans,
                     text=text,
+                    tags=tags,
                 )
             )
         metrics_df = predictions_df.drop(["ner_spans"], axis=1)
         st.write(metrics_df.to_html(escape=False), unsafe_allow_html=True)

evaluation_metrics.py CHANGED Viewed

@@ -1,49 +1,91 @@
 from nervaluate import Evaluator
 from sklearn.metrics import classification_report
 from token_level_output import get_token_output_labels
-EVALUATION_METRICS = [
-    "Span Based Evaluation with Partial Overlap",
-    "Token Based Evaluation with Micro Avg",
-    "Token Based Evaluation with Macro Avg",
-]
-def get_span_eval(gt_ner_span, pred_ner_span, text):
-    evaluator = Evaluator([gt_ner_span], [pred_ner_span], tags=["Disease", "Drug"])
-    return round(evaluator.evaluate()[0]["ent_type"]["f1"], 2)
-def get_token_micro_eval(gt_ner_span, pred_ner_span, text):
-    return round(
-        classification_report(
-            get_token_output_labels(gt_ner_span, text),
-            get_token_output_labels(pred_ner_span, text),
-            labels=["Disease", "Drug"],
-            output_dict=True,
-        )["micro avg"]["f1-score"],
-        2,
-    )
-def get_token_macro_eval(gt_ner_span, pred_ner_span, text):
-    return round(
-        classification_report(
-            get_token_output_labels(gt_ner_span, text),
-            get_token_output_labels(pred_ner_span, text),
-            labels=["Disease", "Drug"],
-            output_dict=True,
-        )["macro avg"]["f1-score"],
-        2,
-    )
-def get_evaluation_metric(metric_type, gt_ner_span, pred_ner_span, text):
-    match metric_type:
-        case "Span Based Evaluation with Partial Overlap":
-            return get_span_eval(gt_ner_span, pred_ner_span, text)
-        case "Token Based Evaluation with Micro Avg":
-            return get_token_micro_eval(gt_ner_span, pred_ner_span, text)
-        case "Token Based Evaluation with Macro Avg":
-            return get_token_macro_eval(gt_ner_span, pred_ner_span, text)

+from abc import ABC, abstractmethod
 from nervaluate import Evaluator
 from sklearn.metrics import classification_report
 from token_level_output import get_token_output_labels
+class EvaluationMetric(ABC):
+    """Base class defining the attributes & methods of an evaluation metric"""
+    name: str
+    description: str
+    @abstractmethod
+    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
+        pass
+class PartialSpanOverlapMetric(EvaluationMetric):
+    def __init__(self) -> None:
+        super().__init__()
+        self.name = "Span Based Evaluation with Partial Overlap"
+        self.description = ""
+    @staticmethod
+    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
+        evaluator = Evaluator([gt_ner_span], [pred_ner_span], tags=tags)
+        return round(evaluator.evaluate()[0]["ent_type"]["f1"], 2)
+class ExactSpanOverlapMetric(EvaluationMetric):
+    def __init__(self) -> None:
+        super().__init__()
+        self.name = "Span Based Evaluation with Exact Overlap"
+        self.description = ""
+    @staticmethod
+    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
+        evaluator = Evaluator([gt_ner_span], [pred_ner_span], tags=tags)
+        return round(evaluator.evaluate()[0]["strict"]["f1"], 2)
+class TokenMicroMetric(EvaluationMetric):
+    """Token-level F1 metric that reports the micro average over ``tags``."""
+    def __init__(self) -> None:
+        super().__init__()
+        # This metric is computed on token labels, so it is labelled
+        # "Token Based" (matching the macro-average token metric); the name is
+        # used by the app as the column header for the results table.
+        self.name = "Token Based Evaluation with Micro Average"
+        self.description = ""
+    @staticmethod
+    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
+        """Return the micro-averaged token-level F1 score, rounded to 2 places.
+
+        Args:
+            gt_ner_span: Ground-truth NER spans for ``text``.
+            pred_ner_span: Predicted NER spans for ``text``.
+            text: The text both span lists refer to.
+            tags: Labels passed to ``classification_report`` as ``labels``.
+
+        Returns:
+            The ``"f1-score"`` of the ``"micro avg"`` entry of the report.
+        """
+        # Both span lists are converted to token-level labels over the same
+        # text before being compared (presumably one label per token; see
+        # get_token_output_labels to confirm).
+        return round(
+            classification_report(
+                get_token_output_labels(gt_ner_span, text),
+                get_token_output_labels(pred_ner_span, text),
+                labels=tags,
+                output_dict=True,
+            )["micro avg"]["f1-score"],
+            2,
+        )
+class TokenMacroMetric(EvaluationMetric):
+    def __init__(self) -> None:
+        super().__init__()
+        self.name = "Token Based Evaluation with Macro Average"
+        self.description = ""
+    @staticmethod
+    def get_evaluation_metric(gt_ner_span, pred_ner_span, text, tags) -> float:
+        return round(
+            classification_report(
+                get_token_output_labels(gt_ner_span, text),
+                get_token_output_labels(pred_ner_span, text),
+                labels=tags,
+                output_dict=True,
+            )["macro avg"]["f1-score"],
+            2,
+        )
+EVALUATION_METRICS = [
+    PartialSpanOverlapMetric(),
+    ExactSpanOverlapMetric(),
+    TokenMicroMetric(),
+    TokenMacroMetric(),
+]

predefined_example.py CHANGED Viewed

@@ -21,6 +21,10 @@ class PredefinedExample:
     def predictions(self):
         return [self.gt_spans]
 small_example = PredefinedExample(
     text="The patient was diagnosed with bronchitis and was prescribed a mucolytic",

     def predictions(self):
         return [self.gt_spans]
+    @property
+    def tags(self):
+        return list(self.gt_labels.keys())
 small_example = PredefinedExample(
     text="The patient was diagnosed with bronchitis and was prescribed a mucolytic",