wadood committed on
Commit
44921ac
1 Parent(s): 4bdf1ab

init working commit

README.md CHANGED
@@ -1,12 +0,0 @@
- ---
- title: Ner Evaluation Metrics
- emoji: 👁
- colorFrom: purple
- colorTo: green
- sdk: streamlit
- sdk_version: 1.36.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,132 @@
import pandas as pd
import streamlit as st
from annotated_text import annotated_text
from annotated_text.util import get_annotated_html
from streamlit_annotation_tools import text_labeler

from evaluation_metrics import EVALUATION_METRICS, get_evaluation_metric
from predefined_example import EXAMPLES
from span_dataclass_converters import (
    get_highlight_spans_from_ner_spans,
    get_ner_spans_from_annotations,
)


@st.cache_resource
def get_examples_attributes(selected_example):
    "Return example attributes so that they are not refreshed on every interaction"
    return (
        selected_example.text,
        selected_example.gt_labels,
        selected_example.gt_spans,
        selected_example.predictions,
    )


if __name__ == "__main__":
    st.set_page_config(layout="wide")
    st.title("NER Evaluation Metrics Comparison")

    st.write(
        "Evaluation for the NER task requires a ground truth and a prediction to evaluate against it. The ground truth is shown below; add predictions in the next section to compare the evaluation metrics."
    )

    # with st.container():
    st.subheader("Ground Truth")  # , divider='rainbow')

    selected_example = st.selectbox(
        "Select an example text from the drop down below",
        [example for example in EXAMPLES],
        format_func=lambda ex: ex.text,
    )

    text, gt_labels, gt_spans, predictions = get_examples_attributes(selected_example)

    annotated_text(
        get_highlight_spans_from_ner_spans(
            get_ner_spans_from_annotations(gt_labels), text
        )
    )

    annotated_predictions = [
        get_annotated_html(get_highlight_spans_from_ner_spans(ner_span, text))
        for ner_span in predictions
    ]
    predictions_df = pd.DataFrame(
        {
            # "ID": [f"Prediction_{index}" for index in range(len(predictions))],
            "Prediction": annotated_predictions,
            "ner_spans": predictions,
        },
        index=[f"Prediction_{index}" for index in range(len(predictions))],
    )

    st.subheader("Predictions")  # , divider='rainbow')

    with st.expander("Click to Add Predictions"):
        st.subheader("Adding predictions")
        st.markdown(
            """
Add predictions to the list of predictions on which the evaluation metric will be calculated.
- Select the entity type/label name and then highlight the span in the text below.
- To remove a span, double click on the highlighted text.
- Once you have your desired prediction, click on the 'Add' button. (The prediction created is shown in a json below.)
"""
        )
        st.write(
            "Note: Only the spans of the selected label name are shown at a given instance.",
        )
        labels = text_labeler(text, gt_labels)
        st.json(labels, expanded=False)

        # if st.button("Add Prediction"):
        #     labels = text_labeler(text)
        if st.button("Add!"):
            spans = get_ner_spans_from_annotations(labels)
            spans = sorted(spans, key=lambda span: span["start"])
            predictions.append(spans)
            annotated_predictions.append(
                get_annotated_html(get_highlight_spans_from_ner_spans(spans, text))
            )
            predictions_df = pd.DataFrame(
                {
                    # "ID": [f"Prediction_{index}" for index in range(len(predictions))],
                    "Prediction": annotated_predictions,
                    "ner_spans": predictions,
                },
                index=[f"Prediction_{index}" for index in range(len(predictions))],
            )
            print("added")

    highlighted_predictions_df = predictions_df[["Prediction"]]
    st.write(highlighted_predictions_df.to_html(escape=False), unsafe_allow_html=True)
    st.divider()

    ### EVALUATION METRICS COMPARISON ###

    st.subheader("Evaluation Metrics Comparison")  # , divider='rainbow')
    st.markdown("""
The different evaluation metrics we have for the NER task are
- Span Based Evaluation with Partial Overlap
- Token Based Evaluation with Micro Avg
- Token Based Evaluation with Macro Avg
""")

    with st.expander("View Predictions Details"):
        st.write(predictions_df.to_html(escape=False), unsafe_allow_html=True)

    if st.button("Get Metrics!"):
        for evaluation_metric_type in EVALUATION_METRICS:
            predictions_df[evaluation_metric_type] = predictions_df.ner_spans.apply(
                lambda ner_spans: get_evaluation_metric(
                    metric_type=evaluation_metric_type,
                    gt_ner_span=gt_spans,
                    pred_ner_span=ner_spans,
                    text=text,
                )
            )

        metrics_df = predictions_df.drop(["ner_spans"], axis=1)

        st.write(metrics_df.to_html(escape=False), unsafe_allow_html=True)
        print("compared")
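Not part of the commit, but as a quick orientation to the modules below: the metric loop in app.py can also be exercised headlessly against the bundled examples. A minimal sketch, assuming the files from this commit are importable from the working directory:

# Sketch: score one prediction against the ground truth of the first example.
from evaluation_metrics import EVALUATION_METRICS, get_evaluation_metric
from predefined_example import EXAMPLES

example = EXAMPLES[0]
prediction = example.predictions[0]  # the ground truth replayed as a prediction

for metric_type in EVALUATION_METRICS:
    score = get_evaluation_metric(
        metric_type=metric_type,
        gt_ner_span=example.gt_spans,
        pred_ner_span=prediction,
        text=example.text,
    )
    print(metric_type, score)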
evaluation_metrics.py ADDED
@@ -0,0 +1,49 @@
from nervaluate import Evaluator
from sklearn.metrics import classification_report

from token_level_output import get_token_output_labels

EVALUATION_METRICS = [
    "Span Based Evaluation with Partial Overlap",
    "Token Based Evaluation with Micro Avg",
    "Token Based Evaluation with Macro Avg",
]


def get_span_eval(gt_ner_span, pred_ner_span, text):
    evaluator = Evaluator([gt_ner_span], [pred_ner_span], tags=["Disease", "Drug"])
    return round(evaluator.evaluate()[0]["ent_type"]["f1"], 2)


def get_token_micro_eval(gt_ner_span, pred_ner_span, text):
    return round(
        classification_report(
            get_token_output_labels(gt_ner_span, text),
            get_token_output_labels(pred_ner_span, text),
            labels=["Disease", "Drug"],
            output_dict=True,
        )["micro avg"]["f1-score"],
        2,
    )


def get_token_macro_eval(gt_ner_span, pred_ner_span, text):
    return round(
        classification_report(
            get_token_output_labels(gt_ner_span, text),
            get_token_output_labels(pred_ner_span, text),
            labels=["Disease", "Drug"],
            output_dict=True,
        )["macro avg"]["f1-score"],
        2,
    )


def get_evaluation_metric(metric_type, gt_ner_span, pred_ner_span, text):
    match metric_type:
        case "Span Based Evaluation with Partial Overlap":
            return get_span_eval(gt_ner_span, pred_ner_span, text)
        case "Token Based Evaluation with Micro Avg":
            return get_token_micro_eval(gt_ner_span, pred_ner_span, text)
        case "Token Based Evaluation with Macro Avg":
            return get_token_macro_eval(gt_ner_span, pred_ner_span, text)
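All three helpers above consume NER spans in the dict shape produced by get_ner_spans_from_annotations (start, end, label, span_text). A hand-written illustration with hypothetical spans, values chosen only for the example:

from evaluation_metrics import get_span_eval, get_token_micro_eval, get_token_macro_eval

text = "The patient was diagnosed with bronchitis and was prescribed a mucolytic"
gt = [
    {"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"},
    {"start": 63, "end": 72, "label": "Drug", "span_text": "mucolytic"},
]
pred = [  # a prediction that only finds the Disease span
    {"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"},
]

print(get_span_eval(gt, pred, text))         # span-level f1 over entity types
print(get_token_micro_eval(gt, pred, text))  # token-level micro-averaged f1
print(get_token_macro_eval(gt, pred, text))  # token-level macro-averaged f1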
predefined_example.py ADDED
@@ -0,0 +1,58 @@
from dataclasses import dataclass

from span_dataclass_converters import get_ner_spans_from_annotations


@dataclass
class PredefinedExample:
    text: str
    gt_labels: dict
    # gt_spans: list
    # predictions: list

    @property
    def gt_spans(self):
        return sorted(
            get_ner_spans_from_annotations(self.gt_labels),
            key=lambda span: span["start"],
        )

    @property
    def predictions(self):
        return [self.gt_spans]


small_example = PredefinedExample(
    text="The patient was diagnosed with bronchitis and was prescribed a mucolytic",
    gt_labels={
        "Disease": [
            {"start": 31, "end": 41, "label": "bronchitis"},
        ],
        "Drug": [
            {"start": 63, "end": 72, "label": "mucolytic"},
        ],
    },
)

big_example = PredefinedExample(
    text=(
        "The patient was experiencing stomach pain and flu like symptoms for 3 days. "
        "Upon investigation, the chest xray revealed acute bronchitis disease. "
        "The patient was asked to take rest for a week and was prescribed a mucolytic along with paracetamol for body pains."
    ),
    gt_labels={
        "Disease": [
            {"start": 120, "end": 144, "label": "acute bronchitis disease"},
        ],
        "Drug": [
            {"start": 213, "end": 222, "label": "mucolytic"},
            {"start": 234, "end": 245, "label": "paracetamol"},
        ],
        "Symptoms": [
            {"start": 29, "end": 41, "label": "stomach pain"},
            {"start": 46, "end": 63, "label": "flu like symptoms"},
        ],
    },
)

EXAMPLES = [small_example, big_example]
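To make the computed properties concrete, this is the value small_example.gt_spans resolves to, reproduced by hand from the offsets above (shown only as an illustration):

from predefined_example import small_example

print(small_example.gt_spans)
# [{'start': 31, 'end': 41, 'label': 'Disease', 'span_text': 'bronchitis'},
#  {'start': 63, 'end': 72, 'label': 'Drug', 'span_text': 'mucolytic'}]
# span_text is copied from the annotation's "label" field; see span_dataclass_converters.py.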
requirements.txt ADDED
@@ -0,0 +1,2 @@
streamlit_annotation_tools
annotated_text
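Note that app.py and evaluation_metrics.py also import pandas, nervaluate, and scikit-learn (plus streamlit itself, which the Space SDK provides). If those packages are not already available in the Space image, the file would need additional entries along these lines (hypothetical additions, not part of this commit):

pandas
nervaluate
scikit-learn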
span_dataclass_converters.py ADDED
@@ -0,0 +1,30 @@
def get_ner_spans_from_annotations(annotated_labels):
    spans = []
    for entity_type, spans_list in annotated_labels.items():
        for spans_dict in spans_list:
            ner_span_dict = {
                **spans_dict,
                "label": entity_type,
                "span_text": spans_dict["label"],
            }
            spans.append(ner_span_dict)
    return spans


def get_highlight_spans_from_ner_spans(ner_spans, parent_text):
    if not ner_spans:
        return [parent_text]

    output_list = []
    prev_span_end = 0
    # output_list = [parent_text[ner_spans[0]["start"]]]
    for span in ner_spans:
        output_list.append(parent_text[prev_span_end : span["start"]])
        tup = (span["span_text"], span["label"])
        output_list.append(tup)
        prev_span_end = span["end"]

    if prev_span_end != len(parent_text):
        output_list.append(parent_text[prev_span_end:])

    return output_list
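A small illustration of the two converters chained together, using a hypothetical annotation dict in the same shape the labeler produces:

from span_dataclass_converters import (
    get_highlight_spans_from_ner_spans,
    get_ner_spans_from_annotations,
)

text = "The patient was diagnosed with bronchitis and was prescribed a mucolytic"
annotations = {"Disease": [{"start": 31, "end": 41, "label": "bronchitis"}]}

ner_spans = get_ner_spans_from_annotations(annotations)
# [{'start': 31, 'end': 41, 'label': 'Disease', 'span_text': 'bronchitis'}]

print(get_highlight_spans_from_ner_spans(ner_spans, text))
# ['The patient was diagnosed with ', ('bronchitis', 'Disease'),
#  ' and was prescribed a mucolytic']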
token_level_output.py ADDED
@@ -0,0 +1,77 @@
import re


class WhitespaceTokenSplitter:
    def __init__(self):
        self.whitespace_pattern = re.compile(r"\w+(?:[-_]\w+)*|\S")

    def __call__(self, text):
        for match in self.whitespace_pattern.finditer(text):
            yield match.group(), match.start(), match.end()


tokenizer = WhitespaceTokenSplitter()


def get_char_label_map(ner_spans: list):
    """return a dict with char indices (int) as keys and the label they belong to as values
    example -- {1: 'label1', 2: 'label1', 5: 'label2', 6: 'label2'}
    note: char indices that do not belong to any span are absent from the map
    """
    char_label_map = {}
    for span in ner_spans:
        char_label_map = {
            **char_label_map,
            **{
                char_index: span["label"]
                for char_index in range(span["start"], span["end"])
            },
        }
    return char_label_map


def get_tokens(text: str) -> list[str]:
    tokens_with_offsets = list(tokenizer(text))
    return [token for token, start, end in tokens_with_offsets]


def get_token_offsets(text: str) -> list[tuple[int, int]]:
    tokens_with_offsets = list(tokenizer(text))
    return [(start, end) for token, start, end in tokens_with_offsets]


def get_list_of_token_label_tuples(
    tokens: list[str],
    token_spans: list[tuple[int, int]],
    char_label_map: dict[int, str],
) -> list[tuple[str, str]]:
    """
    returns a list of tuples with the first element as the token and the second as its label
    example - [('a', 'O'), ('cat', 'ANIMAL'), ('sits', 'O')]
    note: the label of a token is the label covering the most characters in that token
    """
    token_labels = []
    for token, offsets in zip(tokens, token_spans):
        if offsets[0] == offsets[1]:
            token_labels.append((token, "O"))
            continue
        char_labels = [
            char_label_map.get(char_index, "O") for char_index in range(*offsets)
        ]
        token_label = max(set(char_labels), key=char_labels.count)
        token_labels.append((token, token_label))
    return token_labels


def get_token_outputs(ner_spans, parent_text):
    char_label_map = get_char_label_map(ner_spans)

    token_offsets = get_token_offsets(parent_text)
    tokens = get_tokens(parent_text)

    return get_list_of_token_label_tuples(tokens, token_offsets, char_label_map)


def get_token_output_labels(ner_spans, parent_text):
    token_output = get_token_outputs(ner_spans, parent_text)
    return [label for token, label in token_output]
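Finally, a short illustration of the token-level conversion the token-based metrics rely on, using the same hypothetical span as in the converter example above:

from token_level_output import get_token_outputs

text = "The patient was diagnosed with bronchitis and was prescribed a mucolytic"
spans = [{"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"}]

print(get_token_outputs(spans, text))
# [('The', 'O'), ('patient', 'O'), ('was', 'O'), ('diagnosed', 'O'), ('with', 'O'),
#  ('bronchitis', 'Disease'), ('and', 'O'), ('was', 'O'), ('prescribed', 'O'),
#  ('a', 'O'), ('mucolytic', 'O')]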