wadood committed on
Commit
44921ac
1 Parent(s): 4bdf1ab

init working commit

README.md CHANGED
@@ -1,12 +0,0 @@
- ---
- title: Ner Evaluation Metrics
- emoji: 👁
- colorFrom: purple
- colorTo: green
- sdk: streamlit
- sdk_version: 1.36.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,132 @@
import pandas as pd
import streamlit as st
from annotated_text import annotated_text
from annotated_text.util import get_annotated_html
from streamlit_annotation_tools import text_labeler

from evaluation_metrics import EVALUATION_METRICS, get_evaluation_metric
from predefined_example import EXAMPLES
from span_dataclass_converters import (
    get_highlight_spans_from_ner_spans,
    get_ner_spans_from_annotations,
)


@st.cache_resource
def get_examples_attributes(selected_example):
    "Return example attributes so that they are not refreshed on every interaction"
    return (
        selected_example.text,
        selected_example.gt_labels,
        selected_example.gt_spans,
        selected_example.predictions,
    )


if __name__ == "__main__":
    st.set_page_config(layout="wide")
    st.title("NER Evaluation Metrics Comparison")

    st.write(
        "Evaluation for the NER task requires a ground truth and a prediction to evaluate against it. The ground truth is shown below; add predictions in the next section to compare the evaluation metrics."
    )

    # with st.container():
    st.subheader("Ground Truth")  # , divider='rainbow')

    selected_example = st.selectbox(
        "Select an example text from the drop down below",
        [example for example in EXAMPLES],
        format_func=lambda ex: ex.text,
    )

    text, gt_labels, gt_spans, predictions = get_examples_attributes(selected_example)

    annotated_text(
        get_highlight_spans_from_ner_spans(
            get_ner_spans_from_annotations(gt_labels), text
        )
    )

    annotated_predictions = [
        get_annotated_html(get_highlight_spans_from_ner_spans(ner_span, text))
        for ner_span in predictions
    ]
    predictions_df = pd.DataFrame(
        {
            # "ID": [f"Prediction_{index}" for index in range(len(predictions))],
            "Prediction": annotated_predictions,
            "ner_spans": predictions,
        },
        index=[f"Prediction_{index}" for index in range(len(predictions))],
    )

    st.subheader("Predictions")  # , divider='rainbow')

    with st.expander("Click to Add Predictions"):
        st.subheader("Adding predictions")
        st.markdown(
            """
Add predictions to the list of predictions on which the evaluation metric will be calculated.
- Select the entity type/label name and then highlight the span in the text below.
- To remove a span, double click on the highlighted text.
- Once you have your desired prediction, click on the 'Add' button. (The prediction created is shown in a json below.)
"""
        )
        st.write(
            "Note: Only the spans of the selected label name are shown at a given instance.",
        )
        labels = text_labeler(text, gt_labels)
        st.json(labels, expanded=False)

        # if st.button("Add Prediction"):
        #     labels = text_labeler(text)
        if st.button("Add!"):
            spans = get_ner_spans_from_annotations(labels)
            spans = sorted(spans, key=lambda span: span["start"])
            predictions.append(spans)
            annotated_predictions.append(
                get_annotated_html(get_highlight_spans_from_ner_spans(spans, text))
            )
            predictions_df = pd.DataFrame(
                {
                    # "ID": [f"Prediction_{index}" for index in range(len(predictions))],
                    "Prediction": annotated_predictions,
                    "ner_spans": predictions,
                },
                index=[f"Prediction_{index}" for index in range(len(predictions))],
            )
            print("added")

    highlighted_predictions_df = predictions_df[["Prediction"]]
    st.write(highlighted_predictions_df.to_html(escape=False), unsafe_allow_html=True)
    st.divider()

    ### EVALUATION METRICS COMPARISON ###

    st.subheader("Evaluation Metrics Comparison")  # , divider='rainbow')
    st.markdown("""
The different evaluation metrics we have for the NER task are
- Span Based Evaluation with Partial Overlap
- Token Based Evaluation with Micro Avg
- Token Based Evaluation with Macro Avg
""")

    with st.expander("View Predictions Details"):
        st.write(predictions_df.to_html(escape=False), unsafe_allow_html=True)

    if st.button("Get Metrics!"):
        for evaluation_metric_type in EVALUATION_METRICS:
            predictions_df[evaluation_metric_type] = predictions_df.ner_spans.apply(
                lambda ner_spans: get_evaluation_metric(
                    metric_type=evaluation_metric_type,
                    gt_ner_span=gt_spans,
                    pred_ner_span=ner_spans,
                    text=text,
                )
            )

        metrics_df = predictions_df.drop(["ner_spans"], axis=1)

        st.write(metrics_df.to_html(escape=False), unsafe_allow_html=True)
        print("compared")
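Not part of the commit, but as a quick orientation to the modules below: the metric loop in app.py can also be exercised headlessly against the bundled examples. A minimal sketch, assuming the files from this commit are importable from the working directory:

# Sketch: score one prediction against the ground truth of the first example.
from evaluation_metrics import EVALUATION_METRICS, get_evaluation_metric
from predefined_example import EXAMPLES

example = EXAMPLES[0]
prediction = example.predictions[0]  # the ground truth replayed as a prediction

for metric_type in EVALUATION_METRICS:
    score = get_evaluation_metric(
        metric_type=metric_type,
        gt_ner_span=example.gt_spans,
        pred_ner_span=prediction,
        text=example.text,
    )
    print(metric_type, score)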
evaluation_metrics.py ADDED
@@ -0,0 +1,49 @@
from nervaluate import Evaluator
from sklearn.metrics import classification_report

from token_level_output import get_token_output_labels

EVALUATION_METRICS = [
    "Span Based Evaluation with Partial Overlap",
    "Token Based Evaluation with Micro Avg",
    "Token Based Evaluation with Macro Avg",
]


def get_span_eval(gt_ner_span, pred_ner_span, text):
    evaluator = Evaluator([gt_ner_span], [pred_ner_span], tags=["Disease", "Drug"])
    return round(evaluator.evaluate()[0]["ent_type"]["f1"], 2)


def get_token_micro_eval(gt_ner_span, pred_ner_span, text):
    return round(
        classification_report(
            get_token_output_labels(gt_ner_span, text),
            get_token_output_labels(pred_ner_span, text),
            labels=["Disease", "Drug"],
            output_dict=True,
        )["micro avg"]["f1-score"],
        2,
    )


def get_token_macro_eval(gt_ner_span, pred_ner_span, text):
    return round(
        classification_report(
            get_token_output_labels(gt_ner_span, text),
            get_token_output_labels(pred_ner_span, text),
            labels=["Disease", "Drug"],
            output_dict=True,
        )["macro avg"]["f1-score"],
        2,
    )


def get_evaluation_metric(metric_type, gt_ner_span, pred_ner_span, text):
    match metric_type:
        case "Span Based Evaluation with Partial Overlap":
            return get_span_eval(gt_ner_span, pred_ner_span, text)
        case "Token Based Evaluation with Micro Avg":
            return get_token_micro_eval(gt_ner_span, pred_ner_span, text)
        case "Token Based Evaluation with Macro Avg":
            return get_token_macro_eval(gt_ner_span, pred_ner_span, text)
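All three helpers above consume NER spans in the dict shape produced by get_ner_spans_from_annotations (start, end, label, span_text). A hand-written illustration with hypothetical spans, values chosen only for the example:

from evaluation_metrics import get_span_eval, get_token_micro_eval, get_token_macro_eval

text = "The patient was diagnosed with bronchitis and was prescribed a mucolytic"
gt = [
    {"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"},
    {"start": 63, "end": 72, "label": "Drug", "span_text": "mucolytic"},
]
pred = [  # a prediction that only finds the Disease span
    {"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"},
]

print(get_span_eval(gt, pred, text))         # span-level f1 over entity types
print(get_token_micro_eval(gt, pred, text))  # token-level micro-averaged f1
print(get_token_macro_eval(gt, pred, text))  # token-level macro-averaged f1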
predefined_example.py ADDED
@@ -0,0 +1,58 @@
from dataclasses import dataclass

from span_dataclass_converters import get_ner_spans_from_annotations


@dataclass
class PredefinedExample:
    text: str
    gt_labels: dict
    # gt_spans: list
    # predictions: list

    @property
    def gt_spans(self):
        return sorted(
            get_ner_spans_from_annotations(self.gt_labels),
            key=lambda span: span["start"],
        )

    @property
    def predictions(self):
        return [self.gt_spans]


small_example = PredefinedExample(
    text="The patient was diagnosed with bronchitis and was prescribed a mucolytic",
    gt_labels={
        "Disease": [
            {"start": 31, "end": 41, "label": "bronchitis"},
        ],
        "Drug": [
            {"start": 63, "end": 72, "label": "mucolytic"},
        ],
    },
)

big_example = PredefinedExample(
    text=(
        "The patient was experiencing stomach pain and flu like symptoms for 3 days. "
        "Upon investigation, the chest xray revealed acute bronchitis disease. "
        "The patient was asked to take rest for a week and was prescribed a mucolytic along with paracetamol for body pains."
    ),
    gt_labels={
        "Disease": [
            {"start": 120, "end": 144, "label": "acute bronchitis disease"},
        ],
        "Drug": [
            {"start": 213, "end": 222, "label": "mucolytic"},
            {"start": 234, "end": 245, "label": "paracetamol"},
        ],
        "Symptoms": [
            {"start": 29, "end": 41, "label": "stomach pain"},
            {"start": 46, "end": 63, "label": "flu like symptoms"},
        ],
    },
)

EXAMPLES = [small_example, big_example]
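To make the computed properties concrete, this is the value small_example.gt_spans resolves to, reproduced by hand from the offsets above (shown only as an illustration):

from predefined_example import small_example

print(small_example.gt_spans)
# [{'start': 31, 'end': 41, 'label': 'Disease', 'span_text': 'bronchitis'},
#  {'start': 63, 'end': 72, 'label': 'Drug', 'span_text': 'mucolytic'}]
# span_text is copied from the annotation's "label" field; see span_dataclass_converters.py.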
requirements.txt ADDED
@@ -0,0 +1,2 @@
streamlit_annotation_tools
annotated_text
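Note that app.py and evaluation_metrics.py also import pandas, nervaluate, and scikit-learn (plus streamlit itself, which the Space SDK provides). If those packages are not already available in the Space image, the file would need additional entries along these lines (hypothetical additions, not part of this commit):

pandas
nervaluate
scikit-learn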
span_dataclass_converters.py ADDED
@@ -0,0 +1,30 @@
def get_ner_spans_from_annotations(annotated_labels):
    spans = []
    for entity_type, spans_list in annotated_labels.items():
        for spans_dict in spans_list:
            ner_span_dict = {
                **spans_dict,
                "label": entity_type,
                "span_text": spans_dict["label"],
            }
            spans.append(ner_span_dict)
    return spans


def get_highlight_spans_from_ner_spans(ner_spans, parent_text):
    if not ner_spans:
        return [parent_text]

    output_list = []
    prev_span_end = 0
    # output_list = [parent_text[ner_spans[0]["start"]]]
    for span in ner_spans:
        output_list.append(parent_text[prev_span_end : span["start"]])
        tup = (span["span_text"], span["label"])
        output_list.append(tup)
        prev_span_end = span["end"]

    if prev_span_end != len(parent_text):
        output_list.append(parent_text[prev_span_end:])

    return output_list
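A small illustration of the two converters chained together, using a hypothetical annotation dict in the same shape the labeler produces:

from span_dataclass_converters import (
    get_highlight_spans_from_ner_spans,
    get_ner_spans_from_annotations,
)

text = "The patient was diagnosed with bronchitis and was prescribed a mucolytic"
annotations = {"Disease": [{"start": 31, "end": 41, "label": "bronchitis"}]}

ner_spans = get_ner_spans_from_annotations(annotations)
# [{'start': 31, 'end': 41, 'label': 'Disease', 'span_text': 'bronchitis'}]

print(get_highlight_spans_from_ner_spans(ner_spans, text))
# ['The patient was diagnosed with ', ('bronchitis', 'Disease'),
#  ' and was prescribed a mucolytic']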
token_level_output.py ADDED
@@ -0,0 +1,77 @@
import re


class WhitespaceTokenSplitter:
    def __init__(self):
        self.whitespace_pattern = re.compile(r"\w+(?:[-_]\w+)*|\S")

    def __call__(self, text):
        for match in self.whitespace_pattern.finditer(text):
            yield match.group(), match.start(), match.end()


tokenizer = WhitespaceTokenSplitter()


def get_char_label_map(ner_spans: list):
    """return a dict with char indices (int) as keys and the label they belong to as values
    example -- {1: 'label1', 2: 'label1', 5: 'label2', 6: 'label2'}
    note: char indices that do not belong to any span are absent from the map
    """
    char_label_map = {}
    for span in ner_spans:
        char_label_map = {
            **char_label_map,
            **{
                char_index: span["label"]
                for char_index in range(span["start"], span["end"])
            },
        }
    return char_label_map


def get_tokens(text: str) -> list[str]:
    tokens_with_offsets = list(tokenizer(text))
    return [token for token, start, end in tokens_with_offsets]


def get_token_offsets(text: str) -> list[tuple[int, int]]:
    tokens_with_offsets = list(tokenizer(text))
    return [(start, end) for token, start, end in tokens_with_offsets]


def get_list_of_token_label_tuples(
    tokens: list[str],
    token_spans: list[tuple[int, int]],
    char_label_map: dict[int, str],
) -> list[tuple[str, str]]:
    """
    returns a list of tuples with the first element as the token and the second as its label
    example - [('a', 'O'), ('cat', 'ANIMAL'), ('sits', 'O')]
    note: the label of a token is the label covering the most characters in that token
    """
    token_labels = []
    for token, offsets in zip(tokens, token_spans):
        if offsets[0] == offsets[1]:
            token_labels.append((token, "O"))
            continue
        char_labels = [
            char_label_map.get(char_index, "O") for char_index in range(*offsets)
        ]
        token_label = max(set(char_labels), key=char_labels.count)
        token_labels.append((token, token_label))
    return token_labels


def get_token_outputs(ner_spans, parent_text):
    char_label_map = get_char_label_map(ner_spans)

    token_offsets = get_token_offsets(parent_text)
    tokens = get_tokens(parent_text)

    return get_list_of_token_label_tuples(tokens, token_offsets, char_label_map)


def get_token_output_labels(ner_spans, parent_text):
    token_output = get_token_outputs(ner_spans, parent_text)
    return [label for token, label in token_output]
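Finally, a short illustration of the token-level conversion the token-based metrics rely on, using the same hypothetical span as in the converter example above:

from token_level_output import get_token_outputs

text = "The patient was diagnosed with bronchitis and was prescribed a mucolytic"
spans = [{"start": 31, "end": 41, "label": "Disease", "span_text": "bronchitis"}]

print(get_token_outputs(spans, text))
# [('The', 'O'), ('patient', 'O'), ('was', 'O'), ('diagnosed', 'O'), ('with', 'O'),
#  ('bronchitis', 'Disease'), ('and', 'O'), ('was', 'O'), ('prescribed', 'O'),
#  ('a', 'O'), ('mucolytic', 'O')]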