updated explanations
- app.py +19 -1
- constants.py +69 -0
app.py
CHANGED
@@ -12,6 +12,8 @@ from constants import (
     EVAL_FUNCTION_PROPERTIES,
     NER_TASK_EXPLAINER,
     PREDICTION_ADDITION_INSTRUCTION,
+    SPAN_BASED_METRICS_EXPLANATION,
+    TOKEN_BASED_METRICS_EXPLANATION,
 )
 from evaluation_metrics import EVALUATION_METRICS
 from predefined_example import EXAMPLES
@@ -35,7 +37,11 @@ def get_examples_attributes(selected_example):

 if __name__ == "__main__":
     st.set_page_config(layout="wide")
-    st.title(APP_TITLE)
+    # st.title(APP_TITLE)
+    st.markdown(
+        f"<h1 style='text-align: center; color: grey;'>{APP_TITLE}</h1>",
+        unsafe_allow_html=True,
+    )

     st.write(APP_INTRO)
     explanation_tab, comparision_tab = st.tabs(["📙 Explanation", "⚖️ Comparision"])
@@ -57,7 +63,19 @@ if __name__ == "__main__":
             "\n"
             f"{metric_names}"
         )
+        st.markdown(
+            "These metrics can be broadly classified as 'Span Based' and 'Token Based' metrics."
+        )
+        st.markdown("### Span Based Metrics")
+        st.markdown(SPAN_BASED_METRICS_EXPLANATION)
+
+        st.markdown("### Token Based Metrics")
+        st.markdown(TOKEN_BASED_METRICS_EXPLANATION)

+        st.divider()
+        st.markdown(
+            "Now that you have read the basics of the metrics calculation, head to the comparision section to try out some examples!"
+        )
     with comparision_tab:
         # with st.container():
         st.subheader("Ground Truth & Predictions")  # , divider='rainbow')

constants.py
CHANGED
@@ -15,8 +15,77 @@ Some basic properties of an evaluation function are -
 NER_TASK_EXPLAINER = """
 The output of the NER task can be represented in either token format or span format.
 """
+
+SPAN_BASED_METRICS_EXPLANATION = """
+Span based metrics use the offsets & labels of the NER spans to compare the ground truths and predictions. These are present in the NER span representation object, which looks like this:
+
+```
+span_ner_object = {"start_offset": 3, "end_offset": 5, "label": "label_name"}
+```
+
+Comparing the ground truth and predicted span objects, we get the following broad categories of cases (a detailed explanation can be found [here](https://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/)).
+
+##### Comparison Categories
+
+| Category        | Explanation                                                              |
+| --------------- | ------------------------------------------------------------------------ |
+| Correct (COR)   | both are the same                                                        |
+| Incorrect (INC) | the output of a system and the golden annotation don't match             |
+| Partial (PAR)   | system and the golden annotation are somewhat "similar" but not the same |
+| Missing (MIS)   | a golden annotation is not captured by a system                          |
+| Spurious (SPU)  | system produces a response which doesn't exist in the golden annotation  |
+
+The specifics of this categorization are defined by the metric of choice. For example, in some cases we might want to consider a partial overlap of offsets correct, and in other cases incorrect.
+Based on this we have the Partial & Exact span based criteria. The categorization under each schema is shown below.
+
+| Ground Truth Entity | Ground Truth String | Pred Entity | Pred String         | Partial | Exact |
+| ------------------- | ------------------- | ----------- | ------------------- | ------- | ----- |
+| BRAND               | tikosyn             | -           | -                   | MIS     | MIS   |
+| -                   | -                   | BRAND       | healthy             | SPU     | SPU   |
+| DRUG                | warfarin            | DRUG        | of warfarin         | COR     | INC   |
+| DRUG                | propranolol         | BRAND       | propranolol         | INC     | INC   |
+| DRUG                | phenytoin           | DRUG        | phenytoin           | COR     | COR   |
+| GROUP               | contraceptives      | DRUG        | oral contraceptives | INC     | INC   |
+
+To compute precision, recall and f1-score from these cases,
+
+$$ Precision = TP / (TP + FP) = COR / (COR + INC + PAR + SPU) $$
+
+$$ Recall = TP / (TP + FN) = COR / (COR + INC + PAR + MIS) $$
+
+The f1-score is then computed as the harmonic mean of precision and recall.
+"""
+
+TOKEN_BASED_METRICS_EXPLANATION = """
+Token based metrics use the NER token based representation object, which tokenizes the input text and assigns a label to each token. This essentially transforms the evaluation/modelling task into a classification task.
+The token based representation object is shown below.
+
+```
+# Here, O represents the null label
+token_ner_object = [('My', O), ('name', O), ('is', O), ('John', NAME), ('.', O)]
+```
+Once we have the token objects for the ground truth and predictions, we compute a classification report comparing the labels.
+The final evaluation score is then calculated using the token metric of choice:
+
+###### Macro Average
+Calculates the metrics for each label, and finds their unweighted mean. This does not take label imbalance into account.
+
+###### Micro Average
+Calculates the metrics globally by counting the total true positives, false negatives and false positives.
+
+###### Weighted Average
+Calculates the metrics for each label, and finds their average weighted by support (the number of true instances for each label).
+This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall.
+"""
+
+
 ### COMPARISION TAB ###

+
 PREDICTION_ADDITION_INSTRUCTION = """
 Add predictions to the list of predictions on which the evaluation metric will be calculated.
 - Select the entity type/label name and then highlight the span in the text below.
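
To make the added SPAN_BASED_METRICS_EXPLANATION concrete, here is a minimal runnable sketch of the Partial and Exact schemas, assuming the `span_ner_object` format above. The helper names (`overlaps`, `categorize_spans`, `span_prf`) are hypothetical and not part of this app's code, and the matcher is deliberately simplified: greedy first-overlap matching, with PAR folded into COR/INC as in the table.

```
# Minimal sketch, not the app's implementation: categorize ground truth and
# predicted spans under the Partial/Exact schemas, then compute P/R/F1.

def overlaps(a, b):
    # Two spans overlap if they share at least one character offset.
    return a["start_offset"] < b["end_offset"] and b["start_offset"] < a["end_offset"]

def categorize_spans(truths, preds, schema="exact"):
    # Count COR/INC/PAR/MIS/SPU; PAR is kept for generality but this
    # simplified matcher folds partial overlaps into COR or INC.
    counts = {"COR": 0, "INC": 0, "PAR": 0, "MIS": 0, "SPU": 0}
    used = set()  # indices of predictions already matched to a gold span
    for t in truths:
        match = next((i for i, p in enumerate(preds)
                      if i not in used and overlaps(t, p)), None)
        if match is None:
            counts["MIS"] += 1  # gold span not captured by the system
            continue
        used.add(match)
        p = preds[match]
        same_offsets = (t["start_offset"], t["end_offset"]) == (p["start_offset"], p["end_offset"])
        if t["label"] != p["label"]:
            counts["INC"] += 1  # wrong label is incorrect under both schemas
        elif same_offsets:
            counts["COR"] += 1  # exact offsets and label: correct everywhere
        else:
            # Same label, overlapping but unequal offsets: COR under the
            # Partial schema ("of warfarin" vs "warfarin"), INC under Exact.
            counts["COR" if schema == "partial" else "INC"] += 1
    counts["SPU"] += len(preds) - len(used)  # predictions with no gold match
    return counts

def span_prf(c):
    # Precision = COR / (COR + INC + PAR + SPU), Recall = COR / (COR + INC + PAR + MIS)
    denom_p = c["COR"] + c["INC"] + c["PAR"] + c["SPU"]
    denom_r = c["COR"] + c["INC"] + c["PAR"] + c["MIS"]
    precision = c["COR"] / denom_p if denom_p else 0.0
    recall = c["COR"] / denom_r if denom_r else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

truths = [{"start_offset": 3, "end_offset": 11, "label": "DRUG"}]   # "warfarin"
preds = [{"start_offset": 0, "end_offset": 11, "label": "DRUG"}]    # "of warfarin"
print(span_prf(categorize_spans(truths, preds, schema="partial")))  # (1.0, 1.0, 1.0)
print(span_prf(categorize_spans(truths, preds, schema="exact")))    # (0.0, 0.0, 0.0)
```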
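Similarly, the token-based report described in TOKEN_BASED_METRICS_EXPLANATION can be sketched with scikit-learn standing in for whatever the app actually uses to build its classification report (an assumption; the toy tokens are invented).

```
# Sketch only: score token_ner_object-style label sequences with scikit-learn.
# The tokens below are made up; "O" is the null label.
from sklearn.metrics import classification_report, precision_recall_fscore_support

truth_tokens = [("My", "O"), ("name", "O"), ("is", "O"), ("John", "NAME"), (".", "O")]
pred_tokens = [("My", "O"), ("name", "O"), ("is", "NAME"), ("John", "NAME"), (".", "O")]

# The classification task compares the label assigned to each token.
y_true = [label for _, label in truth_tokens]
y_pred = [label for _, label in pred_tokens]

# Per-label precision/recall/f1 plus the averaged rows.
print(classification_report(y_true, y_pred, zero_division=0))

# Macro, micro and weighted averages, matching the definitions above.
for avg in ("macro", "micro", "weighted"):
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=avg, zero_division=0)
    print(f"{avg:>8}: precision={p:.2f} recall={r:.2f} f1={f1:.2f}")
```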
|