File size: 1,580 Bytes
cd80277
 
 
 
 
 
 
 
 
44921ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""
There are 4 data formats for spans
1. annotations - this is what we obtain from the text_annotator, the format can be seen in the predefined_examples, gt_labels
2. highlight_spans - this is the format used by the highlighter to return the highlighted html text. This is a list whose items are either plain strings or tuples ("string", "label", color)
3. ner_spans - this is the standard format used for representing ner_spans, it is a dict of {"start":int, "end":int, "label":str, "span_text":str}
4. Token level output - this is dealt with in the token_level_output file; it is either a list of tuples [(token, label)] or just a list of labels [label, label]
"""


def get_ner_spans_from_annotations(annotated_labels):
    """Flatten annotator output into a single list of ner_span dicts.

    Args:
        annotated_labels: Mapping of entity type -> list of annotation dicts.
            Each annotation dict must contain a "label" key; its value is
            reused as the span's text. Any other keys (presumably "start"
            and "end" - confirm against the annotator output) are copied
            through unchanged.

    Returns:
        A list with one dict per annotation, in mapping order, where
        "label" holds the entity type and "span_text" holds the value the
        annotation originally stored under "label".
    """
    return [
        # Later keys win, so "label" is overwritten with the entity type
        # while the original "label" value is preserved as "span_text".
        {**annotation, "label": label_name, "span_text": annotation["label"]}
        for label_name, annotations in annotated_labels.items()
        for annotation in annotations
    ]


def get_highlight_spans_from_ner_spans(ner_spans, parent_text):
    """Convert ner_spans into the highlighter's mixed string/tuple list.

    The result alternates between plain-text gaps (str) and highlighted
    entities as ``(span_text, label)`` tuples, ordered by position in
    ``parent_text``.

    Args:
        ner_spans: Iterable of dicts with "start", "end", "label" and
            "span_text" keys. May be empty or None. The spans do not need
            to be pre-sorted; they are processed in order of "start".
        parent_text: The full text that the span offsets index into.

    Returns:
        ``[parent_text]`` when there are no spans. Otherwise a list in which
        every span tuple is preceded by the text between the previous span
        and this one (an empty string when there is no gap), followed by
        any non-empty text remaining after the last span.
    """
    if not ner_spans:
        return [parent_text]

    # Spans built per entity type are grouped by label, not by position;
    # walking them out of order would yield empty gaps and a duplicated
    # tail. sorted() is stable and leaves the caller's list untouched.
    ordered_spans = sorted(ner_spans, key=lambda span: span["start"])

    output_list = []
    prev_span_end = 0
    for span in ordered_spans:
        # Slice is empty when the span starts at (or before) the cursor.
        output_list.append(parent_text[prev_span_end : span["start"]])
        output_list.append((span["span_text"], span["label"]))
        # Never move the cursor backwards, so a span nested inside an
        # earlier one cannot cause already-covered text to be re-emitted.
        prev_span_end = max(prev_span_end, span["end"])

    if prev_span_end != len(parent_text):
        output_list.append(parent_text[prev_span_end:])

    return output_list