File size: 1,340 Bytes
06924e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def join_words(tokens):
    """Join word-piece tokens into a single display string.

    A single space is inserted between two neighbours only when the incoming
    token is purely alphabetic and the text built so far ends in an
    alphabetic character. In every other case (punctuation, digits, empty
    tokens) the token is appended directly, with no separator.

    Args:
        tokens: Sequence of string tokens; may be empty.

    Returns:
        The joined string, or ``''`` when ``tokens`` is empty.
    """
    if not tokens:
        return ''
    completed_word = tokens[0]
    for token in tokens[1:]:
        # Slice rather than index: when the text so far is empty (e.g. the
        # first token is ''), [-1] would raise IndexError, whereas [-1:]
        # yields '' and ''.isalpha() is simply False.
        if token.isalpha() and completed_word[-1:].isalpha():
            completed_word += ' ' + token
        else:
            completed_word += token
    return completed_word


def ent_merge(data_frame):
    """Merge token-level entity tags into entity-level spans.

    Iterates ``data_frame['words']`` and ``data_frame['entities']`` in
    lockstep. A ``B-<label>`` tag opens a new span; following ``I-<label>``
    or ``L-<label>`` tags carrying the same label extend it. Any other tag
    closes the open span (if there is one) and emits the token on its own,
    labelled with the tag minus its first two characters (so a one-character
    tag such as ``'O'`` yields an empty label).

    Args:
        data_frame: Object supporting ``['words']`` and ``['entities']``
            lookups that return iterables of strings. Iteration stops at the
            shorter of the two.

    Returns:
        A list in token order. Merged spans are ``(text, label)`` tuples whose
        text is built by ``join_words``; stand-alone tokens are
        ``[token, label]`` lists.
    """
    ent_result = []
    current_entity_tokens = []
    current_entity = None
    for token, tag in zip(data_frame['words'], data_frame['entities']):
        if tag.startswith('B-'):
            # A new span begins; emit the one that was still open, if any.
            if current_entity is not None:
                ent_result.append(
                    (join_words(current_entity_tokens), current_entity))
            current_entity = tag[2:]
            current_entity_tokens = [token]
        elif current_entity is not None and tag in (
                'I-' + current_entity, 'L-' + current_entity):
            # Continuation of the open span. The guard is on the label, not
            # the token list: the list is never None, so testing it would let
            # a literal "I-None" tag match when no span is open.
            current_entity_tokens.append(token)
        else:
            # Only flush when a span is actually open; otherwise every token
            # outside an entity would add a spurious ('', None) entry.
            if current_entity is not None:
                ent_result.append(
                    (join_words(current_entity_tokens), current_entity))
            # Stand-alone tokens are kept as two-item lists (not tuples) to
            # preserve the shape existing callers receive.
            ent_result.append([token, tag[2:]])
            current_entity_tokens = []
            current_entity = None
    # Emit a span that runs to the end of the input.
    if current_entity is not None:
        ent_result.append((join_words(current_entity_tokens), current_entity))
    return ent_result