def join_words(tokens):
    """Concatenate tokens into one string, spacing only adjacent words.

    A single space is inserted before a token only when that token is
    entirely alphabetic (``str.isalpha``) and the text accumulated so far
    ends in an alphabetic character. In every other case the token is
    appended directly, so punctuation and digits stay attached.

    Args:
        tokens: Sequence of string tokens.

    Returns:
        The joined text, or ``''`` when ``tokens`` is empty.
    """
    if not tokens:
        return ''

    completed_word = tokens[0]
    for token in tokens[1:]:
        # Slice with [-1:] rather than index with [-1]: when the accumulated
        # text is still empty this yields '' (not alphabetic) instead of
        # raising IndexError.
        if token.isalpha() and completed_word[-1:].isalpha():
            completed_word += ' ' + token
        else:
            completed_word += token
    return completed_word


def ent_merge(data_frame):
    """Merge per-token entity tags into entity-level entries.

    Iterates ``data_frame['words']`` and ``data_frame['entities']`` in
    lockstep and applies these rules to each ``(token, tag)`` pair:

    * A tag starting with ``"B-"`` closes any open entity and opens a new
      one whose label is the text after the prefix.
    * ``"I-<label>"`` or ``"L-<label>"`` matching the label of the open
      entity appends the token to that entity.
    * Any other tag closes the open entity (if there is one) and emits the
      token on its own, labelled with ``tag[2:]`` (an empty string for a
      tag shorter than three characters, such as ``"O"``).

    Args:
        data_frame: Object indexable by ``'words'`` and ``'entities'``,
            each yielding an iterable; the two are paired with ``zip``.

    Returns:
        A list in input order. Merged entities are ``(text, label)`` tuples
        whose text is built by ``join_words``; tokens emitted on their own
        are ``[token, label]`` lists (kept as lists so existing callers see
        the same shapes as before).
    """
    ent_result = []
    current_entity_tokens = []
    current_entity = None

    for token, tag in zip(data_frame['words'], data_frame['entities']):
        if tag.startswith('B-'):
            if current_entity is not None:
                ent_result.append(
                    (join_words(current_entity_tokens), current_entity))
            current_entity = tag[2:]
            current_entity_tokens = [token]
        elif current_entity is not None and tag in (
                'I-' + current_entity, 'L-' + current_entity):
            # Only continue an entity that is actually open; without this
            # guard a tag would be compared against "I-None" / "L-None".
            current_entity_tokens.append(token)
        else:
            # Flush only when an entity is open, so that out-of-entity
            # tokens do not produce empty ('', None) entries.
            if current_entity is not None:
                ent_result.append(
                    (join_words(current_entity_tokens), current_entity))
            ent_result.append([token, tag[2:]])
            current_entity_tokens = []
            current_entity = None

    # Emit an entity that is still open when the input ends.
    if current_entity is not None:
        ent_result.append((join_words(current_entity_tokens), current_entity))
    return ent_result