NER_Tagger / iblou_merger.py
kandysh
first upload
06924e0
raw
history blame
No virus
1.34 kB
def join_words(tokens):
    """Concatenate tokens into one string, spacing only between words.

    A single space is inserted before a token only when the token is purely
    alphabetic AND the text built so far ends with an alphabetic character.
    In every other case (punctuation, digits, mixed tokens) the token is
    appended directly, so ``['U', '.', 'S', '.']`` becomes ``'U.S.'``.

    Args:
        tokens: Sequence of string tokens, in order. May be empty.

    Returns:
        The joined string, or ``''`` when ``tokens`` is empty.
    """
    if not tokens:
        return ''

    pieces = [tokens[0]]
    # Slice instead of index so an empty first token yields '' rather than
    # raising IndexError; ''.isalpha() is False, so no space is inserted.
    last_char = tokens[0][-1:]
    for token in tokens[1:]:
        if token.isalpha() and last_char.isalpha():
            pieces.append(' ')
        pieces.append(token)
        # An empty token adds nothing, so the trailing character is unchanged.
        if token:
            last_char = token[-1]
    return ''.join(pieces)
def ent_merge(data_frame):
    """Merge per-token entity tags into whole-entity entries.

    Walks ``data_frame['words']`` and ``data_frame["entities"]`` in lockstep.
    A tag starting with ``"B-"`` opens a new entity; following tags equal to
    ``"I-<label>"`` or ``"L-<label>"`` for the open label extend it. Any other
    tag closes the open entity (if there is one) and the token is emitted on
    its own, labelled with the tag text after its first two characters.

    Args:
        data_frame: Object indexable by ``'words'`` and ``"entities"``, each
            yielding an iterable of strings of matching length (presumably a
            pandas DataFrame or a dict of lists; confirm against the caller).

    Returns:
        A list in token order. Merged entities are ``(text, label)`` tuples;
        standalone tokens are ``[token, label]`` lists (kept as lists so
        existing callers see the same element types as before).
    """
    ent_result = []
    current_entity_tokens = []
    current_entity = None

    for token, tag in zip(data_frame['words'], data_frame["entities"]):
        if tag.startswith("B-"):
            # A new entity begins: emit the one that was still open, if any.
            if current_entity is not None:
                ent_result.append(
                    (join_words(current_entity_tokens), current_entity))
            current_entity = tag[2:]
            current_entity_tokens = [token]
        elif current_entity is not None and tag in (
                "I-" + current_entity, "L-" + current_entity):
            # Continuation only counts while an entity with this label is open.
            current_entity_tokens.append(token)
        else:
            # Flush only when something is open; flushing unconditionally
            # would add an empty ('', None) entry for every outside token.
            if current_entity is not None:
                ent_result.append(
                    (join_words(current_entity_tokens), current_entity))
            ent_result.append([token, tag[2:]])
            current_entity_tokens = []
            current_entity = None

    # Emit an entity that runs to the end of the input.
    if current_entity is not None:
        ent_result.append((join_words(current_entity_tokens), current_entity))
    return ent_result