|
def format_predictions(words, predictions):
    '''
    Pair each word with its predicted label.

    Args:
        words: Iterable of tokens.
        predictions: Iterable of labels, aligned position-by-position
            with ``words``.

    Returns:
        A list of ``(word, label)`` tuples. Pairing stops at the end of
        the shorter input, so surplus items in the longer one are dropped.
    '''
    # zip already yields the (word, label) tuples; no manual loop needed.
    return list(zip(words, predictions))
|
|
|
def process_predictions(predictions):
    '''
    Replace underscores in each word with spaces, keeping its label.

    A word such as ``'a_b'`` becomes ``'a b'`` and stays in a single
    ``(word, label)`` tuple; it is not split into several tuples.

    Args:
        predictions: Iterable of ``(word, label)`` pairs.

    Returns:
        A new list of ``(word, label)`` tuples in the same order, with
        every ``'_'`` in ``word`` replaced by ``' '``.
    '''
    # str.replace leaves words without an underscore unchanged, so no
    # separate membership check is required.
    return [(word.replace('_', ' '), label) for word, label in predictions]
|
|
|
|
|
def combine_entities(predictions):
    '''
    Merge each ``B-`` token with the matching ``I-`` tokens that follow it.

    A label starting with ``'B-'`` opens a new span. A following label
    starting with ``'I-'`` whose suffix (text after the two-character
    prefix) equals the open span's suffix extends that span. Any other
    label closes the open span and is emitted unchanged as its own
    ``(word, label)`` item.

    Args:
        predictions: Iterable of ``(word, label)`` pairs.

    Returns:
        A list of ``(text, label)`` tuples. Merged spans join their words
        with single spaces and carry the opening ``B-`` label, prefix
        included.
    '''
    merged = []
    span_words = []
    span_label = None

    for token, tag in predictions:
        # Continuation of the currently open span: same entity suffix.
        if tag.startswith('I-') and span_label and tag[2:] == span_label[2:]:
            span_words.append(token)
            continue

        # Every other tag ends whatever span is currently open.
        if span_words:
            merged.append((' '.join(span_words), span_label))

        if tag.startswith('B-'):
            # Start a fresh span with this token.
            span_words = [token]
            span_label = tag
        else:
            # Not part of a span (or an I- tag with no matching open span):
            # pass it through untouched.
            span_words = []
            span_label = None
            merged.append((token, tag))

    # Emit a span still open at the end of the input.
    if span_words:
        merged.append((' '.join(span_words), span_label))

    return merged
|
|
|
|
|
|
|
|
|
def remove_B_prefix(entities):
    '''
    Drop the leading ``'B-'`` from every label that has one.

    Args:
        entities: Iterable of ``(word, label)`` pairs.

    Returns:
        A new list of ``(word, label)`` tuples in the same order. Labels
        that do not start with ``'B-'`` are returned unchanged.
    '''
    return [
        (text, tag[2:] if tag.startswith('B-') else tag)
        for text, tag in entities
    ]
|
|
|
|
|
def combine_i_tags(tokens_labels):
    '''
    Merge consecutive ``I-`` tokens that share the same label suffix.

    Adjacent items whose labels start with ``'I-'`` and have the same
    text after that prefix are joined into one item. Items with any other
    label are emitted unchanged and break the current run.

    Args:
        tokens_labels: Iterable of ``(token, label)`` pairs.

    Returns:
        A list of ``(text, label)`` tuples. Merged runs join their tokens
        with single spaces and carry the label with the ``'I-'`` prefix
        removed.
    '''
    result = []
    run_tokens = []
    run_type = None

    for tok, tag in tokens_labels:
        if not tag.startswith('I-'):
            # A non-I- item closes any open run and passes through as-is.
            if run_tokens:
                result.append((' '.join(run_tokens), run_type))
                run_tokens = []
                run_type = None
            result.append((tok, tag))
            continue

        entity_type = tag[2:]
        # A different entity type closes the open run before a new one starts.
        if run_type is not None and run_type != entity_type:
            result.append((' '.join(run_tokens), run_type))
            run_tokens = []
        run_type = entity_type
        run_tokens.append(tok)

    # Emit a run still open at the end of the input.
    if run_tokens:
        result.append((' '.join(run_tokens), run_type))

    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|