SanDP / utils /tasks /seqeval.py
shivrajanand's picture
Upload folder using huggingface_hub
a7b3936 verified
# From https://github.com/chakki-works/seqeval/blob/master/seqeval/metrics/sequence_labeling.py
"""Metrics to assess performance on sequence labeling task given prediction
Functions named as ``*_score`` return a scalar value to maximize: the higher
the better
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import defaultdict
import numpy as np
def get_entities(seq, suffix=False):
"""Gets entities from sequence.
Args:
seq (list): sequence of labels.
Returns:
list: list of (chunk_type, chunk_start, chunk_end).
Example:
>>> from seqeval.metrics.sequence_labeling import get_entities
>>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
>>> get_entities(seq)
[('PER', 0, 1), ('LOC', 3, 3)]
"""
# for nested list
if any(isinstance(s, list) for s in seq):
seq = [item for sublist in seq for item in sublist + ['O']]
prev_tag = 'O'
prev_type = ''
begin_offset = 0
chunks = []
for i, chunk in enumerate(seq + ['O']):
if suffix:
tag = chunk[-1]
type_ = chunk.split('-')[0]
else:
tag = chunk[0]
type_ = chunk.split('-')[-1]
if end_of_chunk(prev_tag, tag, prev_type, type_):
chunks.append((prev_type, begin_offset, i-1))
if start_of_chunk(prev_tag, tag, prev_type, type_):
begin_offset = i
prev_tag = tag
prev_type = type_
return chunks
def end_of_chunk(prev_tag, tag, prev_type, type_):
"""Checks if a chunk ended between the previous and current word.
Args:
prev_tag: previous chunk tag.
tag: current chunk tag.
prev_type: previous type.
type_: current type.
Returns:
chunk_end: boolean.
"""
chunk_end = False
if prev_tag == 'E': chunk_end = True
if prev_tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'B': chunk_end = True
if prev_tag == 'B' and tag == 'S': chunk_end = True
if prev_tag == 'B' and tag == 'O': chunk_end = True
if prev_tag == 'I' and tag == 'B': chunk_end = True
if prev_tag == 'I' and tag == 'S': chunk_end = True
if prev_tag == 'I' and tag == 'O': chunk_end = True
if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
chunk_end = True
return chunk_end
def start_of_chunk(prev_tag, tag, prev_type, type_):
"""Checks if a chunk started between the previous and current word.
Args:
prev_tag: previous chunk tag.
tag: current chunk tag.
prev_type: previous type.
type_: current type.
Returns:
chunk_start: boolean.
"""
chunk_start = False
if tag == 'B': chunk_start = True
if tag == 'S': chunk_start = True
if prev_tag == 'E' and tag == 'E': chunk_start = True
if prev_tag == 'E' and tag == 'I': chunk_start = True
if prev_tag == 'S' and tag == 'E': chunk_start = True
if prev_tag == 'S' and tag == 'I': chunk_start = True
if prev_tag == 'O' and tag == 'E': chunk_start = True
if prev_tag == 'O' and tag == 'I': chunk_start = True
if tag != 'O' and tag != '.' and prev_type != type_:
chunk_start = True
return chunk_start
def f1_score(y_true, y_pred, average='micro', suffix=False):
"""Compute the F1 score.
The F1 score can be interpreted as a weighted average of the precision and
recall, where an F1 score reaches its best value at 1 and worst score at 0.
The relative contribution of precision and recall to the F1 score are
equal. The formula for the F1 score is::
F1 = 2 * (precision * recall) / (precision + recall)
Args:
y_true : 2d array. Ground truth (correct) target values.
y_pred : 2d array. Estimated targets as returned by a tagger.
Returns:
score : float.
Example:
>>> from seqeval.metrics import f1_score
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> f1_score(y_true, y_pred)
0.50
# """
# from sklearn.metrics import f1_score
# score = f1_score(y_true, y_pred, average='macro')
true_entities = set(get_entities(y_true, suffix))
pred_entities = set(get_entities(y_pred, suffix))
nb_correct = len(true_entities & pred_entities)
nb_pred = len(pred_entities)
nb_true = len(true_entities)
p = nb_correct / nb_pred if nb_pred > 0 else 0
r = nb_correct / nb_true if nb_true > 0 else 0
score = 2 * p * r / (p + r) if p + r > 0 else 0
return score
def accuracy_score(y_true, y_pred):
"""Accuracy classification score.
In multilabel classification, this function computes subset accuracy:
the set of labels predicted for a sample must *exactly* match the
corresponding set of labels in y_true.
Args:
y_true : 2d array. Ground truth (correct) target values.
y_pred : 2d array. Estimated targets as returned by a tagger.
Returns:
score : float.
Example:
>>> from seqeval.metrics import accuracy_score
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> accuracy_score(y_true, y_pred)
0.80
"""
if any(isinstance(s, list) for s in y_true):
y_true = [item for sublist in y_true for item in sublist]
y_pred = [item for sublist in y_pred for item in sublist]
nb_correct = sum(y_t==y_p for y_t, y_p in zip(y_true, y_pred))
nb_true = len(y_true)
score = nb_correct / nb_true
return score
def precision_score(y_true, y_pred, average='micro', suffix=False):
"""Compute the precision.
The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
true positives and ``fp`` the number of false positives. The precision is
intuitively the ability of the sequence_tagger not to label as positive a sample.
The best value is 1 and the worst value is 0.
Args:
y_true : 2d array. Ground truth (correct) target values.
y_pred : 2d array. Estimated targets as returned by a tagger.
Returns:
score : float.
Example:
>>> from seqeval.metrics import precision_score
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> precision_score(y_true, y_pred)
0.50
"""
true_entities = set(get_entities(y_true, suffix))
pred_entities = set(get_entities(y_pred, suffix))
nb_correct = len(true_entities & pred_entities)
nb_pred = len(pred_entities)
score = nb_correct / nb_pred if nb_pred > 0 else 0
return score
def recall_score(y_true, y_pred, average='micro', suffix=False):
"""Compute the recall.
The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
true positives and ``fn`` the number of false negatives. The recall is
intuitively the ability of the sequence_tagger to find all the positive samples.
The best value is 1 and the worst value is 0.
Args:
y_true : 2d array. Ground truth (correct) target values.
y_pred : 2d array. Estimated targets as returned by a tagger.
Returns:
score : float.
Example:
>>> from seqeval.metrics import recall_score
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> recall_score(y_true, y_pred)
0.50
"""
true_entities = set(get_entities(y_true, suffix))
pred_entities = set(get_entities(y_pred, suffix))
nb_correct = len(true_entities & pred_entities)
nb_true = len(true_entities)
score = nb_correct / nb_true if nb_true > 0 else 0
return score
def performance_measure(y_true, y_pred):
"""
Compute the performance metrics: TP, FP, FN, TN
Args:
y_true : 2d array. Ground truth (correct) target values.
y_pred : 2d array. Estimated targets as returned by a tagger.
Returns:
performance_dict : dict
Example:
>>> from seqeval.metrics import performance_measure
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-ORG'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O'], ['B-PER', 'I-PER', 'O']]
>>> performance_measure(y_true, y_pred)
(3, 3, 1, 4)
"""
performace_dict = dict()
if any(isinstance(s, list) for s in y_true):
y_true = [item for sublist in y_true for item in sublist]
y_pred = [item for sublist in y_pred for item in sublist]
performace_dict['TP'] = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred)
if ((y_t != 'O') or (y_p != 'O')))
performace_dict['FP'] = sum(y_t != y_p for y_t, y_p in zip(y_true, y_pred))
performace_dict['FN'] = sum(((y_t != 'O') and (y_p == 'O'))
for y_t, y_p in zip(y_true, y_pred))
performace_dict['TN'] = sum((y_t == y_p == 'O')
for y_t, y_p in zip(y_true, y_pred))
return performace_dict
def classification_report(y_true, y_pred, digits=2, suffix=False):
"""Build a text report showing the main classification metrics.
Args:
y_true : 2d array. Ground truth (correct) target values.
y_pred : 2d array. Estimated targets as returned by a sequence_tagger.
digits : int. Number of digits for formatting output floating point values.
Returns:
report : string. Text summary of the precision, recall, F1 score for each class.
Examples:
>>> from seqeval.metrics import classification_report
>>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> print(classification_report(y_true, y_pred))
precision recall f1-score support
<BLANKLINE>
MISC 0.00 0.00 0.00 1
PER 1.00 1.00 1.00 1
<BLANKLINE>
avg / total 0.50 0.50 0.50 2
<BLANKLINE>
"""
true_entities = set(get_entities(y_true, suffix))
pred_entities = set(get_entities(y_pred, suffix))
name_width = 0
d1 = defaultdict(set)
d2 = defaultdict(set)
for e in true_entities:
d1[e[0]].add((e[1], e[2]))
name_width = max(name_width, len(e[0]))
for e in pred_entities:
d2[e[0]].add((e[1], e[2]))
last_line_heading = 'avg / total'
width = max(name_width, len(last_line_heading), digits)
headers = ["precision", "recall", "f1-score", "support"]
head_fmt = u'{:>{width}s} ' + u' {:>9}' * len(headers)
report = head_fmt.format(u'', *headers, width=width)
report += u'\n\n'
row_fmt = u'{:>{width}s} ' + u' {:>9.{digits}f}' * 3 + u' {:>9}\n'
ps, rs, f1s, s = [], [], [], []
for type_name, true_entities in d1.items():
pred_entities = d2[type_name]
nb_correct = len(true_entities & pred_entities)
nb_pred = len(pred_entities)
nb_true = len(true_entities)
p = nb_correct / nb_pred if nb_pred > 0 else 0
r = nb_correct / nb_true if nb_true > 0 else 0
f1 = 2 * p * r / (p + r) if p + r > 0 else 0
report += row_fmt.format(*[type_name, p, r, f1, nb_true], width=width, digits=digits)
ps.append(p)
rs.append(r)
f1s.append(f1)
s.append(nb_true)
report += u'\n'
# compute averages
report += row_fmt.format(last_line_heading,
np.average(ps, weights=s),
np.average(rs, weights=s),
np.average(f1s, weights=s),
np.sum(s),
width=width, digits=digits)
return report