"""Metrics to assess performance on sequence labeling task given prediction |
|
|
Functions named as ``*_score`` return a scalar value to maximize: the higher |
|
|
the better |
|
|
""" |
|
|
|
|
|
from __future__ import absolute_import |
|
|
from __future__ import division |
|
|
from __future__ import print_function |
|
|
|
|
|
from collections import defaultdict |
|
|
|
|
|
import numpy as np |
|
|
def get_entities(seq, suffix=False):
    """Gets entities from sequence.

    Args:
        seq (list): sequence of labels.
        suffix (bool): if True, tags are in suffix style (e.g. ``PER-B``)
            rather than prefix style (e.g. ``B-PER``).

    Returns:
        list: list of (chunk_type, chunk_start, chunk_end).

    Example:
        >>> from seqeval.metrics.sequence_labeling import get_entities
        >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        >>> get_entities(seq)
        [('PER', 0, 1), ('LOC', 3, 3)]
    """
    # Flatten a nested list of sentences, inserting 'O' between sentences so
    # that a chunk cannot span a sentence boundary.
    if any(isinstance(s, list) for s in seq):
        seq = [item for sublist in seq for item in sublist + ['O']]

    prev_tag = 'O'
    prev_type = ''
    begin_offset = 0
    chunks = []
    for i, chunk in enumerate(seq + ['O']):
        if suffix:
            tag = chunk[-1]
            type_ = chunk.split('-')[0]
        else:
            tag = chunk[0]
            type_ = chunk.split('-')[-1]

        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, i - 1))
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = i
        prev_tag = tag
        prev_type = type_

    return chunks
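
# Usage sketch: get_entities also accepts a nested list of per-sentence labels.
# Sentences are flattened with an 'O' inserted between them, so a chunk can never
# span a sentence boundary. For illustration, assuming IOB2-style tags:
#
#     >>> get_entities([['B-PER', 'I-PER'], ['B-LOC']])
#     [('PER', 0, 1), ('LOC', 3, 3)]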
|
|
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk ended between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_end: boolean.
    """
    chunk_end = False

    if prev_tag == 'E': chunk_end = True
    if prev_tag == 'S': chunk_end = True

    if prev_tag == 'B' and tag == 'B': chunk_end = True
    if prev_tag == 'B' and tag == 'S': chunk_end = True
    if prev_tag == 'B' and tag == 'O': chunk_end = True
    if prev_tag == 'I' and tag == 'B': chunk_end = True
    if prev_tag == 'I' and tag == 'S': chunk_end = True
    if prev_tag == 'I' and tag == 'O': chunk_end = True

    if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
        chunk_end = True

    return chunk_end
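
# Illustrative sketch of typical IOB2 transitions: an 'I' followed by 'O' closes
# the current chunk, while a 'B' followed by an 'I' of the same type does not.
#
#     >>> end_of_chunk(prev_tag='I', tag='O', prev_type='PER', type_='O')
#     True
#     >>> end_of_chunk(prev_tag='B', tag='I', prev_type='PER', type_='PER')
#     False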
|
|
def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk started between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_start: boolean.
    """
    chunk_start = False

    if tag == 'B': chunk_start = True
    if tag == 'S': chunk_start = True

    if prev_tag == 'E' and tag == 'E': chunk_start = True
    if prev_tag == 'E' and tag == 'I': chunk_start = True
    if prev_tag == 'S' and tag == 'E': chunk_start = True
    if prev_tag == 'S' and tag == 'I': chunk_start = True
    if prev_tag == 'O' and tag == 'E': chunk_start = True
    if prev_tag == 'O' and tag == 'I': chunk_start = True

    if tag != 'O' and tag != '.' and prev_type != type_:
        chunk_start = True

    return chunk_start
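
# Illustrative sketch: a 'B' tag always opens a chunk, while an 'I' that continues
# a chunk of the same type does not.
#
#     >>> start_of_chunk(prev_tag='O', tag='B', prev_type='', type_='PER')
#     True
#     >>> start_of_chunk(prev_tag='B', tag='I', prev_type='PER', type_='PER')
#     False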
|
|
def f1_score(y_true, y_pred, average='micro', suffix=False):
    """Compute the F1 score.

    The F1 score can be interpreted as a weighted average of the precision and
    recall, where an F1 score reaches its best value at 1 and worst score at 0.
    The relative contribution of precision and recall to the F1 score is
    equal. The formula for the F1 score is::

        F1 = 2 * (precision * recall) / (precision + recall)

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import f1_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> f1_score(y_true, y_pred)
        0.5
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)
    nb_true = len(true_entities)

    p = nb_correct / nb_pred if nb_pred > 0 else 0
    r = nb_correct / nb_true if nb_true > 0 else 0
    score = 2 * p * r / (p + r) if p + r > 0 else 0

    return score
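
# Usage sketch: the micro-averaged F1 returned above is the harmonic mean of the
# entity-level precision and recall computed over the same sequences (a minimal
# check using the docstring example):
#
#     >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
#     >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
#     >>> p, r = precision_score(y_true, y_pred), recall_score(y_true, y_pred)
#     >>> 2 * p * r / (p + r) == f1_score(y_true, y_pred)
#     True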
|
|
def accuracy_score(y_true, y_pred):
    """Accuracy classification score.

    In sequence labeling, this function computes token-level accuracy:
    each predicted label is compared with the corresponding label in
    y_true over the flattened sequences.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import accuracy_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> accuracy_score(y_true, y_pred)
        0.8
    """
    # Flatten a nested list of sentences into a single token sequence.
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]

    nb_correct = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred))
    nb_true = len(y_true)

    score = nb_correct / nb_true

    return score
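
# Illustrative sketch: the score is plain token-level accuracy, so a partially
# correct entity still earns credit for its correct tokens.
#
#     >>> accuracy_score([['B-PER', 'I-PER', 'O']], [['B-PER', 'O', 'O']])
#     0.6666666666666666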
|
|
def precision_score(y_true, y_pred, average='micro', suffix=False):
    """Compute the precision.

    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the tagger not to label as positive a sample
    that is negative.

    The best value is 1 and the worst value is 0.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import precision_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> precision_score(y_true, y_pred)
        0.5
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)

    score = nb_correct / nb_pred if nb_pred > 0 else 0

    return score
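
# Illustrative sketch: precision is computed over exact entity spans, not tokens,
# so a partially overlapping prediction counts as a false positive.
#
#     >>> precision_score([['B-PER', 'I-PER', 'O']], [['B-PER', 'O', 'O']])
#     0.0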
|
|
def recall_score(y_true, y_pred, average='micro', suffix=False):
    """Compute the recall.

    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the tagger to find all the positive samples.

    The best value is 1 and the worst value is 0.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import recall_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> recall_score(y_true, y_pred)
        0.5
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities)
    nb_true = len(true_entities)

    score = nb_correct / nb_true if nb_true > 0 else 0

    return score
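
# Illustrative sketch: recall is likewise span-level, so a truncated prediction of
# an entity counts as a missed (false negative) entity.
#
#     >>> recall_score([['B-PER', 'I-PER', 'O']], [['B-PER', 'O', 'O']])
#     0.0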
|
|
def performance_measure(y_true, y_pred):
    """Compute the performance metrics: TP, FP, FN, TN.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        performance_dict : dict

    Example:
        >>> from seqeval.metrics import performance_measure
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-ORG'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> performance_measure(y_true, y_pred)
        {'TP': 3, 'FP': 3, 'FN': 1, 'TN': 4}
    """
    performance_dict = dict()
    # Flatten a nested list of sentences into a single token sequence.
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]
    performance_dict['TP'] = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred)
                                 if ((y_t != 'O') or (y_p != 'O')))
    performance_dict['FP'] = sum(y_t != y_p for y_t, y_p in zip(y_true, y_pred))
    performance_dict['FN'] = sum(((y_t != 'O') and (y_p == 'O'))
                                 for y_t, y_p in zip(y_true, y_pred))
    performance_dict['TN'] = sum((y_t == y_p == 'O')
                                 for y_t, y_p in zip(y_true, y_pred))

    return performance_dict
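
# Illustrative sketch: the counts are token-level. Note that with the definitions
# above, a non-'O' gold token predicted as 'O' is counted under both FP and FN.
#
#     >>> performance_measure([['B-ORG', 'O']], [['O', 'O']])
#     {'TP': 0, 'FP': 1, 'FN': 1, 'TN': 1}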
|
|
def classification_report(y_true, y_pred, digits=2, suffix=False):
    """Build a text report showing the main classification metrics.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.
        digits : int. Number of digits for formatting output floating point values.

    Returns:
        report : string. Text summary of the precision, recall, F1 score for each class.

    Examples:
        >>> from seqeval.metrics import classification_report
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> print(classification_report(y_true, y_pred))
                     precision    recall  f1-score   support
        <BLANKLINE>
               MISC       0.00      0.00      0.00         1
                PER       1.00      1.00      1.00         1
        <BLANKLINE>
        avg / total       0.50      0.50      0.50         2
        <BLANKLINE>
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    # Group entity spans by type so per-type precision/recall can be computed.
    name_width = 0
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in true_entities:
        d1[e[0]].add((e[1], e[2]))
        name_width = max(name_width, len(e[0]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    last_line_heading = 'avg / total'
    width = max(name_width, len(last_line_heading), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = u'{:>{width}s} ' + u' {:>9}' * len(headers)
    report = head_fmt.format(u'', *headers, width=width)
    report += u'\n\n'

    row_fmt = u'{:>{width}s} ' + u' {:>9.{digits}f}' * 3 + u' {:>9}\n'

    ps, rs, f1s, s = [], [], [], []
    for type_name, true_entities in d1.items():
        pred_entities = d2[type_name]
        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(true_entities)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        report += row_fmt.format(*[type_name, p, r, f1, nb_true], width=width, digits=digits)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)

    report += u'\n'

    # Summary row: support-weighted averages over all entity types.
    report += row_fmt.format(last_line_heading,
                             np.average(ps, weights=s),
                             np.average(rs, weights=s),
                             np.average(f1s, weights=s),
                             np.sum(s),
                             width=width, digits=digits)

    return report
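

# Minimal usage sketch: running this module directly prints a small end-to-end
# report for the docstring example above. The _y_true/_y_pred names are only
# illustrative.
if __name__ == '__main__':
    _y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'],
               ['B-PER', 'I-PER', 'O']]
    _y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
               ['B-PER', 'I-PER', 'O']]
    print(classification_report(_y_true, _y_pred))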