|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" Segmentation scores evaluation metrics""" |
|
|
|
import evaluate |
|
import datasets |
|
|
|
|
|
|
|
_CITATION = """\ |
|
@InProceedings{huggingface:module, |
|
title = {A great new module}, |
|
authors={huggingface, Inc.}, |
|
year={2020} |
|
} |
|
""" |
|
|
|
|
|
_DESCRIPTION = """\ |
|
This module computes segmentation scores for a list of predicted segmentations and gold segmentations. |
|
""" |
|
|
|
|
|
|
|
_KWARGS_DESCRIPTION = """ |
|
Calculates how good are predicted segmentations, using boundary, token and type scores. |
|
Args: |
|
predictions: list of segmented utterances to score. Each predictions |
|
should be a string with phonemes separated by spaces and estimated word boundaries |
|
denoted by the token 'WORD_BOUNDARY'. |
|
references: list of segmented utterances to score. Each predictions |
|
should be a string with phonemes separated by spaces and gold word boundaries |
|
denoted by the token 'WORD_BOUNDARY'. |
|
Returns: |
|
type_fscore: lexicon f1 score |
|
type_precision: lexicon precision |
|
type_recall: lexicon recall |
|
token_fscore: token f1 score |
|
token_precision: token precision |
|
token_recall: token recall |
|
boundary_all_fscore: boundary f1 score, including utterance boundaries |
|
boundary_all_precision: boundary precision, including utterance boundaries |
|
boundary_all_recall: boundary recall, including utterance boundaries |
|
boundary_noedge_fscore: boundary f1 score, excluding utterance boundaries |
|
boundary_noedge_precision: boundary precision, excluding utterance boundaries |
|
boundary_noedge_recall: boundary recall, excluding utterance boundaries |
|
Examples: |
|
>>> segmentation_scores = evaluate.load("transformersegmentation/segmentation_scores") |
|
>>> results = segmentation_scores.compute(references=["w ɛ ɹ WORD_BOUNDARY ɪ z WORD_BOUNDARY ð ɪ s WORD_BOUNDARY", "l ɪ ɾ əl WORD_BOUNDARY aɪ z WORD_BOUNDARY"], predictions=["w ɛ ɹ WORD_BOUNDARY ɪ z WORD_BOUNDARY ð ɪ s WORD_BOUNDARY", "l ɪ ɾ əl WORD_BOUNDARY aɪ z WORD_BOUNDARY"]) |
|
>>> print(results) |
|
{'type_fscore': 1.0, 'type_precision': 1.0, 'type_recall': 1.0, 'token_fscore': 1.0, 'token_precision': 1.0, 'token_recall': 1.0, 'boundary_all_fscore': 1.0, 'boundary_all_precision': 1.0, 'boundary_all_recall': 1.0, 'boundary_noedge_fscore': 1.0, 'boundary_noedge_precision': 1.0, 'boundary_noedge_recall': 1.0} |
|
""" |
|
|
|
class TokenEvaluation(object):
    """Accumulates token-level precision, recall and f-score counts.

    Counts are fed in per utterance through `update`/`update_lists`;
    each score method returns None when its denominator is zero.
    """

    def __init__(self):
        # Per-side totals (placeholder "_" entries excluded), shared
        # correct count, utterance count and exact-match count.
        self.test = 0
        self.gold = 0
        self.correct = 0
        self.n = 0
        self.n_exactmatch = 0

    def precision(self):
        """Fraction of test items that are correct, or None if no test items."""
        if self.test == 0:
            return None
        return self.correct / float(self.test)

    def recall(self):
        """Fraction of gold items recovered, or None if no gold items."""
        if self.gold == 0:
            return None
        return self.correct / float(self.gold)

    def fscore(self):
        """F1 score (2*correct / (test+gold)), or None if both totals are zero."""
        denominator = self.test + self.gold
        if denominator == 0:
            return None
        return 2.0 * self.correct / denominator

    def exact_match(self):
        """Fraction of utterances whose test set equals the gold set exactly."""
        if not self.n:
            return None
        return self.n_exactmatch / float(self.n)

    def update(self, test_set, gold_set):
        """Accumulate counts for a single utterance (two sets of items)."""
        self.n += 1
        if test_set == gold_set:
            self.n_exactmatch += 1
        # "_" is the padding marker inserted by TypeEvaluation.lexicon_check;
        # padded entries are excluded from the per-side totals.
        self.test += sum(1 for item in test_set if item != "_")
        self.gold += sum(1 for item in gold_set if item != "_")
        # Items present on both sides count as correct.
        self.correct += len(test_set & gold_set)

    def update_lists(self, test_sets, gold_sets):
        """Accumulate counts over parallel lists of per-utterance sets.

        Raises ValueError if the two lists differ in length.
        """
        if len(test_sets) != len(gold_sets):
            raise ValueError(
                "#words different in test and gold: {} != {}".format(
                    len(test_sets), len(gold_sets)
                )
            )
        for test_set, gold_set in zip(test_sets, gold_sets):
            self.update(test_set, gold_set)
|
|
|
|
|
class TypeEvaluation(TokenEvaluation):
    """Evaluation of type (lexicon) f-score, precision and recall."""

    @staticmethod
    def lexicon_check(textlex, goldlex):
        """Align hypothesis and gold lexicons into parallel singleton-set lists.

        A word missing on one side is paired with the placeholder "_" so
        both lists have equal length; TokenEvaluation skips "_" entries
        when accumulating per-side totals.
        """
        textlist = []
        goldlist = []
        # Hypothesis words: matched gold words mirror the word, misses pad gold.
        for word in textlex:
            textlist.append(word)
            goldlist.append(word if word in goldlex else "_")
        # Gold-only words: appended with padding on the hypothesis side.
        for word in goldlex:
            if word not in goldlist:
                goldlist.append(word)
                textlist.append("_")
        # Wrap each entry as a singleton set for TokenEvaluation.update.
        return [{w} for w in textlist], [{w} for w in goldlist]

    def update_lists(self, text, gold):
        """Update counts from raw hypothesis and gold lexicons (word lists)."""
        aligned_text, aligned_gold = self.lexicon_check(text, gold)
        super(TypeEvaluation, self).update_lists(aligned_text, aligned_gold)
|
|
|
|
|
class BoundaryEvaluation(TokenEvaluation):
    """Boundary precision/recall/f-score, utterance edges included."""

    @staticmethod
    def get_boundary_positions(stringpos):
        """Flatten each utterance's (start, stop) spans into a set of
        boundary indices, keeping both edges of every word."""
        positions = []
        for line in stringpos:
            boundaries = set()
            for pair in line:
                boundaries.update(pair)
            positions.append(boundaries)
        return positions

    def update_lists(self, text, gold):
        """Update counts from per-utterance sets of (start, stop) word spans."""
        # Dispatch through self so BoundaryNoEdgeEvaluation's override applies.
        lt = self.get_boundary_positions(text)
        lg = self.get_boundary_positions(gold)
        super(BoundaryEvaluation, self).update_lists(lt, lg)
|
|
|
|
|
class BoundaryNoEdgeEvaluation(BoundaryEvaluation):
    """Boundary scores that ignore utterance-edge boundaries."""

    @staticmethod
    def get_boundary_positions(stringpos):
        """Keep only word-internal boundaries: each span's start index with
        index 0 (the utterance-initial edge) dropped; the utterance-final
        stop index is never a start, so it is implicitly excluded too."""
        positions = []
        for line in stringpos:
            positions.append({start for start, _ in line if start > 0})
        return positions
|
|
|
|
|
class _StringPos(object): |
|
"""Compute start and stop index of words in an utterance""" |
|
|
|
def __init__(self): |
|
self.idx = 0 |
|
|
|
def __call__(self, n): |
|
"""Return the position of the current word given its length `n`""" |
|
start = self.idx |
|
self.idx += n |
|
return start, self.idx |
|
|
|
|
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class segmentation_scores(evaluate.Metric):
    """Word-segmentation quality metric.

    Compares predicted segmentations against gold segmentations and reports
    type (lexicon), token and boundary precision/recall/f-scores. Each
    utterance is a string of space-separated phonemes with word boundaries
    marked by the token 'WORD_BOUNDARY'.
    """

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Both inputs are plain strings (one utterance per example).
            features=datasets.Features({
                'predictions': datasets.Value('string'),
                'references': datasets.Value('string'),
            }),
            homepage="https://huggingface.co/spaces/transformersegmentation/segmentation_scores",
            codebase_urls=["http://github.com/codebyzeb/transformersegmentation"],
            # NOTE(review): placeholder URL left over from the module template.
            reference_urls=["http://path.to.reference.url/new_module"]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores.

        This metric needs no external resources.
        """
        pass

    def _process_data(self, text):
        """Load text data for evaluation.

        Parameters
        ----------
        text : list of str
            The list of utterances to read for the evaluation. Empty or
            whitespace-only utterances are skipped.

        Returns
        -------
        (words, positions, lexicon) : three lists
            where `words` are the input utterances with word separators
            removed (one list of phonemes per utterance), `positions`
            stores the (start, stop) index of each word for each
            utterance, and `lexicon` is the sorted list of distinct words.
        """
        words = []
        positions = []
        lexicon = set()  # distinct words seen across all utterances

        for utt in (utt for utt in text if utt.strip()):
            phones = utt.split(" ")
            # Phoneme sequence with the boundary markers removed.
            phone_in_utterance = [
                phone for phone in phones if phone != "WORD_BOUNDARY"
            ]
            # Words: concatenate phonemes, turning each boundary marker
            # into a space, then split on spaces.
            words_in_utterance = (
                "".join(
                    " " if phone == "WORD_BOUNDARY" else phone for phone in phones
                )
                .strip()
                .split(" ")
            )

            words.append(phone_in_utterance)
            lexicon.update(words_in_utterance)
            # Record each word's (start, stop) phoneme span in the utterance.
            idx = _StringPos()
            positions.append({idx(len(word)) for word in words_in_utterance})

        return words, positions, sorted(lexicon)

    def _compute(self, predictions, references):
        """Scores a segmented text against its gold version.

        Parameters
        ----------
        predictions : sequence of str
            A suite of word utterances, each string using 'WORD_BOUNDARY' as word separator.
        references : sequence of str
            A suite of word utterances, each string using 'WORD_BOUNDARY' as word separator.

        Returns
        -------
        scores : dict
            A dictionary with the following entries:

            * 'type_fscore'
            * 'type_precision'
            * 'type_recall'
            * 'token_fscore'
            * 'token_precision'
            * 'token_recall'
            * 'boundary_all_fscore'
            * 'boundary_all_precision'
            * 'boundary_all_recall'
            * 'boundary_noedge_fscore'
            * 'boundary_noedge_precision'
            * 'boundary_noedge_recall'

        Raises
        ------
        ValueError
            If `gold` and `text` have different size or differ in tokens
        """
        text_words, text_stringpos, text_lex = self._process_data(predictions)
        gold_words, gold_stringpos, gold_lex = self._process_data(references)

        # The phoneme sequences must match exactly; only the boundary
        # placement may differ between predictions and references.
        if len(gold_words) != len(text_words):
            raise ValueError(
                "gold and train have different size: len(gold)={}, len(train)={}".format(
                    len(gold_words), len(text_words)
                )
            )

        for i, (g, t) in enumerate(zip(gold_words, text_words)):
            if g != t:
                raise ValueError(
                    'gold and train differ at line {}: gold="{}", train="{}"'.format(
                        i + 1, g, t
                    )
                )

        # Token scores: compare word spans per utterance.
        token_eval = TokenEvaluation()
        token_eval.update_lists(text_stringpos, gold_stringpos)

        # Type scores: compare the two lexicons.
        type_eval = TypeEvaluation()
        type_eval.update_lists(text_lex, gold_lex)

        # Boundary scores, with and without utterance-edge boundaries.
        boundary_eval = BoundaryEvaluation()
        boundary_eval.update_lists(text_stringpos, gold_stringpos)

        boundary_noedge_eval = BoundaryNoEdgeEvaluation()
        boundary_noedge_eval.update_lists(text_stringpos, gold_stringpos)

        return {
            "token_precision": token_eval.precision(),
            "token_recall": token_eval.recall(),
            "token_fscore": token_eval.fscore(),
            "type_precision": type_eval.precision(),
            "type_recall": type_eval.recall(),
            "type_fscore": type_eval.fscore(),
            "boundary_all_precision": boundary_eval.precision(),
            "boundary_all_recall": boundary_eval.recall(),
            "boundary_all_fscore": boundary_eval.fscore(),
            "boundary_noedge_precision": boundary_noedge_eval.precision(),
            "boundary_noedge_recall": boundary_noedge_eval.recall(),
            "boundary_noedge_fscore": boundary_noedge_eval.fscore(),
        }