Spaces:

transformersegmentation
/

segmentation_scores

Sleeping

App Files Files Community

codebyzeb committed on Apr 25, 2023

Commit

313d779

•

1 Parent(s): 35ec54c

Update segmentation_scores.py

Browse files

Files changed (1) hide show

segmentation_scores.py +7 -7

segmentation_scores.py CHANGED Viewed

@@ -38,10 +38,10 @@ Calculates how good are predicted segmentations, using boundary, token and type
 Args:
     predictions: list of segmented utterances to score. Each prediction
         should be a string with phonemes separated by spaces and estimated word boundaries
-        denoted by the token ';eword'.
     references: list of segmented utterances to score. Each reference
         should be a string with phonemes separated by spaces and gold word boundaries
-        denoted by the token ';eword'.
 Returns:
     type_fscore: lexicon f1 score
     type_precision: lexicon precision
@@ -57,7 +57,7 @@ Returns:
     boundary_noedge_recall: boundary recall, excluding utterance boundaries
 Examples:
     >>> segmentation_scores = evaluate.load("transformersegmentation/segmentation_scores")
-    >>> results = segmentation_scores.compute(references=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"], predictions=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"])
     >>> print(results)
     {'type_fscore': 1.0, 'type_precision': 1.0, 'type_recall': 1.0, 'token_fscore': 1.0, 'token_precision': 1.0, 'token_recall': 1.0, 'boundary_all_fscore': 1.0, 'boundary_all_precision': 1.0, 'boundary_all_recall': 1.0, 'boundary_noedge_fscore': 1.0, 'boundary_noedge_precision': 1.0, 'boundary_noedge_recall': 1.0}
 """
@@ -227,11 +227,11 @@ class segmentation_scores(evaluate.Metric):
         for utt in (utt for utt in text if utt.strip()):
             # list of phones in the utterance with word separator removed
             phone_in_utterance = [
-                phone for phone in utt.split(" ") if phone != ";eword"
             ]
             words_in_utterance = (
                 "".join(
-                    " " if phone == ";eword" else phone for phone in utt.split(" ")
                 )
                 .strip()
                 .split(" ")
@@ -252,9 +252,9 @@ class segmentation_scores(evaluate.Metric):
         Parameters
         ----------
         predictions : sequence of str
-            A suite of word utterances, each string using ';eword' as the word separator.
         references : sequence of str
-            A suite of word utterances, each string using ';eword' as the word separator.
         Returns
         -------

 Args:
     predictions: list of segmented utterances to score. Each prediction
         should be a string with phonemes separated by spaces and estimated word boundaries
+        denoted by the token 'WORD_BOUNDARY'.
     references: list of segmented utterances to score. Each reference
         should be a string with phonemes separated by spaces and gold word boundaries
+        denoted by the token 'WORD_BOUNDARY'.
 Returns:
     type_fscore: lexicon f1 score
     type_precision: lexicon precision
     boundary_noedge_recall: boundary recall, excluding utterance boundaries
 Examples:
     >>> segmentation_scores = evaluate.load("transformersegmentation/segmentation_scores")
+    >>> results = segmentation_scores.compute(references=["w ɛ ɹ WORD_BOUNDARY ɪ z WORD_BOUNDARY ð ɪ s WORD_BOUNDARY", "l ɪ ɾ əl WORD_BOUNDARY aɪ z WORD_BOUNDARY"], predictions=["w ɛ ɹ WORD_BOUNDARY ɪ z WORD_BOUNDARY ð ɪ s WORD_BOUNDARY", "l ɪ ɾ əl WORD_BOUNDARY aɪ z WORD_BOUNDARY"])
     >>> print(results)
     {'type_fscore': 1.0, 'type_precision': 1.0, 'type_recall': 1.0, 'token_fscore': 1.0, 'token_precision': 1.0, 'token_recall': 1.0, 'boundary_all_fscore': 1.0, 'boundary_all_precision': 1.0, 'boundary_all_recall': 1.0, 'boundary_noedge_fscore': 1.0, 'boundary_noedge_precision': 1.0, 'boundary_noedge_recall': 1.0}
 """
         for utt in (utt for utt in text if utt.strip()):
             # list of phones in the utterance with word separator removed
             phone_in_utterance = [
+                phone for phone in utt.split(" ") if phone != "WORD_BOUNDARY"
             ]
             words_in_utterance = (
                 "".join(
+                    " " if phone == "WORD_BOUNDARY" else phone for phone in utt.split(" ")
                 )
                 .strip()
                 .split(" ")
         Parameters
         ----------
         predictions : sequence of str
+            A suite of word utterances, each string using 'WORD_BOUNDARY' as the word separator.
         references : sequence of str
+            A suite of word utterances, each string using 'WORD_BOUNDARY' as the word separator.
         Returns
         -------