ctl
/

wav2vec2-large-xlsr-cantonese

@@ -108,9 +108,9 @@ class CER(datasets.Metric):
             codebase_urls=["https://github.com/jitsi/jiwer/"],
             reference_urls=[
                 "https://en.wikipedia.org/wiki/Word_error_rate",
-                "https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates#whitespace",
             ],
         )
     def _compute(self, predictions, references):
-        return jiwer.wer(references, predictions, truth_transform=cer_transform, hypothesis_transform=cer_transform)

             codebase_urls=["https://github.com/jitsi/jiwer/"],
             reference_urls=[
                 "https://en.wikipedia.org/wiki/Word_error_rate",
+                "https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates",
             ],
         )
     def _compute(self, predictions, references):
+        return jiwer.wer(references, predictions, truth_transform=cer_transform, hypothesis_transform=cer_transform)

test_cer.py ADDED Viewed

	@@ -0,0 +1,119 @@

+# coding=utf-8
+# Copyright 2021 The HuggingFace Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import cer
+cer = cer.CER()  # rebinds the imported module name `cer` to one CER metric instance used by every test below
+class TestCER(unittest.TestCase):
+    """Unit tests for the character error rate (CER) metric.
+
+    Every test calls ``cer.compute`` on the module-level metric instance and
+    compares the result with a hand-computed value, using an absolute
+    tolerance of 1e-6 for non-zero rates.
+
+    In the comments, ``N`` is the reference length used as the denominator.
+    In the ASCII tests a character that is extra in the prediction is listed
+    under ``D`` and one that is missing from the prediction under ``I``
+    (see ``test_cer_del`` / ``test_cer_insert``).
+    """
+
+    def test_cer_case_senstive(self):
+        """Characters differing only in case count as substitutions."""
+        refs = ['White House']
+        preds = ['white house']
+        # S = 2, D = 0, I = 0, N = 11, CER = 2 / 11
+        char_error_rate = cer.compute(predictions=preds, references=refs)
+        self.assertAlmostEqual(char_error_rate, 0.1818181818, delta=1e-6)
+
+    def test_cer_whitespace(self):
+        """Whitespace handling, including runs of consecutive spaces."""
+        refs = ['were wolf']
+        preds = ['werewolf']
+        # S = 0, D = 0, I = 1, N = 9, CER = 1 / 9
+        char_error_rate = cer.compute(predictions=preds, references=refs)
+        self.assertAlmostEqual(char_error_rate, 0.1111111, delta=1e-6)
+        # consecutive whitespaces case 0
+        refs = ['werewolf']
+        preds = ['weae     wolf']
+        # S = 1, D = 1, I = 0, N = 8, CER = 0.25
+        char_error_rate = cer.compute(predictions=preds, references=refs)
+        self.assertAlmostEqual(char_error_rate, 0.25, delta=1e-6)
+        # consecutive whitespaces case 1
+        refs = ['were wolf']
+        preds = ['were               wolf']
+        # S = 0, D = 0, I = 0, N = 9, CER = 0
+        char_error_rate = cer.compute(predictions=preds, references=refs)
+        self.assertAlmostEqual(char_error_rate, 0.0, delta=1e-6)
+        # consecutive whitespaces case 2
+        refs = ['were   wolf']
+        preds = ['were               wolf']
+        # S = 0, D = 0, I = 0, N = 9, CER = 0
+        char_error_rate = cer.compute(predictions=preds, references=refs)
+        self.assertAlmostEqual(char_error_rate, 0.0, delta=1e-6)
+
+    def test_cer_sub(self):
+        """A single substituted character."""
+        refs = ['werewolf']
+        preds = ['weaewolf']
+        # S = 1, D = 0, I = 0, N = 8, CER = 0.125
+        char_error_rate = cer.compute(predictions=preds, references=refs)
+        self.assertAlmostEqual(char_error_rate, 0.125, delta=1e-6)
+
+    def test_cer_del(self):
+        """The prediction contains one character the reference lacks."""
+        refs = ['werewolf']
+        preds = ['wereawolf']
+        # S = 0, D = 1, I = 0, N = 8, CER = 0.125
+        char_error_rate = cer.compute(predictions=preds, references=refs)
+        self.assertAlmostEqual(char_error_rate, 0.125, delta=1e-6)
+
+    def test_cer_insert(self):
+        """The prediction lacks one character the reference contains."""
+        refs = ['werewolf']
+        preds = ['wereolf']
+        # S = 0, D = 0, I = 1, N = 8, CER = 0.125
+        char_error_rate = cer.compute(predictions=preds, references=refs)
+        self.assertAlmostEqual(char_error_rate, 0.125, delta=1e-6)
+
+    def test_cer_equal(self):
+        """Identical prediction and reference give exactly 0.0."""
+        refs = ['werewolf']
+        char_error_rate = cer.compute(predictions=refs, references=refs)
+        self.assertEqual(char_error_rate, 0.0)
+
+    def test_cer_list_of_seqs(self):
+        """The rate is computed over a list of several sequences."""
+        refs = ['werewolf', 'I am your father']
+        char_error_rate = cer.compute(predictions=refs, references=refs)
+        self.assertEqual(char_error_rate, 0.0)
+        refs = ['werewolf', 'I am your father', 'doge']
+        preds = ['werxwolf', 'I       am your father', 'doge']
+        # S = 1, D = 0, I = 0, N = 28, CER = 1 / 28
+        char_error_rate = cer.compute(predictions=preds, references=refs)
+        self.assertAlmostEqual(char_error_rate, 0.03571428, delta=1e-6)
+
+    def test_cer_unicode(self):
+        """Non-ASCII (Chinese) text is scored per character."""
+        ref = [u'我能吞下玻璃而不伤身体']
+        pred = [u' 能吞虾玻璃而 不霜身体啦']
+        # S = 3, D = 2, I = 0, N = 11
+        # CER = 5 / 11
+        char_error_rate = cer.compute(predictions=pred, references=ref)
+        self.assertAlmostEqual(char_error_rate, 0.4545454545, delta=1e-6)
+        ref = [u'我能吞', u'下玻璃而不伤身体']
+        pred = [u'我    能 吞 下 玻 璃', u'而不伤身体']
+        # S = 0, D = 5, I = 0, N = 11
+        # CER = 5 / 11
+        char_error_rate = cer.compute(predictions=pred, references=ref)
+        self.assertAlmostEqual(char_error_rate, 0.454545454545, delta=1e-6)
+        ref = [u'我能吞下玻璃而不伤身体']
+        char_error_rate = cer.compute(predictions=ref, references=ref)
+        # assertEqual rather than assertFalse(value, 0.0): the second
+        # positional argument of assertFalse is the failure message, not an
+        # expected value.
+        self.assertEqual(char_error_rate, 0.0)
+
+    def test_cer_empty(self):
+        """An empty reference is expected to raise ``ValueError``."""
+        # NOTE(review): unlike the other tests, the inputs here are bare
+        # strings rather than lists of strings -- confirm this is intended.
+        ref = ''
+        pred = 'Hypothesis'
+        with self.assertRaises(ValueError):
+            cer.compute(predictions=pred, references=ref)
+if __name__ == '__main__':  # run the test suite when this file is executed as a script
+    unittest.main()