kotoba-tech
/

kotoba-whisper-v1.1

@@ -1,62 +1,59 @@
 ---
-license: apache-2.0
 language: ja
 tags:
 - audio
 - automatic-speech-recognition
 - hf-asr-leaderboard
 widget:
 - example_title: CommonVoice 8.0 (Test Split)
-  src: >-
-    https://huggingface.co/datasets/japanese-asr/ja_asr.common_voice_8_0/resolve/main/sample.flac
 - example_title: JSUT Basic 5000
-  src: >-
-    https://huggingface.co/datasets/japanese-asr/ja_asr.jsut_basic5000/resolve/main/sample.flac
 - example_title: ReazonSpeech (Test Split)
-  src: >-
-    https://huggingface.co/datasets/japanese-asr/ja_asr.reazonspeech_test/resolve/main/sample.flac
 pipeline_tag: automatic-speech-recognition
-metrics:
-- wer
 model-index:
-  - name: kotoba-tech/kotoba-whisper-v1.0
-    results:
-      - task:
-          type: automatic-speech-recognition
-        dataset:
-          name: CommonVoice_8.0 (Japanese)
-          type: japanese-asr/ja_asr.common_voice_8_0
-        metrics:
-          - name: WER
-            type: WER
-            value: 59.27
-          - name: CER
-            type: CER
-            value: 9.44
-      - task:
-          type: automatic-speech-recognition
-        dataset:
-          name: ReazonSpeech (Test)
-          type: japanese-asr/ja_asr.reazonspeech_test
-        metrics:
-          - name: WER
-            type: WER
-            value: 56.62
-          - name: CER
-            type: CER
-            value: 12.6
-      - task:
-          type: automatic-speech-recognition
-        dataset:
-          name: JSUT Basic5000
-          type: japanese-asr/ja_asr.jsut_basic5000
-        metrics:
-          - name: WER
-            type: WER
-            value: 64.36
-          - name: CER
-            type: CER
-            value: 8.48
 ---
 # Kotoba-Whisper-v1.1

 ---
 language: ja
+license: apache-2.0
 tags:
 - audio
 - automatic-speech-recognition
 - hf-asr-leaderboard
+metrics:
+- wer
 widget:
 - example_title: CommonVoice 8.0 (Test Split)
+  src: https://huggingface.co/datasets/japanese-asr/ja_asr.common_voice_8_0/resolve/main/sample.flac
 - example_title: JSUT Basic 5000
+  src: https://huggingface.co/datasets/japanese-asr/ja_asr.jsut_basic5000/resolve/main/sample.flac
 - example_title: ReazonSpeech (Test Split)
+  src: https://huggingface.co/datasets/japanese-asr/ja_asr.reazonspeech_test/resolve/main/sample.flac
 pipeline_tag: automatic-speech-recognition
 model-index:
+- name: kotoba-tech/kotoba-whisper-v1.0
+  results:
+  - task:
+      type: automatic-speech-recognition
+    dataset:
+      name: CommonVoice_8.0 (Japanese)
+      type: japanese-asr/ja_asr.common_voice_8_0
+    metrics:
+    - type: WER
+      value: 59.27
+      name: WER
+    - type: CER
+      value: 9.44
+      name: CER
+  - task:
+      type: automatic-speech-recognition
+    dataset:
+      name: ReazonSpeech (Test)
+      type: japanese-asr/ja_asr.reazonspeech_test
+    metrics:
+    - type: WER
+      value: 56.62
+      name: WER
+    - type: CER
+      value: 12.6
+      name: CER
+  - task:
+      type: automatic-speech-recognition
+    dataset:
+      name: JSUT Basic5000
+      type: japanese-asr/ja_asr.jsut_basic5000
+    metrics:
+    - type: WER
+      value: 64.36
+      name: WER
+    - type: CER
+      value: 8.48
+      name: CER
 ---
 # Kotoba-Whisper-v1.1

config.json CHANGED Viewed

@@ -54,7 +54,7 @@
   "pad_token_id": 50256,
   "scale_embedding": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.41.0.dev0",
   "use_cache": true,
   "use_weighted_layer_sum": false,
   "vocab_size": 51866

   "pad_token_id": 50256,
   "scale_embedding": false,
   "torch_dtype": "float32",
+  "transformers_version": "4.40.1",
   "use_cache": true,
   "use_weighted_layer_sum": false,
   "vocab_size": 51866

generation_config.json CHANGED Viewed

@@ -261,5 +261,5 @@
     "transcribe": 50360,
     "translate": 50359
   },
-  "transformers_version": "4.41.0.dev0"
 }

     "transcribe": 50360,
     "translate": 50359
   },
+  "transformers_version": "4.40.1"
 }

kotoba_whisper.py CHANGED Viewed

@@ -24,7 +24,7 @@ class Punctuator:
     def punctuate(self, pipeline_chunk: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         def validate_punctuation(raw: str, punctuated: str):
-            if 'unk' in punctuated:
                 return raw
             if punctuated.count("。") > 1:
                 ind = punctuated.rfind("。")

     def punctuate(self, pipeline_chunk: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         def validate_punctuation(raw: str, punctuated: str):
+            if 'unk' in punctuated.lower() or any(p in raw for p in self.ja_punctuations):
                 return raw
             if punctuated.count("。") > 1:
                 ind = punctuated.rfind("。")