daihui.zhang committed
Commit · 85f1f06
1 Parent(s): c4470f1

add whisper fine tune for chinese
Browse files
- config.py +3 -2
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/analytics/coremldata.bin +3 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/coremldata.bin +3 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/metadata.json +64 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/model.mil +0 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/weights/weight.bin +3 -0
- moyoyo_asr_models/ggml-small.bin +3 -0
- transcribe/pipelines/pipe_vad.py +2 -1
config.py CHANGED
@@ -50,8 +50,9 @@ MAX_LENTH_ZH = 4
 WHISPER_PROMPT_EN = ""# "The following is an English sentence."
 MAX_LENGTH_EN= 8
 
-
-WHISPER_MODEL = 'large-v3-turbo-q5_0'
+WHISPER_MODEL = 'medium-q5_0'
+# WHISPER_MODEL = 'large-v3-turbo-q5_0'
+# WHISPER_MODEL = 'small'
 
 # LLM
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
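The hunk above switches the active Whisper model from large-v3-turbo-q5_0 to medium-q5_0 and keeps the alternatives as commented-out options. A minimal sketch of how a name like this is typically resolved to ggml weights such as the ones shipped in this commit; the resolve_whisper_model helper and the ggml-<name>.bin naming convention are assumptions for illustration, not code from this repo:

from pathlib import Path

MODEL_DIR = Path("moyoyo_asr_models")   # matches the directory added in this commit
WHISPER_MODEL = "medium-q5_0"           # value set in config.py above

def resolve_whisper_model(name: str) -> Path:
    # whisper.cpp-style weights are conventionally named ggml-<name>.bin
    path = MODEL_DIR / f"ggml-{name}.bin"
    if not path.exists():
        raise FileNotFoundError(f"whisper model weights not found: {path}")
    return path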
moyoyo_asr_models/ggml-small-encoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18ad2072ae82872c2ba8a187071e1e7d6c1105253685e7aa95138adcf07874e0
+size 207
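This and the other binary additions below are Git LFS pointer files rather than the binaries themselves: three lines giving the spec version, the sha256 object id, and the payload size in bytes. A minimal sketch of parsing one, assuming the standard pointer layout (parse_lfs_pointer is illustrative, not repo code):

def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "<key> <value>", e.g. "size 207".
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:18ad2072ae82872c2ba8a187071e1e7d6c1105253685e7aa95138adcf07874e0
size 207"""
print(parse_lfs_pointer(pointer)["size"])  # 207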
moyoyo_asr_models/ggml-small-encoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05fe28591b40616fa0c34ad7b853133623f5300923ec812acb11459c411acf3b
+size 149
moyoyo_asr_models/ggml-small-encoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,64 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32)",
+        "shortDescription" : "",
+        "shape" : "[]",
+        "name" : "output",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 6,
+    "mlProgramOperationTypeHistogram" : {
+      "Linear" : 72,
+      "Matmul" : 24,
+      "Cast" : 2,
+      "Conv" : 2,
+      "Softmax" : 12,
+      "Add" : 25,
+      "LayerNorm" : 25,
+      "Mul" : 24,
+      "Transpose" : 49,
+      "Gelu" : 14,
+      "Reshape" : 48
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "12.0",
+      "tvOS" : "15.0",
+      "watchOS" : "8.0",
+      "iOS" : "15.0",
+      "macCatalyst" : "15.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 80 × 3000)",
+        "shortDescription" : "",
+        "shape" : "[1, 80, 3000]",
+        "name" : "logmel_data",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "coreml_encoder_small",
+    "method" : "predict"
+  }
+]
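Per the inputSchema above, the Core ML encoder takes a single fixed-shape input, logmel_data with shape [1, 80, 3000]: Whisper's 80 mel bins over its fixed 30-second window at a 10 ms hop (3000 frames). A sketch of building a correctly shaped dummy input; the constants restate the schema and standard Whisper front-end parameters, and none of this is repo code:

import numpy as np

N_MEL = 80                             # mel bins, per "shape" : "[1, 80, 3000]"
HOP_MS = 10                            # Whisper's mel hop (160 samples at 16 kHz)
WINDOW_S = 30                          # Whisper's fixed audio window
N_FRAMES = WINDOW_S * 1000 // HOP_MS   # = 3000

# Zero-filled buffer shaped like the encoder's "logmel_data" input.
logmel_data = np.zeros((1, N_MEL, N_FRAMES), dtype=np.float32)
assert logmel_data.shape == (1, 80, 3000)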
moyoyo_asr_models/ggml-small-encoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff.
moyoyo_asr_models/ggml-small-encoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87eed4ae76f11a2d4a50786bc7423d4b45c2d0d9ca05577a3bd2557452072eaf
+size 176339456
moyoyo_asr_models/ggml-small.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f6ef171491de375b741059400ba9a0aead023122b7a7db731b4943f9baa0f97
+size 487601984
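After git lfs pull materializes these files, each blob can be checked against its pointer by size and sha256 digest. A hedged sketch of that check (verify_lfs_object is illustrative, not part of this repo):

import hashlib
from pathlib import Path

def verify_lfs_object(path: Path, oid: str, size: int) -> bool:
    # Stream the file so a ~487 MB model is not loaded into memory at once.
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return path.stat().st_size == size and digest.hexdigest() == oid

# e.g. verify_lfs_object(Path("moyoyo_asr_models/ggml-small.bin"),
#                        "3f6ef171491de375b741059400ba9a0aead023122b7a7db731b4943f9baa0f97",
#                        487601984)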
transcribe/pipelines/pipe_vad.py CHANGED
@@ -97,6 +97,7 @@ class VadPipe(BasePipe):
     def _process_speech_chunk(self, source_audio:np.ndarray):
         speech_dict = self.vac(source_audio, return_seconds=False)
         if speech_dict:
+            logging.debug(f"🔊 {speech_dict} {self._offset}")
             relative_start_frame = None
             relative_end_frame = None
             start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
@@ -109,7 +110,7 @@
     def process(self, in_data: MetaItem) -> MetaItem:
         if self._offset == 0:
             self.vac.reset_states()
-            silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
+            # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
         source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
         speech_data = self._process_speech_chunk(source_audio)
 
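The new debug line logs the raw VAD result next to the pipe's running _offset. With return_seconds=False, Silero-style VAD iterators report absolute sample indices, so positions inside the current chunk come from subtracting the offset; a sketch of that bookkeeping under this assumption (to_relative_frames is illustrative, not a function from this repo):

def to_relative_frames(speech_dict: dict, offset: int):
    # speech_dict carries absolute sample indices, e.g. {"start": 163840};
    # "start" and "end" may each be absent depending on the chunk.
    start = speech_dict.get("start")
    end = speech_dict.get("end")
    rel_start = start - offset if start is not None else None
    rel_end = end - offset if end is not None else None
    return rel_start, rel_end

print(to_relative_frames({"start": 163840}, 160000))  # (3840, None)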