daihui.zhang committed
Commit 85f1f06 · 1 Parent(s): c4470f1

add whisper fine tune for chinese

config.py CHANGED
@@ -50,8 +50,9 @@ MAX_LENTH_ZH = 4
 WHISPER_PROMPT_EN = ""# "The following is an English sentence."
 MAX_LENGTH_EN= 8
 
-# WHISPER_MODEL = 'medium-q5_0'
-WHISPER_MODEL = 'large-v3-turbo-q5_0'
+WHISPER_MODEL = 'medium-q5_0'
+# WHISPER_MODEL = 'large-v3-turbo-q5_0'
+# WHISPER_MODEL = 'small'
 
 # LLM
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
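For context on the change above: `WHISPER_MODEL` holds a bare whisper.cpp model name, which a loader would resolve against the same `MODEL_DIR` used for `LLM_MODEL_PATH`. A minimal sketch of that resolution, assuming the conventional `ggml-<name>.bin` naming (the `resolve_whisper_path` helper is hypothetical, not part of this repo):

    from pathlib import Path

    MODEL_DIR = Path("moyoyo_asr_models")  # assumption: mirrors config.py's MODEL_DIR

    def resolve_whisper_path(name: str, model_dir: Path = MODEL_DIR) -> str:
        # whisper.cpp ggml checkpoints are conventionally named ggml-<name>.bin
        path = model_dir / f"ggml-{name}.bin"
        if not path.exists():
            raise FileNotFoundError(f"whisper model not found: {path}")
        return path.as_posix()

    # e.g. resolve_whisper_path('small') -> 'moyoyo_asr_models/ggml-small.bin',
    # the checkpoint added later in this commit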
moyoyo_asr_models/ggml-small-encoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18ad2072ae82872c2ba8a187071e1e7d6c1105253685e7aa95138adcf07874e0
+size 207
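Note that the model files ADDED in this commit are Git LFS pointer files: the three lines record only the spec version, the sha256 oid of the real blob, and its size in bytes (207 here). A minimal sketch for checking a fetched blob against its pointer, using only the standard library:

    import hashlib
    from pathlib import Path

    def verify_lfs_blob(blob_path: str, expected_oid: str, expected_size: int) -> bool:
        # Compare a downloaded file against the oid/size from its LFS pointer
        data = Path(blob_path).read_bytes()
        return (len(data) == expected_size
                and hashlib.sha256(data).hexdigest() == expected_oid)

    # Values copied from the pointer above:
    ok = verify_lfs_blob(
        "moyoyo_asr_models/ggml-small-encoder.mlmodelc/analytics/coremldata.bin",
        "18ad2072ae82872c2ba8a187071e1e7d6c1105253685e7aa95138adcf07874e0",
        207,
    )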
moyoyo_asr_models/ggml-small-encoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05fe28591b40616fa0c34ad7b853133623f5300923ec812acb11459c411acf3b
+size 149
moyoyo_asr_models/ggml-small-encoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,64 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32)",
+        "shortDescription" : "",
+        "shape" : "[]",
+        "name" : "output",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 6,
+    "mlProgramOperationTypeHistogram" : {
+      "Linear" : 72,
+      "Matmul" : 24,
+      "Cast" : 2,
+      "Conv" : 2,
+      "Softmax" : 12,
+      "Add" : 25,
+      "LayerNorm" : 25,
+      "Mul" : 24,
+      "Transpose" : 49,
+      "Gelu" : 14,
+      "Reshape" : 48
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "12.0",
+      "tvOS" : "15.0",
+      "watchOS" : "8.0",
+      "iOS" : "15.0",
+      "macCatalyst" : "15.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 80 × 3000)",
+        "shortDescription" : "",
+        "shape" : "[1, 80, 3000]",
+        "name" : "logmel_data",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "coreml_encoder_small",
+    "method" : "predict"
+  }
+]
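This metadata describes the CoreML build of the Whisper small encoder: a single ML program taking `logmel_data`, a Float32 log-mel spectrogram of shape [1, 80, 3000] (80 mel bins × 3000 frames, i.e. one 30 s window), and returning one Float32 MultiArray, deployable from macOS 12 / iOS 15. whisper.cpp's CoreML build locates the `.mlmodelc` next to the ggml checkpoint automatically, but the encoder can also be exercised directly; a minimal sketch, assuming coremltools ≥ 6.1 (which added `CompiledMLModel` for loading compiled bundles):

    import numpy as np
    import coremltools as ct  # assumption: coremltools >= 6.1

    # Load the compiled encoder bundle added in this commit
    model = ct.models.CompiledMLModel("moyoyo_asr_models/ggml-small-encoder.mlmodelc")

    # Name and shape come from inputSchema above: Float32 [1, 80, 3000]
    logmel = np.zeros((1, 80, 3000), dtype=np.float32)

    # "method" : "predict" — run one 30 s window through the encoder
    out = model.predict({"logmel_data": logmel})
    print(out["output"].shape)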
moyoyo_asr_models/ggml-small-encoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
moyoyo_asr_models/ggml-small-encoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87eed4ae76f11a2d4a50786bc7423d4b45c2d0d9ca05577a3bd2557452072eaf
+size 176339456
moyoyo_asr_models/ggml-small.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f6ef171491de375b741059400ba9a0aead023122b7a7db731b4943f9baa0f97
+size 487601984
transcribe/pipelines/pipe_vad.py CHANGED
@@ -97,6 +97,7 @@ class VadPipe(BasePipe):
     def _process_speech_chunk(self, source_audio:np.ndarray):
         speech_dict = self.vac(source_audio, return_seconds=False)
         if speech_dict:
+            logging.debug(f"🔊 {speech_dict} {self._offset}")
             relative_start_frame = None
             relative_end_frame = None
             start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
@@ -109,7 +110,7 @@ class VadPipe(BasePipe):
     def process(self, in_data: MetaItem) -> MetaItem:
         if self._offset == 0:
             self.vac.reset_states()
-        silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
+        # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
         source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
         speech_data = self._process_speech_chunk(source_audio)
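Two small changes here: each raw VAD event dict is now logged together with `self._offset`, and the unused 100 ms silence buffer is commented out. The offset is what turns chunk-relative 'start'/'end' frame indices into absolute stream positions; a minimal sketch of that conversion, assuming 16 kHz audio and the Silero-style event layout shown above (`to_absolute_seconds` is hypothetical, not part of the pipeline):

    import logging

    SAMPLE_RATE = 16000  # assumption: VadPipe's sample rate

    def to_absolute_seconds(speech_dict: dict, offset_frames: int,
                            sample_rate: int = SAMPLE_RATE) -> dict:
        # Map chunk-relative 'start'/'end' frame indices to absolute stream seconds
        result = {}
        for key in ("start", "end"):
            frame = speech_dict.get(key)
            if frame is not None:
                result[key] = (offset_frames + frame) / sample_rate
        return result

    # e.g. an event {'start': 512} seen at offset 160000 began
    # (160000 + 512) / 16000 ≈ 10.03 s into the stream
    logging.debug(to_absolute_seconds({"start": 512}, 160000))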