daihui.zhang committed
Commit · 85f1f06
1 Parent(s): c4470f1

add whisper fine tune for chinese
Browse files
- config.py +3 -2
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/analytics/coremldata.bin +3 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/coremldata.bin +3 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/metadata.json +64 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/model.mil +0 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/weights/weight.bin +3 -0
- moyoyo_asr_models/ggml-small.bin +3 -0
- transcribe/pipelines/pipe_vad.py +2 -1
config.py CHANGED
@@ -50,8 +50,9 @@ MAX_LENTH_ZH = 4
 WHISPER_PROMPT_EN = ""# "The following is an English sentence."
 MAX_LENGTH_EN= 8
 
-
-WHISPER_MODEL = 'large-v3-turbo-q5_0'
+WHISPER_MODEL = 'medium-q5_0'
+# WHISPER_MODEL = 'large-v3-turbo-q5_0'
+# WHISPER_MODEL = 'small'
 
 # LLM
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
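The hunk above switches the active Whisper model from large-v3-turbo-q5_0 to medium-q5_0 and keeps the alternatives as commented-out options. A minimal sketch of how a name like this is typically resolved to ggml weights such as the ones shipped in this commit; the resolve_whisper_model helper and the ggml-<name>.bin naming convention are assumptions for illustration, not code from this repo:

from pathlib import Path

MODEL_DIR = Path("moyoyo_asr_models")   # matches the directory added in this commit
WHISPER_MODEL = "medium-q5_0"           # value set in config.py above

def resolve_whisper_model(name: str) -> Path:
    # whisper.cpp-style weights are conventionally named ggml-<name>.bin
    path = MODEL_DIR / f"ggml-{name}.bin"
    if not path.exists():
        raise FileNotFoundError(f"whisper model weights not found: {path}")
    return path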
moyoyo_asr_models/ggml-small-encoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18ad2072ae82872c2ba8a187071e1e7d6c1105253685e7aa95138adcf07874e0
+size 207
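This and the other binary additions below are Git LFS pointer files rather than the binaries themselves: three lines giving the spec version, the sha256 object id, and the payload size in bytes. A minimal sketch of parsing one, assuming the standard pointer layout (parse_lfs_pointer is illustrative, not repo code):

def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "<key> <value>", e.g. "size 207".
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "oid": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:18ad2072ae82872c2ba8a187071e1e7d6c1105253685e7aa95138adcf07874e0
size 207"""
print(parse_lfs_pointer(pointer)["size"])  # 207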
moyoyo_asr_models/ggml-small-encoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05fe28591b40616fa0c34ad7b853133623f5300923ec812acb11459c411acf3b
+size 149
moyoyo_asr_models/ggml-small-encoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,64 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32)",
+        "shortDescription" : "",
+        "shape" : "[]",
+        "name" : "output",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 6,
+    "mlProgramOperationTypeHistogram" : {
+      "Linear" : 72,
+      "Matmul" : 24,
+      "Cast" : 2,
+      "Conv" : 2,
+      "Softmax" : 12,
+      "Add" : 25,
+      "LayerNorm" : 25,
+      "Mul" : 24,
+      "Transpose" : 49,
+      "Gelu" : 14,
+      "Reshape" : 48
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "12.0",
+      "tvOS" : "15.0",
+      "watchOS" : "8.0",
+      "iOS" : "15.0",
+      "macCatalyst" : "15.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 80 × 3000)",
+        "shortDescription" : "",
+        "shape" : "[1, 80, 3000]",
+        "name" : "logmel_data",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "coreml_encoder_small",
+    "method" : "predict"
+  }
+]
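Per the inputSchema above, the Core ML encoder takes a single fixed-shape input, logmel_data with shape [1, 80, 3000]: Whisper's 80 mel bins over its fixed 30-second window at a 10 ms hop (3000 frames). A sketch of building a correctly shaped dummy input; the constants restate the schema and standard Whisper front-end parameters, and none of this is repo code:

import numpy as np

N_MEL = 80                             # mel bins, per "shape" : "[1, 80, 3000]"
HOP_MS = 10                            # Whisper's mel hop (160 samples at 16 kHz)
WINDOW_S = 30                          # Whisper's fixed audio window
N_FRAMES = WINDOW_S * 1000 // HOP_MS   # = 3000

# Zero-filled buffer shaped like the encoder's "logmel_data" input.
logmel_data = np.zeros((1, N_MEL, N_FRAMES), dtype=np.float32)
assert logmel_data.shape == (1, 80, 3000)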
moyoyo_asr_models/ggml-small-encoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff.
moyoyo_asr_models/ggml-small-encoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87eed4ae76f11a2d4a50786bc7423d4b45c2d0d9ca05577a3bd2557452072eaf
+size 176339456
moyoyo_asr_models/ggml-small.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f6ef171491de375b741059400ba9a0aead023122b7a7db731b4943f9baa0f97
+size 487601984
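After git lfs pull materializes these files, each blob can be checked against its pointer by size and sha256 digest. A hedged sketch of that check (verify_lfs_object is illustrative, not part of this repo):

import hashlib
from pathlib import Path

def verify_lfs_object(path: Path, oid: str, size: int) -> bool:
    # Stream the file so a ~487 MB model is not loaded into memory at once.
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return path.stat().st_size == size and digest.hexdigest() == oid

# e.g. verify_lfs_object(Path("moyoyo_asr_models/ggml-small.bin"),
#                        "3f6ef171491de375b741059400ba9a0aead023122b7a7db731b4943f9baa0f97",
#                        487601984)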
transcribe/pipelines/pipe_vad.py CHANGED
@@ -97,6 +97,7 @@ class VadPipe(BasePipe):
     def _process_speech_chunk(self, source_audio:np.ndarray):
         speech_dict = self.vac(source_audio, return_seconds=False)
         if speech_dict:
+            logging.debug(f"🔊 {speech_dict} {self._offset}")
             relative_start_frame = None
             relative_end_frame = None
             start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
@@ -109,7 +110,7 @@
     def process(self, in_data: MetaItem) -> MetaItem:
         if self._offset == 0:
             self.vac.reset_states()
-            silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
+            # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
         source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
         speech_data = self._process_speech_chunk(source_audio)
 
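The new debug line logs the raw VAD result next to the pipe's running _offset. With return_seconds=False, Silero-style VAD iterators report absolute sample indices, so positions inside the current chunk come from subtracting the offset; a sketch of that bookkeeping under this assumption (to_relative_frames is illustrative, not a function from this repo):

def to_relative_frames(speech_dict: dict, offset: int):
    # speech_dict carries absolute sample indices, e.g. {"start": 163840};
    # "start" and "end" may each be absent depending on the chunk.
    start = speech_dict.get("start")
    end = speech_dict.get("end")
    rel_start = start - offset if start is not None else None
    rel_end = end - offset if end is not None else None
    return rel_start, rel_end

print(to_relative_frames({"start": 163840}, 160000))  # (3840, None)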