update

Changed files:
- examples/silero_vad_by_webrtcvad/run.sh +1 -1
- examples/silero_vad_by_webrtcvad/step_5_export_model.py +3 -1
- examples/silero_vad_by_webrtcvad/yaml/config-240-n10-20.yaml +4 -4
- examples/silero_vad_by_webrtcvad/yaml/config-256-0-20.yaml +43 -0
- toolbox/torchaudio/models/vad/native_silero_vad/check_model.py +145 -0
- toolbox/torchaudio/models/vad/native_silero_vad/inference_native_silero_vad_onnx.py +2 -0
examples/silero_vad_by_webrtcvad/run.sh
CHANGED

@@ -8,7 +8,7 @@ bash run.sh --stage 3 --stop_stage 5 --system_version centos \
 --noise_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/noise/**/*.wav" \
 --speech_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech/**/*.wav \
 /data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2/**/*.wav" \
---config_file yaml/config-
+--config_file yaml/config-256-0-20.yaml

 END

examples/silero_vad_by_webrtcvad/step_5_export_model.py
CHANGED

@@ -94,7 +94,9 @@ def main():
         "new_lstm_hidden_state": {2: "batch_size"},
     })

-    ort_session = ort.InferenceSession(
+    ort_session = ort.InferenceSession(
+        output_file.as_posix()
+    )
     input_feed = {
         "inputs": inputs.numpy(),
         "encoder_in_cache": encoder_in_cache.numpy(),
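The hunk above only rebuilds the onnxruntime session from the exported file before feeding it the same tensors used during export. As a quick smoke test of any exported checkpoint, a hedged sketch like the one below (not part of this repository) builds a zero-filled feed from the graph's own input metadata instead of hard-coding names and shapes; it assumes all inputs are float32 and that dynamic axes such as "batch_size" can be set to 1.

# Sketch only: load an exported ONNX file and run one dummy forward pass.
import numpy as np
import onnxruntime as ort

ort_session = ort.InferenceSession("silero_vad.onnx")  # placeholder path

# Build a zero-filled feed from the graph's input metadata; dynamic
# dimensions (reported as strings such as "batch_size") become 1.
input_feed = {}
for meta in ort_session.get_inputs():
    shape = [d if isinstance(d, int) else 1 for d in meta.shape]
    input_feed[meta.name] = np.zeros(shape, dtype=np.float32)  # assumes float32 inputs

outputs = ort_session.run(None, input_feed)
for meta, out in zip(ort_session.get_outputs(), outputs):
    print(meta.name, out.shape)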
examples/silero_vad_by_webrtcvad/yaml/config-240-n10-20.yaml
CHANGED

@@ -3,8 +3,8 @@ model_name: "silero_vad"
 # spec
 sample_rate: 8000
 nfft: 512
-win_size:
-hop_size:
+win_size: 240
+hop_size: 80
 win_type: hann

 # model
@@ -19,12 +19,12 @@ decoder_num_layers: 2

 # lsnr
 n_frame: 3
-min_local_snr_db: -
+min_local_snr_db: -15
 max_local_snr_db: 30
 norm_tau: 1.

 # data
-min_snr_db:
+min_snr_db: -10
 max_snr_db: 20

 # train
examples/silero_vad_by_webrtcvad/yaml/config-256-0-20.yaml
ADDED

@@ -0,0 +1,43 @@
+model_name: "silero_vad"
+
+# spec
+sample_rate: 8000
+nfft: 256
+win_size: 256
+hop_size: 128
+win_type: hann
+
+# model
+encoder_in_channels: 64
+encoder_hidden_channels: 128
+encoder_out_channels: 128
+encoder_kernel_size: 3
+encoder_num_layers: 4
+
+decoder_hidden_size: 128
+decoder_num_layers: 2
+
+# lsnr
+n_frame: 3
+min_local_snr_db: -5
+max_local_snr_db: 30
+norm_tau: 1.
+
+# data
+min_snr_db: 0
+max_snr_db: 20
+
+# train
+lr: 0.001
+lr_scheduler: "CosineAnnealingLR"
+lr_scheduler_kwargs:
+  T_max: 250000
+  eta_min: 0.0001
+
+max_epochs: 100
+clip_grad_norm: 10.0
+seed: 1234
+
+num_workers: 4
+batch_size: 128
+eval_steps: 25000
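The new file name appears to follow the same convention as config-240-n10-20.yaml: window size, minimum mixing SNR, maximum mixing SNR (here a 256-sample window and 0 to 20 dB). The frame geometry implied by the spec block can be checked with a small illustrative calculation (not project code):

# Illustrative only: frame geometry implied by the spec block above.
sample_rate = 8000
nfft = 256
win_size = 256
hop_size = 128

win_ms = win_size / sample_rate * 1000      # 32.0 ms analysis window
hop_ms = hop_size / sample_rate * 1000      # 16.0 ms frame shift
frames_per_second = sample_rate / hop_size  # 62.5 frames per second
freq_bins = nfft // 2 + 1                   # 129 STFT bins per frame
print(win_ms, hop_ms, frames_per_second, freq_bins)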
toolbox/torchaudio/models/vad/native_silero_vad/check_model.py
ADDED

@@ -0,0 +1,145 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+from pathlib import Path
+import tempfile
+import zipfile
+
+import onnx
+from onnx import numpy_helper, shape_inference
+
+from project_settings import project_path
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_path",
+        default=(project_path / "trained_models/native_silero_vad.zip").as_posix(),
+        type=str
+    )
+    parser.add_argument("--no-infer", action="store_true", help="skip shape inference")
+
+    args = parser.parse_args()
+    return args
+
+
+def shape2tuple(shape_proto):
+    dims = []
+    for d in shape_proto.dim:
+        dims.append(d.dim_value if (d.dim_value > 0) else None)
+    return tuple(dims)
+
+def summarize_tensor_proto(tensor_proto):
+    dims = tuple(tensor_proto.dims)
+    data_type = tensor_proto.data_type
+    try:
+        arr = numpy_helper.to_array(tensor_proto)
+        sample = arr.flatten()[:5].tolist()
+        return f"tensor shape={dims}, dtype={data_type}, sample={sample}…"
+    except Exception:
+        return f"tensor shape={dims}, dtype={data_type}, (cannot parse values)"
+
+def print_graph(graph: onnx.GraphProto, indent: int = 0, do_infer_shape: bool = True):
+    prefix = " " * indent
+
+    # infer shapes
+    if do_infer_shape:
+        temp_model = onnx.helper.make_model(graph)
+        inferred = shape_inference.infer_shapes(temp_model)
+        graph_to_use = inferred.graph
+        value_info = {vi.name: vi for vi in graph_to_use.value_info}
+    else:
+        graph_to_use = graph
+        value_info = {vi.name: vi for vi in graph_to_use.value_info}
+
+    print(f"{prefix}Graph '{graph.name}' (nodes = {len(graph_to_use.node)})")
+    # print graph inputs
+    for inp in graph_to_use.input:
+        name = inp.name
+        tp = inp.type.tensor_type
+        shape = shape2tuple(tp.shape)
+        print(f"{prefix}  Input: {name}, shape={shape}, elem_type={tp.elem_type}")
+    # print graph outputs
+    for out in graph_to_use.output:
+        name = out.name
+        tp = out.type.tensor_type
+        shape = shape2tuple(tp.shape)
+        print(f"{prefix}  Output: {name}, shape={shape}, elem_type={tp.elem_type}")
+    print()
+
+    # print nodes
+    for idx, node in enumerate(graph_to_use.node):
+        print(f"{prefix}[{idx}] op_type: {node.op_type}, name: {node.name}")
+        print(f"{prefix}  inputs: {node.input}")
+        print(f"{prefix}  outputs: {node.output}")
+
+        # print node attributes
+        for attr in node.attribute:
+            name = attr.name
+            t = attr.type
+            if t == onnx.AttributeProto.GRAPH:
+                subg = attr.g
+                print(f"{prefix}  attr: {name} (GRAPH) -> subgraph '{subg.name}', {len(subg.node)} nodes")
+                # print the subgraph separately, as a complete graph of its own
+                print_graph(subg, indent=indent + 4, do_infer_shape=do_infer_shape)
+            elif t == onnx.AttributeProto.TENSOR:
+                desc = summarize_tensor_proto(attr.t)
+                print(f"{prefix}  attr: {name} (TENSOR) -> {desc}")
+            elif t == onnx.AttributeProto.INTS:
+                print(f"{prefix}  attr: {name} (INTS) -> {list(attr.ints)}")
+            elif t == onnx.AttributeProto.INT:
+                print(f"{prefix}  attr: {name} (INT) -> {attr.i}")
+            elif t == onnx.AttributeProto.FLOAT:
+                print(f"{prefix}  attr: {name} (FLOAT) -> {attr.f}")
+            elif t == onnx.AttributeProto.STRING:
+                try:
+                    s = attr.s.decode('utf-8')
+                except:
+                    s = attr.s
+                print(f"{prefix}  attr: {name} (STRING) -> {s}")
+            else:
+                print(f"{prefix}  attr: {name} (type={t})")
+
+        # print shapes of intermediate tensors (when shape inference produced value_info)
+        for out_name in node.output:
+            if out_name in value_info:
+                vi = value_info[out_name]
+                shape = shape2tuple(vi.type.tensor_type.shape)
+                print(f"{prefix}  output tensor '{out_name}' shape: {shape}")
+        for in_name in node.input:
+            if in_name in value_info:
+                vi = value_info[in_name]
+                shape = shape2tuple(vi.type.tensor_type.shape)
+                print(f"{prefix}  input tensor '{in_name}' shape: {shape}")
+
+        print()
+
+def print_model_with_branches(onnx_path: str, do_infer_shape: bool = True):
+    model = onnx.load(onnx_path)
+    onnx.checker.check_model(model)
+
+    print("=== Main graph ===")
+    print_graph(model.graph, indent=0, do_infer_shape=do_infer_shape)
+
+
+def main():
+    args = get_args()
+
+    model_path = Path(args.model_path)
+    if model_path.name.endswith(".zip"):
+        with zipfile.ZipFile(model_path.as_posix(), "r") as f_zip:
+            out_root = Path(tempfile.gettempdir()) / "cc_vad"
+            out_root.mkdir(parents=True, exist_ok=True)
+            f_zip.extractall(path=out_root)
+        model_path = out_root / model_path.stem
+
+    onnx_path = (model_path / "silero_vad.onnx").as_posix()
+
+    print_model_with_branches(onnx_path, do_infer_shape=not args.no_infer)
+
+    return
+
+
+if __name__ == "__main__":
+    main()
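A hypothetical way to call the new helpers directly, assuming the default zip has already been unpacked the way main() does it (under <tempdir>/cc_vad, which is /tmp on most Linux systems); the module path mirrors the file's location in the repository:

# Hypothetical usage; running check_model.py itself performs the unpacking.
from toolbox.torchaudio.models.vad.native_silero_vad.check_model import print_model_with_branches

print_model_with_branches(
    "/tmp/cc_vad/native_silero_vad/silero_vad.onnx",  # assumed extraction path
    do_infer_shape=True,
)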
toolbox/torchaudio/models/vad/native_silero_vad/inference_native_silero_vad_onnx.py
CHANGED

@@ -102,6 +102,7 @@ class InferenceNativeSileroVadOnnx(object):
         context_size = 64 if self.config.sample_rate == 16000 else 32

         chunk = torch.cat(tensors=[context, chunk], dim=1)
+        # chunk shape: [1, 256+32=288]
         input_feed = {
             "input": chunk.numpy(),
             "state": state.numpy(),
@@ -114,6 +115,7 @@ class InferenceNativeSileroVadOnnx(object):
         vad_flag = torch.from_numpy(vad_flag)
         state = torch.from_numpy(state)
         context = chunk[..., -context_size:]
+        # context shape: [1, 32]
         return vad_flag, context, state

    def infer(self, signal: np.ndarray) -> np.ndarray:
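The two added comments pin down the streaming contract at 8 kHz: each step consumes a 256-sample chunk, prepends the 32-sample context kept from the previous step, feeds the resulting 288-sample window to the ONNX model, and retains the last 32 samples as the next context. A hedged sketch of that outer loop is shown below; the actual loop lives in InferenceNativeSileroVadOnnx.infer(), and forward_chunk here is a hypothetical stand-in for the ONNX call.

# Sketch only: chunked VAD loop implied by the shape comments above (8 kHz).
import torch

chunk_size = 256        # samples per step at 8 kHz
context_size = 32       # 64 if sample_rate == 16000 else 32

signal = torch.zeros(1, 8000)            # placeholder one-second signal
context = torch.zeros(1, context_size)   # initial (silent) context
# state = ...                            # recurrent state; shape depends on the export

for begin in range(0, signal.shape[-1] - chunk_size + 1, chunk_size):
    chunk = signal[:, begin: begin + chunk_size]
    window = torch.cat(tensors=[context, chunk], dim=1)   # shape [1, 288]
    # vad_flag, state = forward_chunk(window, state)      # hypothetical ONNX call
    context = window[..., -context_size:]                 # shape [1, 32]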