HoneyTian committed
Commit e027841 · 1 Parent(s): ca00d34
examples/silero_vad_by_webrtcvad/run.sh CHANGED
@@ -8,7 +8,7 @@ bash run.sh --stage 3 --stop_stage 5 --system_version centos \
   --noise_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/noise/**/*.wav" \
   --speech_patterns "/data/tianxing/HuggingDatasets/nx_noise/data/speech/dns3-speech/**/*.wav \
   /data/tianxing/HuggingDatasets/nx_noise/data/speech/nx-speech2/**/*.wav" \
-  --config_file yaml/config-1024-0-20.yaml
+  --config_file yaml/config-256-0-20.yaml
 
   END
 
examples/silero_vad_by_webrtcvad/step_5_export_model.py CHANGED
@@ -94,7 +94,9 @@ def main():
             "new_lstm_hidden_state": {2: "batch_size"},
         })
 
-    ort_session = ort.InferenceSession("model.onnx")
+    ort_session = ort.InferenceSession(
+        output_file.as_posix()
+    )
     input_feed = {
         "inputs": inputs.numpy(),
         "encoder_in_cache": encoder_in_cache.numpy(),
examples/silero_vad_by_webrtcvad/yaml/config-240-n10-20.yaml CHANGED
@@ -3,8 +3,8 @@ model_name: "silero_vad"
 # spec
 sample_rate: 8000
 nfft: 512
-win_size: 512
-hop_size: 256
+win_size: 240
+hop_size: 80
 win_type: hann
 
 # model
@@ -19,12 +19,12 @@ decoder_num_layers: 2
 
 # lsnr
 n_frame: 3
-min_local_snr_db: -5
+min_local_snr_db: -15
 max_local_snr_db: 30
 norm_tau: 1.
 
 # data
-min_snr_db: 0
+min_snr_db: -10
 max_snr_db: 20
 
 # train
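For context, min_snr_db / max_snr_db in the data section conventionally bound the random SNR at which noise is mixed into speech during training, so this change widens the mixing range to [-10, 20] dB. A minimal, hypothetical mixing sketch under that assumption (illustration only, not the repository's dataset code):

import numpy as np

# Hypothetical SNR-controlled mixing; assumes min_snr_db/max_snr_db bound the
# sampled SNR. Not taken from this repository.
def mix_at_random_snr(speech: np.ndarray, noise: np.ndarray,
                      min_snr_db: float = -10.0, max_snr_db: float = 20.0) -> np.ndarray:
    snr_db = np.random.uniform(min_snr_db, max_snr_db)
    speech_power = np.mean(speech ** 2) + 1e-8
    noise_power = np.mean(noise ** 2) + 1e-8
    # scale noise so that 10 * log10(speech_power / scaled_noise_power) == snr_db
    scale = np.sqrt(speech_power / (noise_power * 10.0 ** (snr_db / 10.0)))
    return speech + scale * noise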
examples/silero_vad_by_webrtcvad/yaml/config-256-0-20.yaml ADDED
@@ -0,0 +1,43 @@
+model_name: "silero_vad"
+
+# spec
+sample_rate: 8000
+nfft: 256
+win_size: 256
+hop_size: 128
+win_type: hann
+
+# model
+encoder_in_channels: 64
+encoder_hidden_channels: 128
+encoder_out_channels: 128
+encoder_kernel_size: 3
+encoder_num_layers: 4
+
+decoder_hidden_size: 128
+decoder_num_layers: 2
+
+# lsnr
+n_frame: 3
+min_local_snr_db: -5
+max_local_snr_db: 30
+norm_tau: 1.
+
+# data
+min_snr_db: 0
+max_snr_db: 20
+
+# train
+lr: 0.001
+lr_scheduler: "CosineAnnealingLR"
+lr_scheduler_kwargs:
+  T_max: 250000
+  eta_min: 0.0001
+
+max_epochs: 100
+clip_grad_norm: 10.0
+seed: 1234
+
+num_workers: 4
+batch_size: 128
+eval_steps: 25000
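The spec values above are in samples, so at sample_rate 8000 this new config uses a 32 ms analysis window with a 16 ms hop, while the updated config-240-n10-20.yaml uses 30 ms / 10 ms. A quick check of that arithmetic (the samples-as-unit interpretation is an assumption):

# Frame durations implied by the spec blocks in this commit, assuming
# win_size / hop_size are expressed in samples.
sample_rate = 8000
for name, win_size, hop_size in [("config-256-0-20", 256, 128),
                                 ("config-240-n10-20", 240, 80)]:
    print(f"{name}: window {win_size / sample_rate * 1000:.0f} ms, "
          f"hop {hop_size / sample_rate * 1000:.0f} ms")
# config-256-0-20: window 32 ms, hop 16 ms
# config-240-n10-20: window 30 ms, hop 10 ms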
toolbox/torchaudio/models/vad/native_silero_vad/check_model.py ADDED
@@ -0,0 +1,145 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+from pathlib import Path
+import tempfile
+import zipfile
+
+import onnx
+from onnx import numpy_helper, shape_inference
+
+from project_settings import project_path
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_path",
+        default=(project_path / "trained_models/native_silero_vad.zip").as_posix(),
+        type=str
+    )
+    parser.add_argument("--no-infer", action="store_true", help="skip shape inference")
+
+    args = parser.parse_args()
+    return args
+
+
+def shape2tuple(shape_proto):
+    dims = []
+    for d in shape_proto.dim:
+        dims.append(d.dim_value if (d.dim_value > 0) else None)
+    return tuple(dims)
+
+def summarize_tensor_proto(tensor_proto):
+    dims = tuple(tensor_proto.dims)
+    data_type = tensor_proto.data_type
+    try:
+        arr = numpy_helper.to_array(tensor_proto)
+        sample = arr.flatten()[:5].tolist()
+        return f"tensor shape={dims}, dtype={data_type}, sample={sample}…"
+    except Exception:
+        return f"tensor shape={dims}, dtype={data_type}, (cannot parse values)"
+
+def print_graph(graph: onnx.GraphProto, indent: int = 0, do_infer_shape: bool = True):
+    prefix = " " * indent
+
+    # infer shapes
+    if do_infer_shape:
+        temp_model = onnx.helper.make_model(graph)
+        inferred = shape_inference.infer_shapes(temp_model)
+        graph_to_use = inferred.graph
+        value_info = {vi.name: vi for vi in graph_to_use.value_info}
+    else:
+        graph_to_use = graph
+        value_info = {vi.name: vi for vi in graph_to_use.value_info}
+
+    print(f"{prefix}Graph '{graph.name}' (nodes = {len(graph_to_use.node)})")
+    # print graph inputs
+    for inp in graph_to_use.input:
+        name = inp.name
+        tp = inp.type.tensor_type
+        shape = shape2tuple(tp.shape)
+        print(f"{prefix} Input: {name}, shape={shape}, elem_type={tp.elem_type}")
+    # print graph outputs
+    for out in graph_to_use.output:
+        name = out.name
+        tp = out.type.tensor_type
+        shape = shape2tuple(tp.shape)
+        print(f"{prefix} Output: {name}, shape={shape}, elem_type={tp.elem_type}")
+    print()
+
+    # print nodes
+    for idx, node in enumerate(graph_to_use.node):
+        print(f"{prefix}[{idx}] op_type: {node.op_type}, name: {node.name}")
+        print(f"{prefix} inputs: {node.input}")
+        print(f"{prefix} outputs: {node.output}")
+
+        # print node attributes
+        for attr in node.attribute:
+            name = attr.name
+            t = attr.type
+            if t == onnx.AttributeProto.GRAPH:
+                subg = attr.g
+                print(f"{prefix} attr: {name} (GRAPH) -> subgraph '{subg.name}', {len(subg.node)} nodes")
+                # print the subgraph separately, as a complete graph of its own
+                print_graph(subg, indent=indent + 4, do_infer_shape=do_infer_shape)
+            elif t == onnx.AttributeProto.TENSOR:
+                desc = summarize_tensor_proto(attr.t)
+                print(f"{prefix} attr: {name} (TENSOR) -> {desc}")
+            elif t == onnx.AttributeProto.INTS:
+                print(f"{prefix} attr: {name} (INTS) -> {list(attr.ints)}")
+            elif t == onnx.AttributeProto.INT:
+                print(f"{prefix} attr: {name} (INT) -> {attr.i}")
+            elif t == onnx.AttributeProto.FLOAT:
+                print(f"{prefix} attr: {name} (FLOAT) -> {attr.f}")
+            elif t == onnx.AttributeProto.STRING:
+                try:
+                    s = attr.s.decode('utf-8')
+                except Exception:
+                    s = attr.s
+                print(f"{prefix} attr: {name} (STRING) -> {s}")
+            else:
+                print(f"{prefix} attr: {name} (type={t})")
+
+        # print shapes of intermediate tensors (if shape-inference info is available)
+        for out_name in node.output:
+            if out_name in value_info:
+                vi = value_info[out_name]
+                shape = shape2tuple(vi.type.tensor_type.shape)
+                print(f"{prefix} output tensor '{out_name}' shape: {shape}")
+        for in_name in node.input:
+            if in_name in value_info:
+                vi = value_info[in_name]
+                shape = shape2tuple(vi.type.tensor_type.shape)
+                print(f"{prefix} input tensor '{in_name}' shape: {shape}")
+
+        print()
+
+def print_model_with_branches(onnx_path: str, do_infer_shape: bool = True):
+    model = onnx.load(onnx_path)
+    onnx.checker.check_model(model)
+
+    print("=== Main graph ===")
+    print_graph(model.graph, indent=0, do_infer_shape=do_infer_shape)
+
+
+def main():
+    args = get_args()
+
+    model_path = Path(args.model_path)
+    if model_path.name.endswith(".zip"):
+        with zipfile.ZipFile(model_path.as_posix(), "r") as f_zip:
+            out_root = Path(tempfile.gettempdir()) / "cc_vad"
+            out_root.mkdir(parents=True, exist_ok=True)
+            f_zip.extractall(path=out_root)
+        model_path = out_root / model_path.stem
+
+    onnx_path = (model_path / "silero_vad.onnx").as_posix()
+
+    print_model_with_branches(onnx_path, do_infer_shape=not args.no_infer)
+
+    return
+
+
+if __name__ == "__main__":
+    main()
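One possible way to exercise the new script without going through the zip handling in main(); the extraction path below is an assumption based on the tempfile.gettempdir() / "cc_vad" logic above and is not documented in the commit:

# Hypothetical direct use of print_model_with_branches on an already-extracted
# model directory (path assumed; do_infer_shape=False matches --no-infer).
from toolbox.torchaudio.models.vad.native_silero_vad.check_model import print_model_with_branches

print_model_with_branches(
    "/tmp/cc_vad/native_silero_vad/silero_vad.onnx",
    do_infer_shape=False,
)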
toolbox/torchaudio/models/vad/native_silero_vad/inference_native_silero_vad_onnx.py CHANGED
@@ -102,6 +102,7 @@ class InferenceNativeSileroVadOnnx(object):
         context_size = 64 if self.config.sample_rate == 16000 else 32
 
         chunk = torch.cat(tensors=[context, chunk], dim=1)
+        # chunk shape: [1, 256+32=288]
         input_feed = {
             "input": chunk.numpy(),
             "state": state.numpy(),
@@ -114,6 +115,7 @@ class InferenceNativeSileroVadOnnx(object):
         vad_flag = torch.from_numpy(vad_flag)
         state = torch.from_numpy(state)
         context = chunk[..., -context_size:]
+        # context shape: [1, 32]
         return vad_flag, context, state
 
     def infer(self, signal: np.ndarray) -> np.ndarray:
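The two comments added here describe the streaming context handling: at 8 kHz the context is 32 samples (64 at 16 kHz), each incoming 256-sample chunk is prepended with it to form a [1, 288] model input, and the last 32 samples are kept for the next call. A standalone sketch of just that bookkeeping (shapes taken from the comments; the chunks are dummies):

import torch

# Context carry-over as annotated in the diff above.
context_size = 32                      # 8 kHz; would be 64 at 16 kHz
context = torch.zeros(1, context_size)

for _ in range(3):                     # three dummy 256-sample chunks
    chunk = torch.zeros(1, 256)
    model_input = torch.cat(tensors=[context, chunk], dim=1)   # [1, 288]
    # ... the ONNX session would consume model_input here ...
    context = model_input[..., -context_size:]                 # [1, 32]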