update
Browse files
data/call_monitor/id-ID/voice/000a3f9a-b2bf-46fd-9c69-477fc62cda51_id-ID_1671935534167 - 副本.wav
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:cf9e6ef0ee87be308c8a59a1459836dc9229c83be37c5e7204586c385d8d7a84
|
3 |
-
size 32044
|
|
|
|
|
|
|
|
main.py
CHANGED
@@ -41,6 +41,8 @@ def click_ring_vad_button(audio: Tuple[int, np.ndarray],
|
|
41 |
max_silence_length_ms: int = 300,
|
42 |
start_ring_rate: float = 0.9,
|
43 |
end_ring_rate: float = 0.1,
|
|
|
|
|
44 |
):
|
45 |
global vad
|
46 |
|
@@ -64,8 +66,11 @@ def click_ring_vad_button(audio: Tuple[int, np.ndarray],
|
|
64 |
start_ring_rate=start_ring_rate,
|
65 |
end_ring_rate=end_ring_rate,
|
66 |
frame_length_ms=frame_length_ms,
|
|
|
67 |
padding_length_ms=padding_length_ms,
|
68 |
max_silence_length_ms=max_silence_length_ms,
|
|
|
|
|
69 |
sample_rate=sample_rate,
|
70 |
)
|
71 |
|
@@ -88,7 +93,7 @@ def click_ring_vad_button(audio: Tuple[int, np.ndarray],
|
|
88 |
time = np.arange(0, len(signal)) / sample_rate
|
89 |
plt.figure(figsize=(12, 5))
|
90 |
plt.plot(time, signal / 32768, color="b")
|
91 |
-
plt.plot(time, speech_probs
|
92 |
|
93 |
for start, end in vad_segments:
|
94 |
plt.axvline(x=start, ymin=0.15, ymax=0.85, color="g", linestyle="--", label="开始端点")
|
@@ -143,6 +148,10 @@ def main():
|
|
143 |
ring_start_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.9, step=0.05, label="start_ring_rate")
|
144 |
ring_end_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="end_ring_rate")
|
145 |
|
|
|
|
|
|
|
|
|
146 |
ring_button = gr.Button("retrieval", variant="primary")
|
147 |
|
148 |
with gr.Column(scale=1):
|
@@ -156,7 +165,8 @@ def main():
|
|
156 |
ring_model_name, ring_agg,
|
157 |
ring_frame_length_ms, ring_frame_step_ms,
|
158 |
ring_padding_length_ms, ring_max_silence_length_ms,
|
159 |
-
ring_start_ring_rate, ring_end_ring_rate
|
|
|
160 |
],
|
161 |
outputs=[ring_image, ring_end_points],
|
162 |
fn=click_ring_vad_button
|
@@ -170,7 +180,8 @@ def main():
|
|
170 |
ring_model_name, ring_agg,
|
171 |
ring_frame_length_ms, ring_frame_step_ms,
|
172 |
ring_padding_length_ms, ring_max_silence_length_ms,
|
173 |
-
ring_start_ring_rate, ring_end_ring_rate
|
|
|
174 |
],
|
175 |
outputs=[ring_image, ring_end_points],
|
176 |
)
|
|
|
41 |
max_silence_length_ms: int = 300,
|
42 |
start_ring_rate: float = 0.9,
|
43 |
end_ring_rate: float = 0.1,
|
44 |
+
max_speech_length_s: float = 2.0,
|
45 |
+
min_speech_length_s: float = 0.3,
|
46 |
):
|
47 |
global vad
|
48 |
|
|
|
66 |
start_ring_rate=start_ring_rate,
|
67 |
end_ring_rate=end_ring_rate,
|
68 |
frame_length_ms=frame_length_ms,
|
69 |
+
frame_step_ms=frame_step_ms,
|
70 |
padding_length_ms=padding_length_ms,
|
71 |
max_silence_length_ms=max_silence_length_ms,
|
72 |
+
max_speech_length_s=max_speech_length_s,
|
73 |
+
min_speech_length_s=min_speech_length_s,
|
74 |
sample_rate=sample_rate,
|
75 |
)
|
76 |
|
|
|
93 |
time = np.arange(0, len(signal)) / sample_rate
|
94 |
plt.figure(figsize=(12, 5))
|
95 |
plt.plot(time, signal / 32768, color="b")
|
96 |
+
plt.plot(time, speech_probs, color="gray")
|
97 |
|
98 |
for start, end in vad_segments:
|
99 |
plt.axvline(x=start, ymin=0.15, ymax=0.85, color="g", linestyle="--", label="开始端点")
|
|
|
148 |
ring_start_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.9, step=0.05, label="start_ring_rate")
|
149 |
ring_end_ring_rate = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="end_ring_rate")
|
150 |
|
151 |
+
with gr.Row():
|
152 |
+
ring_max_speech_length_s = gr.Slider(minimum=0.0, maximum=10.0, value=2.0, step=0.05, label="max_speech_length_s")
|
153 |
+
ring_min_speech_length_s = gr.Slider(minimum=0.0, maximum=2.0, value=0.3, step=0.05, label="min_speech_length_s")
|
154 |
+
|
155 |
ring_button = gr.Button("retrieval", variant="primary")
|
156 |
|
157 |
with gr.Column(scale=1):
|
|
|
165 |
ring_model_name, ring_agg,
|
166 |
ring_frame_length_ms, ring_frame_step_ms,
|
167 |
ring_padding_length_ms, ring_max_silence_length_ms,
|
168 |
+
ring_start_ring_rate, ring_end_ring_rate,
|
169 |
+
ring_max_speech_length_s, ring_min_speech_length_s
|
170 |
],
|
171 |
outputs=[ring_image, ring_end_points],
|
172 |
fn=click_ring_vad_button
|
|
|
180 |
ring_model_name, ring_agg,
|
181 |
ring_frame_length_ms, ring_frame_step_ms,
|
182 |
ring_padding_length_ms, ring_max_silence_length_ms,
|
183 |
+
ring_start_ring_rate, ring_end_ring_rate,
|
184 |
+
ring_max_speech_length_s, ring_min_speech_length_s
|
185 |
],
|
186 |
outputs=[ring_image, ring_end_points],
|
187 |
)
|
ring_vad_examples.json
CHANGED
@@ -1,38 +1,66 @@
|
|
1 |
[
|
2 |
[
|
3 |
"data/early_media/3300999628164249998.wav",
|
4 |
-
"webrtcvad", 3, 30,
|
5 |
],
|
6 |
[
|
7 |
"data/early_media/3300999628164852605.wav",
|
8 |
-
"webrtcvad", 3, 30,
|
9 |
],
|
10 |
[
|
11 |
"data/early_media/3300999628164249998.wav",
|
12 |
-
"silerovad", 3, 35,
|
13 |
],
|
14 |
[
|
15 |
"data/early_media/3300999628164852605.wav",
|
16 |
-
"silerovad", 3, 35,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
],
|
18 |
[
|
19 |
"data/early_media/3300999628164852605.wav",
|
20 |
-
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1
|
21 |
],
|
22 |
[
|
23 |
"data/early_media/62/3300999628999191096.wav",
|
24 |
-
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1
|
25 |
],
|
26 |
[
|
27 |
"data/early_media/62/33009996287818451333.wav",
|
28 |
-
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1
|
29 |
],
|
30 |
[
|
31 |
-
"data/call_monitor/id-ID/
|
32 |
-
"
|
33 |
],
|
34 |
[
|
35 |
-
"data/call_monitor/id-ID/noise/
|
36 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
]
|
38 |
]
|
|
|
1 |
[
|
2 |
[
|
3 |
"data/early_media/3300999628164249998.wav",
|
4 |
+
"webrtcvad", 3, 30, 30, 300, 300, 0.9, 0.1, 2.0, 0.3
|
5 |
],
|
6 |
[
|
7 |
"data/early_media/3300999628164852605.wav",
|
8 |
+
"webrtcvad", 3, 30, 30, 300, 300, 0.9, 0.1, 2.0, 0.3
|
9 |
],
|
10 |
[
|
11 |
"data/early_media/3300999628164249998.wav",
|
12 |
+
"silerovad", 3, 35, 35, 350, 350, 0.7, 0.3, 2.0, 0.3
|
13 |
],
|
14 |
[
|
15 |
"data/early_media/3300999628164852605.wav",
|
16 |
+
"silerovad", 3, 35, 35, 350, 350, 0.5, 0.5, 2.0, 0.3
|
17 |
+
],
|
18 |
+
[
|
19 |
+
"data/call_monitor/id-ID/noise_mute/000d7fba-80ce-4bd7-84fe-e9c43de30f4a_id-ID_1678495379262.wav",
|
20 |
+
"silerovad", 3, 35, 35, 350, 350, 0.7, 0.3, 2.0, 0.3
|
21 |
+
],
|
22 |
+
[
|
23 |
+
"data/call_monitor/id-ID/noise/00a0a2a3-14ff-4a84-8aee-b18b2fb65355_id-ID_1680237229413.wav",
|
24 |
+
"silerovad", 3, 35, 35, 350, 350, 0.7, 0.3, 2.0, 0.3
|
25 |
],
|
26 |
[
|
27 |
"data/early_media/3300999628164852605.wav",
|
28 |
+
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
|
29 |
],
|
30 |
[
|
31 |
"data/early_media/62/3300999628999191096.wav",
|
32 |
+
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
|
33 |
],
|
34 |
[
|
35 |
"data/early_media/62/33009996287818451333.wav",
|
36 |
+
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
|
37 |
],
|
38 |
[
|
39 |
+
"data/call_monitor/id-ID/noise/00a0a2a3-14ff-4a84-8aee-b18b2fb65355_id-ID_1680237229413.wav",
|
40 |
+
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
|
41 |
],
|
42 |
[
|
43 |
+
"data/call_monitor/id-ID/noise/000ad44a-fbad-4a22-ba5a-c6dc855779b2_id-ID_1672040947119.wav",
|
44 |
+
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
|
45 |
+
],
|
46 |
+
[
|
47 |
+
"data/call_monitor/id-ID/noise/000da369-6652-4601-b241-33ffbd52a224_id-ID_1676000326981.wav",
|
48 |
+
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
|
49 |
+
],
|
50 |
+
[
|
51 |
+
"data/call_monitor/id-ID/voicemail/00a20d31-e1cb-4c70-821b-6fd151b260ae_id-ID_1671762897272.wav",
|
52 |
+
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
|
53 |
+
],
|
54 |
+
[
|
55 |
+
"data/call_monitor/id-ID/voicemail/000b03b3-172e-4784-8510-24cf37e205ba_id-ID_1672193551438.wav",
|
56 |
+
"call_voice", 3, 300, 30, 300, 300, 0.2, 0.1, 2.0, 0.3
|
57 |
+
],
|
58 |
+
[
|
59 |
+
"data/call_monitor/id-ID/voice/000a3f9a-b2bf-46fd-9c69-477fc62cda51_id-ID_1671935534167.wav",
|
60 |
+
"call_voice", 3, 300, 30, 120, 300, 0.4, 0.1, 2.0, 0.3
|
61 |
+
],
|
62 |
+
[
|
63 |
+
"data/call_monitor/id-ID/voice/000cb369-a0ee-44aa-a213-18b036f1baf7_id-ID_1678762306513.wav",
|
64 |
+
"call_voice", 3, 300, 30, 120, 300, 0.4, 0.1, 2.0, 0.3
|
65 |
]
|
66 |
]
|
toolbox/vad/vad.py
CHANGED
@@ -107,6 +107,8 @@ class Vad(object):
|
|
107 |
frame_step_ms: int = 30,
|
108 |
padding_length_ms: int = 300,
|
109 |
max_silence_length_ms: int = 300,
|
|
|
|
|
110 |
sample_rate: int = 8000
|
111 |
):
|
112 |
self.model = model
|
@@ -115,13 +117,16 @@ class Vad(object):
|
|
115 |
self.frame_length_ms = frame_length_ms
|
116 |
self.padding_length_ms = padding_length_ms
|
117 |
self.max_silence_length_ms = max_silence_length_ms
|
|
|
|
|
118 |
self.sample_rate = sample_rate
|
119 |
|
120 |
# frames
|
121 |
self.frame_length = int(sample_rate * (frame_length_ms / 1000.0))
|
122 |
self.frame_step = int(sample_rate * (frame_step_ms / 1000.0))
|
123 |
self.frame_timestamp_s = 0.0
|
124 |
-
self.signal_cache = np.zeros(shape=(self.frame_length,), dtype=np.int16)
|
|
|
125 |
|
126 |
# segments
|
127 |
self.num_padding_frames = int(padding_length_ms / frame_step_ms)
|
@@ -184,22 +189,23 @@ class Vad(object):
|
|
184 |
|
185 |
for f, _ in self.ring_buffer:
|
186 |
self.voiced_frames.append(f)
|
187 |
-
|
188 |
-
else:
|
189 |
-
self.voiced_frames.append(frame)
|
190 |
-
self.ring_buffer.append((frame, speech_prob))
|
191 |
-
num_voiced = sum([p for _, p in self.ring_buffer])
|
192 |
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
]
|
200 |
-
|
201 |
-
self.
|
202 |
-
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
def vad_segments_generator(self, segments_generator):
|
205 |
segments = list(segments_generator)
|
@@ -208,22 +214,31 @@ class Vad(object):
|
|
208 |
start = round(segment[1], 4)
|
209 |
end = round(segment[2], 4)
|
210 |
|
211 |
-
if self.
|
212 |
self.timestamp_start_s = start
|
213 |
self.timestamp_end_s = end
|
214 |
-
self.is_first_segment = False
|
215 |
continue
|
216 |
|
217 |
-
if self.timestamp_start_s:
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
|
228 |
def vad(self, signal: np.ndarray) -> List[list]:
|
229 |
segments = self.segments_generator(signal)
|
|
|
107 |
frame_step_ms: int = 30,
|
108 |
padding_length_ms: int = 300,
|
109 |
max_silence_length_ms: int = 300,
|
110 |
+
max_speech_length_s: float = 2.0,
|
111 |
+
min_speech_length_s: float = 0.3,
|
112 |
sample_rate: int = 8000
|
113 |
):
|
114 |
self.model = model
|
|
|
117 |
self.frame_length_ms = frame_length_ms
|
118 |
self.padding_length_ms = padding_length_ms
|
119 |
self.max_silence_length_ms = max_silence_length_ms
|
120 |
+
self.max_speech_length_s = max_speech_length_s
|
121 |
+
self.min_speech_length_s = min_speech_length_s
|
122 |
self.sample_rate = sample_rate
|
123 |
|
124 |
# frames
|
125 |
self.frame_length = int(sample_rate * (frame_length_ms / 1000.0))
|
126 |
self.frame_step = int(sample_rate * (frame_step_ms / 1000.0))
|
127 |
self.frame_timestamp_s = 0.0
|
128 |
+
# self.signal_cache = np.zeros(shape=(self.frame_length,), dtype=np.int16)
|
129 |
+
self.signal_cache = None
|
130 |
|
131 |
# segments
|
132 |
self.num_padding_frames = int(padding_length_ms / frame_step_ms)
|
|
|
189 |
|
190 |
for f, _ in self.ring_buffer:
|
191 |
self.voiced_frames.append(f)
|
192 |
+
continue
|
|
|
|
|
|
|
|
|
193 |
|
194 |
+
self.voiced_frames.append(frame)
|
195 |
+
self.ring_buffer.append((frame, speech_prob))
|
196 |
+
num_voiced = sum([p for _, p in self.ring_buffer])
|
197 |
+
|
198 |
+
if num_voiced < self.end_ring_rate * self.ring_buffer.maxlen:
|
199 |
+
segment = [
|
200 |
+
np.concatenate([f.signal for f in self.voiced_frames]),
|
201 |
+
self.voiced_frames[0].timestamp_s,
|
202 |
+
self.voiced_frames[-1].timestamp_s,
|
203 |
+
]
|
204 |
+
yield segment
|
205 |
+
self.triggered = False
|
206 |
+
self.ring_buffer.clear()
|
207 |
+
self.voiced_frames = []
|
208 |
+
continue
|
209 |
|
210 |
def vad_segments_generator(self, segments_generator):
|
211 |
segments = list(segments_generator)
|
|
|
214 |
start = round(segment[1], 4)
|
215 |
end = round(segment[2], 4)
|
216 |
|
217 |
+
if self.timestamp_start_s is None and self.timestamp_end_s is None:
|
218 |
self.timestamp_start_s = start
|
219 |
self.timestamp_end_s = end
|
|
|
220 |
continue
|
221 |
|
222 |
+
if self.timestamp_end_s - self.timestamp_start_s > self.max_speech_length_s:
|
223 |
+
end_ = self.timestamp_start_s + self.max_speech_length_s
|
224 |
+
vad_segment = [self.timestamp_start_s, end_]
|
225 |
+
yield vad_segment
|
226 |
+
self.timestamp_start_s = end_
|
227 |
+
|
228 |
+
silence_length_ms = (start - self.timestamp_end_s) * 1000
|
229 |
+
if silence_length_ms < self.max_silence_length_ms:
|
230 |
+
self.timestamp_end_s = end
|
231 |
+
continue
|
232 |
+
|
233 |
+
if self.timestamp_end_s - self.timestamp_start_s < self.min_speech_length_s:
|
234 |
+
self.timestamp_start_s = start
|
235 |
+
self.timestamp_end_s = end
|
236 |
+
continue
|
237 |
|
238 |
+
vad_segment = [self.timestamp_start_s, self.timestamp_end_s]
|
239 |
+
yield vad_segment
|
240 |
+
self.timestamp_start_s = start
|
241 |
+
self.timestamp_end_s = end
|
242 |
|
243 |
def vad(self, signal: np.ndarray) -> List[list]:
|
244 |
segments = self.segments_generator(signal)
|