lucy1118 committed on
Commit
6043de8
·
verified ·
1 Parent(s): 0515f46

Upload 18 files

Browse files
livekit-plugins-silero/README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LiveKit Plugins Silero
2
+
3
+ Agent Framework Plugin for Silero. Currently supports Voice Activity Detection.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install livekit-plugins-silero
9
+ ```
10
+
11
+ This plugin relies on model files that must be downloaded before first use.
livekit-plugins-silero/build/lib/livekit/plugins/silero/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .vad import VAD, VADStream
16
+ from .version import __version__
17
+
18
+ __all__ = ["VAD", "VADStream", "__version__"]
19
+
20
+ import torch
21
+ from livekit.agents import Plugin
22
+
23
+
24
class SileroPlugin(Plugin):
    """Plugin entry for the Silero package, registered at import time below."""

    def __init__(self):
        """Pass this module's name, version and package to the base ``Plugin``."""
        super().__init__(__name__, __version__, __package__)

    def download_files(self):
        """Fetch the Silero VAD v4.0 model through ``torch.hub``.

        The loaded objects are discarded; the call is made only for the
        download that ``torch.hub.load`` performs.
        """
        torch.hub.load(
            repo_or_dir="snakers4/silero-vad:v4.0",
            model="silero_vad",
            onnx=True,
        )


# Importing the package is what registers the plugin.
Plugin.register_plugin(SileroPlugin())
livekit-plugins-silero/build/lib/livekit/plugins/silero/log.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ import logging
2
+
3
+ logger = logging.getLogger("livekit.plugins.silero")
livekit-plugins-silero/build/lib/livekit/plugins/silero/py.typed ADDED
File without changes
livekit-plugins-silero/build/lib/livekit/plugins/silero/vad.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import contextlib
19
+ import time
20
+ from collections import deque
21
+ from typing import List, Optional
22
+
23
+ import numpy as np
24
+ import torch
25
+ from livekit import agents, rtc
26
+
27
+ from .log import logger
28
+
29
+
30
class VAD(agents.vad.VAD):
    """Silero voice-activity detector; owns the model and creates streams."""

    def __init__(self, *, model_path: str | None = None, use_onnx: bool = True) -> None:
        """Load the Silero model.

        Args:
            model_path: path given to ``torch.jit.load``. When falsy, the model
                is fetched with ``torch.hub.load`` from
                ``snakers4/silero-vad:v4.0`` instead.
            use_onnx: forwarded as ``onnx=`` to ``torch.hub.load``; it is not
                consulted when ``model_path`` is provided.
        """
        if not model_path:
            # torch.hub returns a pair; only the first element (the model) is kept.
            hub_result = torch.hub.load(
                repo_or_dir="snakers4/silero-vad:v4.0",
                model="silero_vad",
                onnx=use_onnx,
            )
            model, _ = hub_result
        else:
            model = torch.jit.load(model_path)
            model.eval()
        self._model = model

    def stream(
        self,
        *,
        min_speaking_duration: float = 0.2,
        min_silence_duration: float = 0.8,
        padding_duration: float = 0.1,
        sample_rate: int = 16000,
        max_buffered_speech: float = 45.0,
        threshold: float = 0.2,
    ) -> "VADStream":
        """Create a ``VADStream`` that shares this detector's model.

        Every keyword argument is passed through unchanged to ``VADStream``;
        durations are expressed in seconds and ``sample_rate`` in Hz.
        """
        options = {
            "min_speaking_duration": min_speaking_duration,
            "min_silence_duration": min_silence_duration,
            "padding_duration": padding_duration,
            "sample_rate": sample_rate,
            "max_buffered_speech": max_buffered_speech,
            "threshold": threshold,
        }
        return VADStream(self._model, **options)
62
+
63
+
64
+ # Based on https://github.com/snakers4/silero-vad/blob/94504ece54c8caeebb808410b08ae55ee82dba82/utils_vad.py#L428
65
class VADStream(agents.vad.VADStream):
    """Streaming Silero voice-activity detection.

    Frames handed to ``push_frame`` are remixed to mono at ``sample_rate``,
    merged four at a time and run through the model on a worker thread.
    Events are read back through ``__anext__``; a ``None`` sentinel on the
    internal event queue ends the iteration.
    """

    # Number of queued frames merged into one inference window.
    _FRAMES_PER_INFERENCE = 4

    def __init__(
        self,
        model,
        *,
        min_speaking_duration: float,
        min_silence_duration: float,
        padding_duration: float,
        sample_rate: int,
        max_buffered_speech: float,
        threshold: float,
    ) -> None:
        """Set up the stream state and start the background processing task.

        The task is created with ``asyncio.create_task``, so the constructor
        has to run while an event loop is running.

        Args:
            model: callable invoked as ``model(tensor, sample_rate)``.
            min_speaking_duration: seconds of speech needed before
                START_OF_SPEECH is emitted.
            min_silence_duration: seconds of silence needed before
                END_OF_SPEECH is emitted.
            padding_duration: seconds of audio kept as padding.
            sample_rate: inference sample rate in Hz; must be 8000 or 16000.
            max_buffered_speech: upper bound, in seconds, on buffered speech.
            threshold: probability at or above which a window counts as speech.

        Raises:
            ValueError: if ``sample_rate`` is neither 8000 nor 16000.
        """
        self._min_speaking_duration = min_speaking_duration
        self._min_silence_duration = min_silence_duration
        self._padding_duration = padding_duration
        self._sample_rate = sample_rate
        self._max_buffered_speech = max_buffered_speech
        self._threshold = threshold

        if sample_rate not in [8000, 16000]:
            raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")

        self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
        self._event_queue = asyncio.Queue[Optional[agents.vad.VADEvent]]()
        self._model = model

        self._closed = False
        self._speaking = False
        self._waiting_start = False
        self._waiting_end = False
        self._current_sample = 0
        # Sample indices of the current speech segment; given defined values
        # here so they exist before the first detection.
        self._start_speech = 0
        self._end_speech = 0
        self._filter = agents.utils.ExpFilter(0.8)
        # Durations converted from seconds to sample counts (may be floats).
        self._min_speaking_samples = min_speaking_duration * sample_rate
        self._min_silence_samples = min_silence_duration * sample_rate
        self._padding_duration_samples = padding_duration * sample_rate
        self._max_buffered_samples = max_buffered_speech * sample_rate

        # _queued_frames holds resampled frames awaiting inference;
        # _original_frames holds the matching untouched input frames.
        self._queued_frames: deque[rtc.AudioFrame] = deque()
        self._original_frames: deque[rtc.AudioFrame] = deque()
        self._buffered_frames: List[rtc.AudioFrame] = []
        self._main_task = asyncio.create_task(self._run())

    def push_frame(self, frame: rtc.AudioFrame) -> None:
        """Queue an audio frame for processing.

        Raises:
            ValueError: if the stream has already been closed.
        """
        if self._closed:
            raise ValueError("cannot push frame to closed stream")

        self._queue.put_nowait(frame)

    async def aclose(self, *, wait: bool = True) -> None:
        """Close the stream and wait for the background task to finish.

        Args:
            wait: when False the background task is cancelled instead of
                being left to drain the frames already queued.
        """
        self._closed = True
        if not wait:
            self._main_task.cancel()

        # Sentinel telling _run to stop once it reaches this point in the queue.
        self._queue.put_nowait(None)
        with contextlib.suppress(asyncio.CancelledError):
            await self._main_task

    async def _run(self):
        """Consume pushed frames and run inference until the close sentinel.

        Whatever the exit path, a ``None`` is put on the event queue so that
        ``__anext__`` stops iterating.
        """
        try:
            samples_40ms = self._sample_rate // 1000 * 40
            while True:
                frame = await self._queue.get()
                if frame is None:
                    break  # None is sent inside aclose

                self._queue.task_done()

                # resample to silero's sample rate
                resampled_frame = frame.remix_and_resample(
                    self._sample_rate, 1
                )  # TODO: This is technically wrong, fix when we have a better resampler
                self._original_frames.append(frame)
                self._queued_frames.append(resampled_frame)

                # Run inference by chunks of 40ms until we run out of data.
                # _run_inference consumes nothing when fewer than four frames
                # are queued, so the frame count is checked here as well;
                # otherwise frames longer than 10ms would make this loop spin
                # forever and the close sentinel would never be read.
                while len(self._queued_frames) >= self._FRAMES_PER_INFERENCE:
                    available_length = sum(
                        f.samples_per_channel for f in self._queued_frames
                    )
                    if available_length < samples_40ms:
                        break

                    # Shielded so a cancellation does not interrupt a window
                    # that is already being processed.
                    await asyncio.shield(self._run_inference())

        except Exception:
            logger.exception("silero stream failed")
        finally:
            self._event_queue.put_nowait(None)

    async def _run_inference(self) -> None:
        """Merge the next four queued frames, run the model and emit events."""
        # merge the first 4 frames (each is assumed to be 10ms)
        if len(self._queued_frames) < self._FRAMES_PER_INFERENCE:
            return

        original_frames = [
            self._original_frames.popleft()
            for _ in range(self._FRAMES_PER_INFERENCE)
        ]
        merged_frame = agents.utils.merge_frames(
            [
                self._queued_frames.popleft()
                for _ in range(self._FRAMES_PER_INFERENCE)
            ]
        )

        # convert the merged int16 samples to a float32 tensor in [-1, 1)
        tensor = torch.from_numpy(np.frombuffer(merged_frame.data, dtype=np.int16))
        tensor = tensor.to(torch.float32) / 32768.0

        # Run inference off the event loop thread. perf_counter is monotonic,
        # so the measured duration cannot be skewed by wall-clock adjustments.
        start_time = time.perf_counter()
        raw_prob = await asyncio.to_thread(
            lambda: self._model(tensor, self._sample_rate).item()
        )
        probability = self._filter.apply(1.0, raw_prob)
        inference_duration = time.perf_counter() - start_time

        # inference done
        # NOTE(review): _dispatch_event below queues a second INFERENCE_DONE
        # event (carrying frames and speaking state) for the same window;
        # confirm that consumers expect both.
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            probability=probability,
            raw_inference_prob=raw_prob,
            inference_duration=inference_duration,
        )
        self._event_queue.put_nowait(event)

        self._dispatch_event(original_frames, probability, raw_prob, inference_duration)
        self._current_sample += merged_frame.samples_per_channel

    def _dispatch_event(
        self,
        original_frames: List[rtc.AudioFrame],
        probability: float,
        raw_inference_prob: float,
        inference_duration: float,
    ):
        """
        Dispatches VAD events based on the speech probability and the options
        Args:
            original_frames: original frames of the current inference
            probability: filtered speech probability of the current window
            raw_inference_prob: probability as returned by the model
            inference_duration: time spent on the inference, in seconds
        """

        samples_10ms = self._sample_rate / 100
        padding_count = int(
            self._padding_duration_samples // samples_10ms
        )  # number of frames to keep for the padding (one side)

        self._buffered_frames.extend(original_frames)
        if (
            not self._speaking
            and not self._waiting_start
            and len(self._buffered_frames) > padding_count
        ):
            # idle: only keep the most recent frames as leading padding
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - padding_count :
            ]

        max_buffer_len = padding_count + max(
            int(self._max_buffered_samples // samples_10ms),
            int(self._min_speaking_samples // samples_10ms),
        )
        if len(self._buffered_frames) > max_buffer_len:
            # cap the buffer by dropping the oldest frames
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - max_buffer_len :
            ]

        if probability >= self._threshold:
            # speaking, wait for min_speaking_duration to trigger START_OF_SPEECH
            self._waiting_end = False
            if not self._waiting_start and not self._speaking:
                self._waiting_start = True
                self._start_speech = self._current_sample

            if self._waiting_start and (
                self._current_sample - self._start_speech >= self._min_speaking_samples
            ):
                self._waiting_start = False
                self._speaking = True

                # since we're waiting for the min_speaking_duration to trigger START_OF_SPEECH,
                # put the speech that was used to trigger the start here
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.START_OF_SPEECH,
                    samples_index=self._start_speech,
                    frames=self._buffered_frames[padding_count:],
                    speaking=True,
                )
                self._event_queue.put_nowait(event)

        # emitted for every window, whatever the probability
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            frames=original_frames,
            probability=probability,
            raw_inference_prob=raw_inference_prob,
            inference_duration=inference_duration,
            speaking=self._speaking,
        )
        self._event_queue.put_nowait(event)

        if probability < self._threshold:
            # stopped speaking, wait for min_silence_duration to trigger END_OF_SPEECH
            self._waiting_start = False
            if not self._waiting_end and self._speaking:
                self._waiting_end = True
                self._end_speech = self._current_sample

            if self._waiting_end and (
                self._current_sample - self._end_speech
                >= max(self._min_silence_samples, self._padding_duration_samples)
            ):
                self._waiting_end = False
                self._speaking = False
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.END_OF_SPEECH,
                    samples_index=self._end_speech,
                    duration=(self._end_speech - self._start_speech)
                    / self._sample_rate,
                    frames=self._buffered_frames,
                    speaking=False,
                )
                self._event_queue.put_nowait(event)

    async def __anext__(self) -> agents.vad.VADEvent:
        """Return the next event; stop iterating once the sentinel is read."""
        evt = await self._event_queue.get()
        if evt is None:
            raise StopAsyncIteration

        return evt
livekit-plugins-silero/build/lib/livekit/plugins/silero/version.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "0.5.2"
livekit-plugins-silero/livekit/plugins/silero/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .vad import VAD, VADStream
16
+ from .version import __version__
17
+
18
+ __all__ = ["VAD", "VADStream", "__version__"]
19
+
20
+ import torch
21
+ from livekit.agents import Plugin
22
+
23
+
24
class SileroPlugin(Plugin):
    """Plugin entry for the Silero package, registered at import time below."""

    def __init__(self):
        """Pass this module's name, version and package to the base ``Plugin``."""
        super().__init__(__name__, __version__, __package__)

    def download_files(self):
        """Fetch the Silero VAD v4.0 model through ``torch.hub``.

        The loaded objects are discarded; the call is made only for the
        download that ``torch.hub.load`` performs.
        """
        torch.hub.load(
            repo_or_dir="snakers4/silero-vad:v4.0",
            model="silero_vad",
            onnx=True,
        )


# Importing the package is what registers the plugin.
Plugin.register_plugin(SileroPlugin())
livekit-plugins-silero/livekit/plugins/silero/log.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ import logging
2
+
3
+ logger = logging.getLogger("livekit.plugins.silero")
livekit-plugins-silero/livekit/plugins/silero/py.typed ADDED
File without changes
livekit-plugins-silero/livekit/plugins/silero/vad.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import contextlib
19
+ import time
20
+ from collections import deque
21
+ from typing import List, Optional
22
+
23
+ import numpy as np
24
+ import torch
25
+ from livekit import agents, rtc
26
+
27
+ from .log import logger
28
+
29
+
30
class VAD(agents.vad.VAD):
    """Silero voice-activity detector; owns the model and creates streams."""

    def __init__(self, *, model_path: str | None = None, use_onnx: bool = True) -> None:
        """Load the Silero model.

        Args:
            model_path: path given to ``torch.jit.load``. When falsy, the model
                is fetched with ``torch.hub.load`` from
                ``snakers4/silero-vad:v4.0`` instead.
            use_onnx: forwarded as ``onnx=`` to ``torch.hub.load``; it is not
                consulted when ``model_path`` is provided.
        """
        if not model_path:
            # torch.hub returns a pair; only the first element (the model) is kept.
            hub_result = torch.hub.load(
                repo_or_dir="snakers4/silero-vad:v4.0",
                model="silero_vad",
                onnx=use_onnx,
            )
            model, _ = hub_result
        else:
            model = torch.jit.load(model_path)
            model.eval()
        self._model = model

    def stream(
        self,
        *,
        min_speaking_duration: float = 0.2,
        min_silence_duration: float = 0.8,
        padding_duration: float = 0.1,
        sample_rate: int = 16000,
        max_buffered_speech: float = 45.0,
        threshold: float = 0.2,
    ) -> "VADStream":
        """Create a ``VADStream`` that shares this detector's model.

        Every keyword argument is passed through unchanged to ``VADStream``;
        durations are expressed in seconds and ``sample_rate`` in Hz.
        """
        options = {
            "min_speaking_duration": min_speaking_duration,
            "min_silence_duration": min_silence_duration,
            "padding_duration": padding_duration,
            "sample_rate": sample_rate,
            "max_buffered_speech": max_buffered_speech,
            "threshold": threshold,
        }
        return VADStream(self._model, **options)
62
+
63
+
64
+ # Based on https://github.com/snakers4/silero-vad/blob/94504ece54c8caeebb808410b08ae55ee82dba82/utils_vad.py#L428
65
class VADStream(agents.vad.VADStream):
    """Streaming Silero voice-activity detection.

    Frames handed to ``push_frame`` are remixed to mono at ``sample_rate``,
    merged four at a time and run through the model on a worker thread.
    Events are read back through ``__anext__``; a ``None`` sentinel on the
    internal event queue ends the iteration.
    """

    # Number of queued frames merged into one inference window.
    _FRAMES_PER_INFERENCE = 4

    def __init__(
        self,
        model,
        *,
        min_speaking_duration: float,
        min_silence_duration: float,
        padding_duration: float,
        sample_rate: int,
        max_buffered_speech: float,
        threshold: float,
    ) -> None:
        """Set up the stream state and start the background processing task.

        The task is created with ``asyncio.create_task``, so the constructor
        has to run while an event loop is running.

        Args:
            model: callable invoked as ``model(tensor, sample_rate)``.
            min_speaking_duration: seconds of speech needed before
                START_OF_SPEECH is emitted.
            min_silence_duration: seconds of silence needed before
                END_OF_SPEECH is emitted.
            padding_duration: seconds of audio kept as padding.
            sample_rate: inference sample rate in Hz; must be 8000 or 16000.
            max_buffered_speech: upper bound, in seconds, on buffered speech.
            threshold: probability at or above which a window counts as speech.

        Raises:
            ValueError: if ``sample_rate`` is neither 8000 nor 16000.
        """
        self._min_speaking_duration = min_speaking_duration
        self._min_silence_duration = min_silence_duration
        self._padding_duration = padding_duration
        self._sample_rate = sample_rate
        self._max_buffered_speech = max_buffered_speech
        self._threshold = threshold

        if sample_rate not in [8000, 16000]:
            raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")

        self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
        self._event_queue = asyncio.Queue[Optional[agents.vad.VADEvent]]()
        self._model = model

        self._closed = False
        self._speaking = False
        self._waiting_start = False
        self._waiting_end = False
        self._current_sample = 0
        # Sample indices of the current speech segment; given defined values
        # here so they exist before the first detection.
        self._start_speech = 0
        self._end_speech = 0
        self._filter = agents.utils.ExpFilter(0.8)
        # Durations converted from seconds to sample counts (may be floats).
        self._min_speaking_samples = min_speaking_duration * sample_rate
        self._min_silence_samples = min_silence_duration * sample_rate
        self._padding_duration_samples = padding_duration * sample_rate
        self._max_buffered_samples = max_buffered_speech * sample_rate

        # _queued_frames holds resampled frames awaiting inference;
        # _original_frames holds the matching untouched input frames.
        self._queued_frames: deque[rtc.AudioFrame] = deque()
        self._original_frames: deque[rtc.AudioFrame] = deque()
        self._buffered_frames: List[rtc.AudioFrame] = []
        self._main_task = asyncio.create_task(self._run())

    def push_frame(self, frame: rtc.AudioFrame) -> None:
        """Queue an audio frame for processing.

        Raises:
            ValueError: if the stream has already been closed.
        """
        if self._closed:
            raise ValueError("cannot push frame to closed stream")

        self._queue.put_nowait(frame)

    async def aclose(self, *, wait: bool = True) -> None:
        """Close the stream and wait for the background task to finish.

        Args:
            wait: when False the background task is cancelled instead of
                being left to drain the frames already queued.
        """
        self._closed = True
        if not wait:
            self._main_task.cancel()

        # Sentinel telling _run to stop once it reaches this point in the queue.
        self._queue.put_nowait(None)
        with contextlib.suppress(asyncio.CancelledError):
            await self._main_task

    async def _run(self):
        """Consume pushed frames and run inference until the close sentinel.

        Whatever the exit path, a ``None`` is put on the event queue so that
        ``__anext__`` stops iterating.
        """
        try:
            samples_40ms = self._sample_rate // 1000 * 40
            while True:
                frame = await self._queue.get()
                if frame is None:
                    break  # None is sent inside aclose

                self._queue.task_done()

                # resample to silero's sample rate
                resampled_frame = frame.remix_and_resample(
                    self._sample_rate, 1
                )  # TODO: This is technically wrong, fix when we have a better resampler
                self._original_frames.append(frame)
                self._queued_frames.append(resampled_frame)

                # Run inference by chunks of 40ms until we run out of data.
                # _run_inference consumes nothing when fewer than four frames
                # are queued, so the frame count is checked here as well;
                # otherwise frames longer than 10ms would make this loop spin
                # forever and the close sentinel would never be read.
                while len(self._queued_frames) >= self._FRAMES_PER_INFERENCE:
                    available_length = sum(
                        f.samples_per_channel for f in self._queued_frames
                    )
                    if available_length < samples_40ms:
                        break

                    # Shielded so a cancellation does not interrupt a window
                    # that is already being processed.
                    await asyncio.shield(self._run_inference())

        except Exception:
            logger.exception("silero stream failed")
        finally:
            self._event_queue.put_nowait(None)

    async def _run_inference(self) -> None:
        """Merge the next four queued frames, run the model and emit events."""
        # merge the first 4 frames (each is assumed to be 10ms)
        if len(self._queued_frames) < self._FRAMES_PER_INFERENCE:
            return

        original_frames = [
            self._original_frames.popleft()
            for _ in range(self._FRAMES_PER_INFERENCE)
        ]
        merged_frame = agents.utils.merge_frames(
            [
                self._queued_frames.popleft()
                for _ in range(self._FRAMES_PER_INFERENCE)
            ]
        )

        # convert the merged int16 samples to a float32 tensor in [-1, 1)
        tensor = torch.from_numpy(np.frombuffer(merged_frame.data, dtype=np.int16))
        tensor = tensor.to(torch.float32) / 32768.0

        # Run inference off the event loop thread. perf_counter is monotonic,
        # so the measured duration cannot be skewed by wall-clock adjustments.
        start_time = time.perf_counter()
        raw_prob = await asyncio.to_thread(
            lambda: self._model(tensor, self._sample_rate).item()
        )
        probability = self._filter.apply(1.0, raw_prob)
        inference_duration = time.perf_counter() - start_time

        # inference done
        # NOTE(review): _dispatch_event below queues a second INFERENCE_DONE
        # event (carrying frames and speaking state) for the same window;
        # confirm that consumers expect both.
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            probability=probability,
            raw_inference_prob=raw_prob,
            inference_duration=inference_duration,
        )
        self._event_queue.put_nowait(event)

        self._dispatch_event(original_frames, probability, raw_prob, inference_duration)
        self._current_sample += merged_frame.samples_per_channel

    def _dispatch_event(
        self,
        original_frames: List[rtc.AudioFrame],
        probability: float,
        raw_inference_prob: float,
        inference_duration: float,
    ):
        """
        Dispatches VAD events based on the speech probability and the options
        Args:
            original_frames: original frames of the current inference
            probability: filtered speech probability of the current window
            raw_inference_prob: probability as returned by the model
            inference_duration: time spent on the inference, in seconds
        """

        samples_10ms = self._sample_rate / 100
        padding_count = int(
            self._padding_duration_samples // samples_10ms
        )  # number of frames to keep for the padding (one side)

        self._buffered_frames.extend(original_frames)
        if (
            not self._speaking
            and not self._waiting_start
            and len(self._buffered_frames) > padding_count
        ):
            # idle: only keep the most recent frames as leading padding
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - padding_count :
            ]

        max_buffer_len = padding_count + max(
            int(self._max_buffered_samples // samples_10ms),
            int(self._min_speaking_samples // samples_10ms),
        )
        if len(self._buffered_frames) > max_buffer_len:
            # cap the buffer by dropping the oldest frames
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - max_buffer_len :
            ]

        if probability >= self._threshold:
            # speaking, wait for min_speaking_duration to trigger START_OF_SPEECH
            self._waiting_end = False
            if not self._waiting_start and not self._speaking:
                self._waiting_start = True
                self._start_speech = self._current_sample

            if self._waiting_start and (
                self._current_sample - self._start_speech >= self._min_speaking_samples
            ):
                self._waiting_start = False
                self._speaking = True

                # since we're waiting for the min_speaking_duration to trigger START_OF_SPEECH,
                # put the speech that was used to trigger the start here
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.START_OF_SPEECH,
                    samples_index=self._start_speech,
                    frames=self._buffered_frames[padding_count:],
                    speaking=True,
                )
                self._event_queue.put_nowait(event)

        # emitted for every window, whatever the probability
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            frames=original_frames,
            probability=probability,
            raw_inference_prob=raw_inference_prob,
            inference_duration=inference_duration,
            speaking=self._speaking,
        )
        self._event_queue.put_nowait(event)

        if probability < self._threshold:
            # stopped speaking, wait for min_silence_duration to trigger END_OF_SPEECH
            self._waiting_start = False
            if not self._waiting_end and self._speaking:
                self._waiting_end = True
                self._end_speech = self._current_sample

            if self._waiting_end and (
                self._current_sample - self._end_speech
                >= max(self._min_silence_samples, self._padding_duration_samples)
            ):
                self._waiting_end = False
                self._speaking = False
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.END_OF_SPEECH,
                    samples_index=self._end_speech,
                    duration=(self._end_speech - self._start_speech)
                    / self._sample_rate,
                    frames=self._buffered_frames,
                    speaking=False,
                )
                self._event_queue.put_nowait(event)

    async def __anext__(self) -> agents.vad.VADEvent:
        """Return the next event; stop iterating once the sentinel is read."""
        evt = await self._event_queue.get()
        if evt is None:
            raise StopAsyncIteration

        return evt
livekit-plugins-silero/livekit/plugins/silero/version.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __version__ = "0.5.2"
livekit-plugins-silero/livekit_plugins_silero.egg-info/PKG-INFO ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: livekit-plugins-silero
3
+ Version: 0.5.2
4
+ Summary: Agent Framework Plugin for Silero
5
+ Home-page: https://github.com/livekit/agents
6
+ License: Apache-2.0
7
+ Project-URL: Documentation, https://docs.livekit.io
8
+ Project-URL: Website, https://livekit.io/
9
+ Project-URL: Source, https://github.com/livekit/agents
10
+ Keywords: webrtc,realtime,audio,video,livekit
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Topic :: Multimedia :: Sound/Audio
14
+ Classifier: Topic :: Multimedia :: Video
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3 :: Only
20
+ Requires-Python: >=3.9.0
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: livekit-agents~=0.7
23
+ Requires-Dist: torch<3,>=2
24
+ Requires-Dist: torchaudio>=2
25
+ Requires-Dist: numpy<2,>=1
26
+ Requires-Dist: onnxruntime~=1.17.0
27
+
28
+ # LiveKit Plugins Silero
29
+
30
+ Agent Framework Plugin for Silero. Currently supports Voice Activity Detection.
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ pip install livekit-plugins-silero
36
+ ```
37
+
38
+ This plugin relies on model files that must be downloaded before first use.
livekit-plugins-silero/livekit_plugins_silero.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ livekit/plugins/silero/__init__.py
5
+ livekit/plugins/silero/log.py
6
+ livekit/plugins/silero/py.typed
7
+ livekit/plugins/silero/vad.py
8
+ livekit/plugins/silero/version.py
9
+ livekit_plugins_silero.egg-info/PKG-INFO
10
+ livekit_plugins_silero.egg-info/SOURCES.txt
11
+ livekit_plugins_silero.egg-info/dependency_links.txt
12
+ livekit_plugins_silero.egg-info/requires.txt
13
+ livekit_plugins_silero.egg-info/top_level.txt
livekit-plugins-silero/livekit_plugins_silero.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
livekit-plugins-silero/livekit_plugins_silero.egg-info/requires.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ livekit-agents~=0.7
2
+ torch<3,>=2
3
+ torchaudio>=2
4
+ numpy<2,>=1
5
+ onnxruntime~=1.17.0
livekit-plugins-silero/livekit_plugins_silero.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ livekit
livekit-plugins-silero/pyproject.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
livekit-plugins-silero/setup.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import pathlib
17
+
18
+ import setuptools
19
+ import setuptools.command.build_py
20
+
21
here = pathlib.Path(__file__).parent.resolve()

# Execute version.py in a scratch namespace to obtain __version__ without
# importing the package itself, whose __init__ imports torch and livekit.agents.
# The file is read as UTF-8 explicitly, the same way README.md is read below,
# so the result does not depend on the platform's default encoding.
about = {}
version_file = here / "livekit" / "plugins" / "silero" / "version.py"
exec(version_file.read_text(encoding="utf-8"), about)


setuptools.setup(
    name="livekit-plugins-silero",
    version=about["__version__"],
    description="Agent Framework Plugin for Silero",
    long_description=(here / "README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/livekit/agents",
    cmdclass={},
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Multimedia :: Sound/Audio",
        "Topic :: Multimedia :: Video",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3 :: Only",
    ],
    keywords=["webrtc", "realtime", "audio", "video", "livekit"],
    license="Apache-2.0",
    # livekit is a namespace package shared with the other livekit distributions.
    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
    python_requires=">=3.9.0",
    install_requires=[
        "livekit-agents~=0.7",
        "torch >= 2, < 3",
        "torchaudio >= 2",
        "numpy >= 1, < 2",
        "onnxruntime~=1.17.0",
    ],
    # Ship the PEP 561 marker so type checkers use the inline annotations.
    package_data={
        "livekit.plugins.silero": ["py.typed"],
    },
    project_urls={
        "Documentation": "https://docs.livekit.io",
        "Website": "https://livekit.io/",
        "Source": "https://github.com/livekit/agents",
    },
)