Spaces:
Runtime error
Runtime error
Upload 18 files
Browse files- livekit-plugins-silero/README.md +11 -0
- livekit-plugins-silero/build/lib/livekit/plugins/silero/__init__.py +34 -0
- livekit-plugins-silero/build/lib/livekit/plugins/silero/log.py +3 -0
- livekit-plugins-silero/build/lib/livekit/plugins/silero/py.typed +0 -0
- livekit-plugins-silero/build/lib/livekit/plugins/silero/vad.py +291 -0
- livekit-plugins-silero/build/lib/livekit/plugins/silero/version.py +15 -0
- livekit-plugins-silero/livekit/plugins/silero/__init__.py +34 -0
- livekit-plugins-silero/livekit/plugins/silero/log.py +3 -0
- livekit-plugins-silero/livekit/plugins/silero/py.typed +0 -0
- livekit-plugins-silero/livekit/plugins/silero/vad.py +291 -0
- livekit-plugins-silero/livekit/plugins/silero/version.py +15 -0
- livekit-plugins-silero/livekit_plugins_silero.egg-info/PKG-INFO +38 -0
- livekit-plugins-silero/livekit_plugins_silero.egg-info/SOURCES.txt +13 -0
- livekit-plugins-silero/livekit_plugins_silero.egg-info/dependency_links.txt +1 -0
- livekit-plugins-silero/livekit_plugins_silero.egg-info/requires.txt +5 -0
- livekit-plugins-silero/livekit_plugins_silero.egg-info/top_level.txt +1 -0
- livekit-plugins-silero/pyproject.toml +3 -0
- livekit-plugins-silero/setup.py +65 -0
livekit-plugins-silero/README.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LiveKit Plugins Silero
|
2 |
+
|
3 |
+
Agent Framework Plugin for Silero. Currently supports Voice Activity Detection.
|
4 |
+
|
5 |
+
## Installation
|
6 |
+
|
7 |
+
```bash
|
8 |
+
pip install livekit-plugins-silero
|
9 |
+
```
|
10 |
+
|
11 |
+
This plugin relies on model files that must be downloaded before use.
|
livekit-plugins-silero/build/lib/livekit/plugins/silero/__init__.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023 LiveKit, Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from .vad import VAD, VADStream
|
16 |
+
from .version import __version__
|
17 |
+
|
18 |
+
__all__ = ["VAD", "VADStream", "__version__"]
|
19 |
+
|
20 |
+
import torch
|
21 |
+
from livekit.agents import Plugin
|
22 |
+
|
23 |
+
|
24 |
+
class SileroPlugin(Plugin):
    """Plugin descriptor that registers this package with ``livekit.agents``."""

    def __init__(self):
        # Identify the plugin by this module's name, the package version and
        # the package it belongs to.
        super().__init__(__name__, __version__, __package__)

    def download_files(self):
        """Load the pinned Silero VAD model (ONNX variant) via ``torch.hub``.

        The loaded objects are discarded; presumably the call is made only so
        that ``torch.hub`` fetches the model files ahead of first use.
        """
        torch.hub.load(
            repo_or_dir="snakers4/silero-vad:v4.0", model="silero_vad", onnx=True
        )


# Register a plugin instance as a side effect of importing the package.
Plugin.register_plugin(SileroPlugin())
|
livekit-plugins-silero/build/lib/livekit/plugins/silero/log.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
+
logger = logging.getLogger("livekit.plugins.silero")
|
livekit-plugins-silero/build/lib/livekit/plugins/silero/py.typed
ADDED
File without changes
|
livekit-plugins-silero/build/lib/livekit/plugins/silero/vad.py
ADDED
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023 LiveKit, Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from __future__ import annotations
|
16 |
+
|
17 |
+
import asyncio
|
18 |
+
import contextlib
|
19 |
+
import time
|
20 |
+
from collections import deque
|
21 |
+
from typing import List, Optional
|
22 |
+
|
23 |
+
import numpy as np
|
24 |
+
import torch
|
25 |
+
from livekit import agents, rtc
|
26 |
+
|
27 |
+
from .log import logger
|
28 |
+
|
29 |
+
|
30 |
+
class VAD(agents.vad.VAD):
    """Silero voice-activity detector; owns the model and creates streams."""

    def __init__(self, *, model_path: str | None = None, use_onnx: bool = True) -> None:
        """Load the Silero model.

        Args:
            model_path: Path to a TorchScript model loaded with
                ``torch.jit.load``. When falsy, the pinned ``silero_vad``
                model is loaded through ``torch.hub`` instead.
            use_onnx: Forwarded as ``onnx=`` to ``torch.hub.load``. It has no
                effect when ``model_path`` is given.
        """
        if model_path:
            model = torch.jit.load(model_path)
            model.eval()
        else:
            # torch.hub returns (model, utils); only the model is needed.
            model, _ = torch.hub.load(
                repo_or_dir="snakers4/silero-vad:v4.0",
                model="silero_vad",
                onnx=use_onnx,
            )
        self._model = model

    def stream(
        self,
        *,
        min_speaking_duration: float = 0.2,
        min_silence_duration: float = 0.8,
        padding_duration: float = 0.1,
        sample_rate: int = 16000,
        max_buffered_speech: float = 45.0,
        threshold: float = 0.2,
    ) -> "VADStream":
        """Create a ``VADStream`` that shares this detector's model.

        Args:
            min_speaking_duration: Seconds of speech required before
                START_OF_SPEECH is emitted.
            min_silence_duration: Seconds of silence required before
                END_OF_SPEECH is emitted.
            padding_duration: Seconds of audio kept as padding around speech.
            sample_rate: Rate the audio is resampled to before inference;
                ``VADStream`` accepts only 8000 or 16000.
            max_buffered_speech: Upper bound, in seconds, on buffered speech.
            threshold: Filtered probability at or above which audio counts
                as speech.

        Returns:
            A new ``VADStream``. Its constructor starts an asyncio task, so
            this must be called while an event loop is running.
        """
        return VADStream(
            self._model,
            min_speaking_duration=min_speaking_duration,
            min_silence_duration=min_silence_duration,
            padding_duration=padding_duration,
            sample_rate=sample_rate,
            max_buffered_speech=max_buffered_speech,
            threshold=threshold,
        )
|
62 |
+
|
63 |
+
|
64 |
+
# Based on https://github.com/snakers4/silero-vad/blob/94504ece54c8caeebb808410b08ae55ee82dba82/utils_vad.py#L428
|
65 |
+
class VADStream(agents.vad.VADStream):
    """Asynchronous Silero VAD stream.

    Frames pushed with ``push_frame`` are resampled, grouped four at a time
    and run through the model on a worker thread. Results are exposed as
    ``agents.vad.VADEvent`` objects through ``__anext__``.
    """

    # Number of queued frames merged into a single inference input. The
    # buffering logic assumes 10 ms frames, i.e. 40 ms per inference.
    _FRAMES_PER_INFERENCE = 4

    def __init__(
        self,
        model,
        *,
        min_speaking_duration: float,
        min_silence_duration: float,
        padding_duration: float,
        sample_rate: int,
        max_buffered_speech: float,
        threshold: float,
    ) -> None:
        """Store the options and start the background processing task.

        Args:
            model: Callable invoked as ``model(tensor, sample_rate)``; its
                result must expose ``.item()``.
            min_speaking_duration: Seconds of speech before START_OF_SPEECH.
            min_silence_duration: Seconds of silence before END_OF_SPEECH.
            padding_duration: Seconds of padding kept around speech.
            sample_rate: Inference sample rate; must be 8000 or 16000.
            max_buffered_speech: Upper bound, in seconds, on buffered speech.
            threshold: Filtered probability at or above which audio counts
                as speech.

        Raises:
            ValueError: If ``sample_rate`` is neither 8000 nor 16000.
            RuntimeError: Raised by ``asyncio.create_task`` when no event
                loop is running.
        """
        self._min_speaking_duration = min_speaking_duration
        self._min_silence_duration = min_silence_duration
        self._padding_duration = padding_duration
        self._sample_rate = sample_rate
        self._max_buffered_speech = max_buffered_speech
        self._threshold = threshold

        if sample_rate not in [8000, 16000]:
            raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")

        # ``None`` is the end-of-stream sentinel on both queues.
        self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
        self._event_queue = asyncio.Queue[Optional[agents.vad.VADEvent]]()
        self._model = model

        self._closed = False
        self._speaking = False
        self._waiting_start = False
        self._waiting_end = False
        self._current_sample = 0
        # Sample indexes of the current speech segment; initialised here so
        # the attributes always exist, and overwritten before they are read.
        self._start_speech = 0
        self._end_speech = 0
        # NOTE(review): presumably an exponential smoothing filter applied to
        # the raw probabilities -- confirm against agents.utils.ExpFilter.
        self._filter = agents.utils.ExpFilter(0.8)
        # Durations converted from seconds to sample counts (floats).
        self._min_speaking_samples = min_speaking_duration * sample_rate
        self._min_silence_samples = min_silence_duration * sample_rate
        self._padding_duration_samples = padding_duration * sample_rate
        self._max_buffered_samples = max_buffered_speech * sample_rate

        # Resampled frames awaiting inference and their untouched originals,
        # kept in lockstep (one entry each per pushed frame).
        self._queued_frames: deque[rtc.AudioFrame] = deque()
        self._original_frames: deque[rtc.AudioFrame] = deque()
        # Original frames retained for padding / speech segments.
        self._buffered_frames: List[rtc.AudioFrame] = []
        self._main_task = asyncio.create_task(self._run())

    def push_frame(self, frame: rtc.AudioFrame) -> None:
        """Queue an audio frame for processing.

        Raises:
            ValueError: If the stream has already been closed.
        """
        if self._closed:
            raise ValueError("cannot push frame to closed stream")

        self._queue.put_nowait(frame)

    async def aclose(self, *, wait: bool = True) -> None:
        """Close the stream and wait for the processing task to finish.

        Args:
            wait: When True, frames already queued are processed before the
                task stops. When False, the task is cancelled first.
        """
        self._closed = True
        if not wait:
            self._main_task.cancel()

        # The sentinel makes ``_run`` leave its loop once it is reached.
        self._queue.put_nowait(None)
        with contextlib.suppress(asyncio.CancelledError):
            await self._main_task

    async def _run(self):
        """Consume pushed frames and run inference until the sentinel arrives."""
        try:
            samples_40ms = self._sample_rate // 1000 * 40
            while True:
                frame = await self._queue.get()
                if frame is None:
                    break  # None is sent inside aclose

                self._queue.task_done()

                # resample to silero's sample rate
                resampled_frame = frame.remix_and_resample(
                    self._sample_rate, 1
                )  # TODO: This is technically wrong, fix when we have a better resampler
                self._original_frames.append(frame)
                self._queued_frames.append(resampled_frame)

                # Run inference by chunks of 40ms until we run out of data.
                # ``_run_inference`` consumes nothing when fewer than
                # ``_FRAMES_PER_INFERENCE`` frames are queued, so the frame
                # count is part of the loop condition; otherwise frames
                # longer than 10 ms would make this loop spin forever.
                while len(self._queued_frames) >= self._FRAMES_PER_INFERENCE:
                    available_length = sum(
                        f.samples_per_channel for f in self._queued_frames
                    )
                    if available_length < samples_40ms:
                        break

                    # Shielded so a cancellation does not interrupt an
                    # inference step halfway through its state updates.
                    await asyncio.shield(self._run_inference())

        except Exception:
            logger.exception("silero stream failed")
        finally:
            # Always terminate the event iterator, even on failure/cancel.
            self._event_queue.put_nowait(None)

    async def _run_inference(self) -> None:
        """Run the model on the next group of queued frames and emit events."""
        # merge the first 4 frames (we know each is 10ms)
        if len(self._queued_frames) < self._FRAMES_PER_INFERENCE:
            return

        original_frames = [
            self._original_frames.popleft() for _ in range(self._FRAMES_PER_INFERENCE)
        ]
        merged_frame = agents.utils.merge_frames(
            [self._queued_frames.popleft() for _ in range(self._FRAMES_PER_INFERENCE)]
        )

        # convert the merged int16 samples to a float32 tensor in [-1, 1)
        tensor = torch.from_numpy(np.frombuffer(merged_frame.data, dtype=np.int16))
        tensor = tensor.to(torch.float32) / 32768.0

        # Run inference off the event loop thread. ``perf_counter`` is
        # monotonic, so the measured duration cannot be skewed by wall-clock
        # adjustments.
        start_time = time.perf_counter()
        raw_prob = await asyncio.to_thread(
            lambda: self._model(tensor, self._sample_rate).item()
        )
        probability = self._filter.apply(1.0, raw_prob)
        inference_duration = time.perf_counter() - start_time

        # inference done
        # NOTE(review): ``_dispatch_event`` emits a second INFERENCE_DONE
        # event (with frames and speaking state) for the same inference;
        # kept as-is because consumers may rely on it -- confirm intent.
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            probability=probability,
            raw_inference_prob=raw_prob,
            inference_duration=inference_duration,
        )
        self._event_queue.put_nowait(event)

        self._dispatch_event(original_frames, probability, raw_prob, inference_duration)
        self._current_sample += merged_frame.samples_per_channel

    def _dispatch_event(
        self,
        original_frames: List[rtc.AudioFrame],
        probability: float,
        raw_inference_prob: float,
        inference_duration: float,
    ):
        """Update the speech state machine and queue the resulting events.

        Args:
            original_frames: original (non-resampled) frames of the current
                inference
            probability: filtered speech probability, compared to the
                threshold
            raw_inference_prob: unfiltered probability returned by the model
            inference_duration: seconds spent on the inference
        """

        samples_10ms = self._sample_rate / 100
        padding_count = int(
            self._padding_duration_samples // samples_10ms
        )  # number of frames to keep for the padding (one side)

        self._buffered_frames.extend(original_frames)
        if (
            not self._speaking
            and not self._waiting_start
            and len(self._buffered_frames) > padding_count
        ):
            # Idle: retain only the most recent ``padding_count`` frames.
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - padding_count :
            ]

        max_buffer_len = padding_count + max(
            int(self._max_buffered_samples // samples_10ms),
            int(self._min_speaking_samples // samples_10ms),
        )
        if len(self._buffered_frames) > max_buffer_len:
            # Cap the buffer by dropping the oldest frames.
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - max_buffer_len :
            ]

        if probability >= self._threshold:
            # speaking, wait for min_speaking_duration to trigger START_OF_SPEECH
            self._waiting_end = False
            if not self._waiting_start and not self._speaking:
                self._waiting_start = True
                self._start_speech = self._current_sample

            if self._waiting_start and (
                self._current_sample - self._start_speech >= self._min_speaking_samples
            ):
                self._waiting_start = False
                self._speaking = True

                # since we waited for min_speaking_duration before triggering
                # START_OF_SPEECH, include the frames that triggered the start
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.START_OF_SPEECH,
                    samples_index=self._start_speech,
                    frames=self._buffered_frames[padding_count:],
                    speaking=True,
                )
                self._event_queue.put_nowait(event)

        # emitted for every inference, regardless of the probability
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            frames=original_frames,
            probability=probability,
            raw_inference_prob=raw_inference_prob,
            inference_duration=inference_duration,
            speaking=self._speaking,
        )
        self._event_queue.put_nowait(event)

        if probability < self._threshold:
            # stopped speaking, wait for min_silence_duration to trigger END_OF_SPEECH
            self._waiting_start = False
            if not self._waiting_end and self._speaking:
                self._waiting_end = True
                self._end_speech = self._current_sample

            if self._waiting_end and (
                self._current_sample - self._end_speech
                >= max(self._min_silence_samples, self._padding_duration_samples)
            ):
                self._waiting_end = False
                self._speaking = False
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.END_OF_SPEECH,
                    samples_index=self._end_speech,
                    duration=(self._end_speech - self._start_speech)
                    / self._sample_rate,
                    # Copy: the live buffer is extended on the next
                    # inference, which would otherwise alter this event.
                    frames=list(self._buffered_frames),
                    speaking=False,
                )
                self._event_queue.put_nowait(event)

    async def __anext__(self) -> agents.vad.VADEvent:
        """Return the next event; stop iteration at the ``None`` sentinel."""
        evt = await self._event_queue.get()
        if evt is None:
            raise StopAsyncIteration

        return evt
|
livekit-plugins-silero/build/lib/livekit/plugins/silero/version.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023 LiveKit, Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
__version__ = "0.5.2"
|
livekit-plugins-silero/livekit/plugins/silero/__init__.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023 LiveKit, Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from .vad import VAD, VADStream
|
16 |
+
from .version import __version__
|
17 |
+
|
18 |
+
__all__ = ["VAD", "VADStream", "__version__"]
|
19 |
+
|
20 |
+
import torch
|
21 |
+
from livekit.agents import Plugin
|
22 |
+
|
23 |
+
|
24 |
+
class SileroPlugin(Plugin):
    """Plugin descriptor that registers this package with ``livekit.agents``."""

    def __init__(self):
        # Identify the plugin by this module's name, the package version and
        # the package it belongs to.
        super().__init__(__name__, __version__, __package__)

    def download_files(self):
        """Load the pinned Silero VAD model (ONNX variant) via ``torch.hub``.

        The loaded objects are discarded; presumably the call is made only so
        that ``torch.hub`` fetches the model files ahead of first use.
        """
        torch.hub.load(
            repo_or_dir="snakers4/silero-vad:v4.0", model="silero_vad", onnx=True
        )


# Register a plugin instance as a side effect of importing the package.
Plugin.register_plugin(SileroPlugin())
|
livekit-plugins-silero/livekit/plugins/silero/log.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
+
logger = logging.getLogger("livekit.plugins.silero")
|
livekit-plugins-silero/livekit/plugins/silero/py.typed
ADDED
File without changes
|
livekit-plugins-silero/livekit/plugins/silero/vad.py
ADDED
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023 LiveKit, Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from __future__ import annotations
|
16 |
+
|
17 |
+
import asyncio
|
18 |
+
import contextlib
|
19 |
+
import time
|
20 |
+
from collections import deque
|
21 |
+
from typing import List, Optional
|
22 |
+
|
23 |
+
import numpy as np
|
24 |
+
import torch
|
25 |
+
from livekit import agents, rtc
|
26 |
+
|
27 |
+
from .log import logger
|
28 |
+
|
29 |
+
|
30 |
+
class VAD(agents.vad.VAD):
    """Silero voice-activity detector; owns the model and creates streams."""

    def __init__(self, *, model_path: str | None = None, use_onnx: bool = True) -> None:
        """Load the Silero model.

        Args:
            model_path: Path to a TorchScript model loaded with
                ``torch.jit.load``. When falsy, the pinned ``silero_vad``
                model is loaded through ``torch.hub`` instead.
            use_onnx: Forwarded as ``onnx=`` to ``torch.hub.load``. It has no
                effect when ``model_path`` is given.
        """
        if model_path:
            model = torch.jit.load(model_path)
            model.eval()
        else:
            # torch.hub returns (model, utils); only the model is needed.
            model, _ = torch.hub.load(
                repo_or_dir="snakers4/silero-vad:v4.0",
                model="silero_vad",
                onnx=use_onnx,
            )
        self._model = model

    def stream(
        self,
        *,
        min_speaking_duration: float = 0.2,
        min_silence_duration: float = 0.8,
        padding_duration: float = 0.1,
        sample_rate: int = 16000,
        max_buffered_speech: float = 45.0,
        threshold: float = 0.2,
    ) -> "VADStream":
        """Create a ``VADStream`` that shares this detector's model.

        Args:
            min_speaking_duration: Seconds of speech required before
                START_OF_SPEECH is emitted.
            min_silence_duration: Seconds of silence required before
                END_OF_SPEECH is emitted.
            padding_duration: Seconds of audio kept as padding around speech.
            sample_rate: Rate the audio is resampled to before inference;
                ``VADStream`` accepts only 8000 or 16000.
            max_buffered_speech: Upper bound, in seconds, on buffered speech.
            threshold: Filtered probability at or above which audio counts
                as speech.

        Returns:
            A new ``VADStream``. Its constructor starts an asyncio task, so
            this must be called while an event loop is running.
        """
        return VADStream(
            self._model,
            min_speaking_duration=min_speaking_duration,
            min_silence_duration=min_silence_duration,
            padding_duration=padding_duration,
            sample_rate=sample_rate,
            max_buffered_speech=max_buffered_speech,
            threshold=threshold,
        )
|
62 |
+
|
63 |
+
|
64 |
+
# Based on https://github.com/snakers4/silero-vad/blob/94504ece54c8caeebb808410b08ae55ee82dba82/utils_vad.py#L428
|
65 |
+
class VADStream(agents.vad.VADStream):
    """Asynchronous Silero VAD stream.

    Frames pushed with ``push_frame`` are resampled, grouped four at a time
    and run through the model on a worker thread. Results are exposed as
    ``agents.vad.VADEvent`` objects through ``__anext__``.
    """

    # Number of queued frames merged into a single inference input. The
    # buffering logic assumes 10 ms frames, i.e. 40 ms per inference.
    _FRAMES_PER_INFERENCE = 4

    def __init__(
        self,
        model,
        *,
        min_speaking_duration: float,
        min_silence_duration: float,
        padding_duration: float,
        sample_rate: int,
        max_buffered_speech: float,
        threshold: float,
    ) -> None:
        """Store the options and start the background processing task.

        Args:
            model: Callable invoked as ``model(tensor, sample_rate)``; its
                result must expose ``.item()``.
            min_speaking_duration: Seconds of speech before START_OF_SPEECH.
            min_silence_duration: Seconds of silence before END_OF_SPEECH.
            padding_duration: Seconds of padding kept around speech.
            sample_rate: Inference sample rate; must be 8000 or 16000.
            max_buffered_speech: Upper bound, in seconds, on buffered speech.
            threshold: Filtered probability at or above which audio counts
                as speech.

        Raises:
            ValueError: If ``sample_rate`` is neither 8000 nor 16000.
            RuntimeError: Raised by ``asyncio.create_task`` when no event
                loop is running.
        """
        self._min_speaking_duration = min_speaking_duration
        self._min_silence_duration = min_silence_duration
        self._padding_duration = padding_duration
        self._sample_rate = sample_rate
        self._max_buffered_speech = max_buffered_speech
        self._threshold = threshold

        if sample_rate not in [8000, 16000]:
            raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")

        # ``None`` is the end-of-stream sentinel on both queues.
        self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
        self._event_queue = asyncio.Queue[Optional[agents.vad.VADEvent]]()
        self._model = model

        self._closed = False
        self._speaking = False
        self._waiting_start = False
        self._waiting_end = False
        self._current_sample = 0
        # Sample indexes of the current speech segment; initialised here so
        # the attributes always exist, and overwritten before they are read.
        self._start_speech = 0
        self._end_speech = 0
        # NOTE(review): presumably an exponential smoothing filter applied to
        # the raw probabilities -- confirm against agents.utils.ExpFilter.
        self._filter = agents.utils.ExpFilter(0.8)
        # Durations converted from seconds to sample counts (floats).
        self._min_speaking_samples = min_speaking_duration * sample_rate
        self._min_silence_samples = min_silence_duration * sample_rate
        self._padding_duration_samples = padding_duration * sample_rate
        self._max_buffered_samples = max_buffered_speech * sample_rate

        # Resampled frames awaiting inference and their untouched originals,
        # kept in lockstep (one entry each per pushed frame).
        self._queued_frames: deque[rtc.AudioFrame] = deque()
        self._original_frames: deque[rtc.AudioFrame] = deque()
        # Original frames retained for padding / speech segments.
        self._buffered_frames: List[rtc.AudioFrame] = []
        self._main_task = asyncio.create_task(self._run())

    def push_frame(self, frame: rtc.AudioFrame) -> None:
        """Queue an audio frame for processing.

        Raises:
            ValueError: If the stream has already been closed.
        """
        if self._closed:
            raise ValueError("cannot push frame to closed stream")

        self._queue.put_nowait(frame)

    async def aclose(self, *, wait: bool = True) -> None:
        """Close the stream and wait for the processing task to finish.

        Args:
            wait: When True, frames already queued are processed before the
                task stops. When False, the task is cancelled first.
        """
        self._closed = True
        if not wait:
            self._main_task.cancel()

        # The sentinel makes ``_run`` leave its loop once it is reached.
        self._queue.put_nowait(None)
        with contextlib.suppress(asyncio.CancelledError):
            await self._main_task

    async def _run(self):
        """Consume pushed frames and run inference until the sentinel arrives."""
        try:
            samples_40ms = self._sample_rate // 1000 * 40
            while True:
                frame = await self._queue.get()
                if frame is None:
                    break  # None is sent inside aclose

                self._queue.task_done()

                # resample to silero's sample rate
                resampled_frame = frame.remix_and_resample(
                    self._sample_rate, 1
                )  # TODO: This is technically wrong, fix when we have a better resampler
                self._original_frames.append(frame)
                self._queued_frames.append(resampled_frame)

                # Run inference by chunks of 40ms until we run out of data.
                # ``_run_inference`` consumes nothing when fewer than
                # ``_FRAMES_PER_INFERENCE`` frames are queued, so the frame
                # count is part of the loop condition; otherwise frames
                # longer than 10 ms would make this loop spin forever.
                while len(self._queued_frames) >= self._FRAMES_PER_INFERENCE:
                    available_length = sum(
                        f.samples_per_channel for f in self._queued_frames
                    )
                    if available_length < samples_40ms:
                        break

                    # Shielded so a cancellation does not interrupt an
                    # inference step halfway through its state updates.
                    await asyncio.shield(self._run_inference())

        except Exception:
            logger.exception("silero stream failed")
        finally:
            # Always terminate the event iterator, even on failure/cancel.
            self._event_queue.put_nowait(None)

    async def _run_inference(self) -> None:
        """Run the model on the next group of queued frames and emit events."""
        # merge the first 4 frames (we know each is 10ms)
        if len(self._queued_frames) < self._FRAMES_PER_INFERENCE:
            return

        original_frames = [
            self._original_frames.popleft() for _ in range(self._FRAMES_PER_INFERENCE)
        ]
        merged_frame = agents.utils.merge_frames(
            [self._queued_frames.popleft() for _ in range(self._FRAMES_PER_INFERENCE)]
        )

        # convert the merged int16 samples to a float32 tensor in [-1, 1)
        tensor = torch.from_numpy(np.frombuffer(merged_frame.data, dtype=np.int16))
        tensor = tensor.to(torch.float32) / 32768.0

        # Run inference off the event loop thread. ``perf_counter`` is
        # monotonic, so the measured duration cannot be skewed by wall-clock
        # adjustments.
        start_time = time.perf_counter()
        raw_prob = await asyncio.to_thread(
            lambda: self._model(tensor, self._sample_rate).item()
        )
        probability = self._filter.apply(1.0, raw_prob)
        inference_duration = time.perf_counter() - start_time

        # inference done
        # NOTE(review): ``_dispatch_event`` emits a second INFERENCE_DONE
        # event (with frames and speaking state) for the same inference;
        # kept as-is because consumers may rely on it -- confirm intent.
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            probability=probability,
            raw_inference_prob=raw_prob,
            inference_duration=inference_duration,
        )
        self._event_queue.put_nowait(event)

        self._dispatch_event(original_frames, probability, raw_prob, inference_duration)
        self._current_sample += merged_frame.samples_per_channel

    def _dispatch_event(
        self,
        original_frames: List[rtc.AudioFrame],
        probability: float,
        raw_inference_prob: float,
        inference_duration: float,
    ):
        """Update the speech state machine and queue the resulting events.

        Args:
            original_frames: original (non-resampled) frames of the current
                inference
            probability: filtered speech probability, compared to the
                threshold
            raw_inference_prob: unfiltered probability returned by the model
            inference_duration: seconds spent on the inference
        """

        samples_10ms = self._sample_rate / 100
        padding_count = int(
            self._padding_duration_samples // samples_10ms
        )  # number of frames to keep for the padding (one side)

        self._buffered_frames.extend(original_frames)
        if (
            not self._speaking
            and not self._waiting_start
            and len(self._buffered_frames) > padding_count
        ):
            # Idle: retain only the most recent ``padding_count`` frames.
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - padding_count :
            ]

        max_buffer_len = padding_count + max(
            int(self._max_buffered_samples // samples_10ms),
            int(self._min_speaking_samples // samples_10ms),
        )
        if len(self._buffered_frames) > max_buffer_len:
            # Cap the buffer by dropping the oldest frames.
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - max_buffer_len :
            ]

        if probability >= self._threshold:
            # speaking, wait for min_speaking_duration to trigger START_OF_SPEECH
            self._waiting_end = False
            if not self._waiting_start and not self._speaking:
                self._waiting_start = True
                self._start_speech = self._current_sample

            if self._waiting_start and (
                self._current_sample - self._start_speech >= self._min_speaking_samples
            ):
                self._waiting_start = False
                self._speaking = True

                # since we waited for min_speaking_duration before triggering
                # START_OF_SPEECH, include the frames that triggered the start
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.START_OF_SPEECH,
                    samples_index=self._start_speech,
                    frames=self._buffered_frames[padding_count:],
                    speaking=True,
                )
                self._event_queue.put_nowait(event)

        # emitted for every inference, regardless of the probability
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            frames=original_frames,
            probability=probability,
            raw_inference_prob=raw_inference_prob,
            inference_duration=inference_duration,
            speaking=self._speaking,
        )
        self._event_queue.put_nowait(event)

        if probability < self._threshold:
            # stopped speaking, wait for min_silence_duration to trigger END_OF_SPEECH
            self._waiting_start = False
            if not self._waiting_end and self._speaking:
                self._waiting_end = True
                self._end_speech = self._current_sample

            if self._waiting_end and (
                self._current_sample - self._end_speech
                >= max(self._min_silence_samples, self._padding_duration_samples)
            ):
                self._waiting_end = False
                self._speaking = False
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.END_OF_SPEECH,
                    samples_index=self._end_speech,
                    duration=(self._end_speech - self._start_speech)
                    / self._sample_rate,
                    # Copy: the live buffer is extended on the next
                    # inference, which would otherwise alter this event.
                    frames=list(self._buffered_frames),
                    speaking=False,
                )
                self._event_queue.put_nowait(event)

    async def __anext__(self) -> agents.vad.VADEvent:
        """Return the next event; stop iteration at the ``None`` sentinel."""
        evt = await self._event_queue.get()
        if evt is None:
            raise StopAsyncIteration

        return evt
|
livekit-plugins-silero/livekit/plugins/silero/version.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023 LiveKit, Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
__version__ = "0.5.2"
|
livekit-plugins-silero/livekit_plugins_silero.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: livekit-plugins-silero
|
3 |
+
Version: 0.5.2
|
4 |
+
Summary: Agent Framework Plugin for Silero
|
5 |
+
Home-page: https://github.com/livekit/agents
|
6 |
+
License: Apache-2.0
|
7 |
+
Project-URL: Documentation, https://docs.livekit.io
|
8 |
+
Project-URL: Website, https://livekit.io/
|
9 |
+
Project-URL: Source, https://github.com/livekit/agents
|
10 |
+
Keywords: webrtc,realtime,audio,video,livekit
|
11 |
+
Classifier: Intended Audience :: Developers
|
12 |
+
Classifier: License :: OSI Approved :: Apache Software License
|
13 |
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
14 |
+
Classifier: Topic :: Multimedia :: Video
|
15 |
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
16 |
+
Classifier: Programming Language :: Python :: 3
|
17 |
+
Classifier: Programming Language :: Python :: 3.9
|
18 |
+
Classifier: Programming Language :: Python :: 3.10
|
19 |
+
Classifier: Programming Language :: Python :: 3 :: Only
|
20 |
+
Requires-Python: >=3.9.0
|
21 |
+
Description-Content-Type: text/markdown
|
22 |
+
Requires-Dist: livekit-agents~=0.7
|
23 |
+
Requires-Dist: torch<3,>=2
|
24 |
+
Requires-Dist: torchaudio>=2
|
25 |
+
Requires-Dist: numpy<2,>=1
|
26 |
+
Requires-Dist: onnxruntime~=1.17.0
|
27 |
+
|
28 |
+
# LiveKit Plugins Silero
|
29 |
+
|
30 |
+
Agent Framework Plugin for Silero. Currently supports Voice Activity Detection.
|
31 |
+
|
32 |
+
## Installation
|
33 |
+
|
34 |
+
```bash
|
35 |
+
pip install livekit-plugins-silero
|
36 |
+
```
|
37 |
+
|
38 |
+
This plugin contains model files that would need to be downloaded prior to use.
|
livekit-plugins-silero/livekit_plugins_silero.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
README.md
|
2 |
+
pyproject.toml
|
3 |
+
setup.py
|
4 |
+
livekit/plugins/silero/__init__.py
|
5 |
+
livekit/plugins/silero/log.py
|
6 |
+
livekit/plugins/silero/py.typed
|
7 |
+
livekit/plugins/silero/vad.py
|
8 |
+
livekit/plugins/silero/version.py
|
9 |
+
livekit_plugins_silero.egg-info/PKG-INFO
|
10 |
+
livekit_plugins_silero.egg-info/SOURCES.txt
|
11 |
+
livekit_plugins_silero.egg-info/dependency_links.txt
|
12 |
+
livekit_plugins_silero.egg-info/requires.txt
|
13 |
+
livekit_plugins_silero.egg-info/top_level.txt
|
livekit-plugins-silero/livekit_plugins_silero.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
livekit-plugins-silero/livekit_plugins_silero.egg-info/requires.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
livekit-agents~=0.7
|
2 |
+
torch<3,>=2
|
3 |
+
torchaudio>=2
|
4 |
+
numpy<2,>=1
|
5 |
+
onnxruntime~=1.17.0
|
livekit-plugins-silero/livekit_plugins_silero.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
livekit
|
livekit-plugins-silero/pyproject.toml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["setuptools>=61.0"]
|
3 |
+
build-backend = "setuptools.build_meta"
|
livekit-plugins-silero/setup.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023 LiveKit, Inc.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import os
|
16 |
+
import pathlib
|
17 |
+
|
18 |
+
import setuptools
|
19 |
+
import setuptools.command.build_py
|
20 |
+
|
21 |
+
# Directory containing this setup.py; all paths below are resolved against it
# so the build works regardless of the current working directory.
here = pathlib.Path(__file__).parent.resolve()

# Load `__version__` by executing version.py in an isolated namespace instead
# of importing the package (importing would require its runtime dependencies
# to be installed at build time).
about = {}
with open(
    os.path.join(here, "livekit", "plugins", "silero", "version.py"),
    "r",
    # Explicit encoding: match the README read below and avoid depending on
    # the platform's locale default.
    encoding="utf-8",
) as f:
    exec(f.read(), about)


setuptools.setup(
    name="livekit-plugins-silero",
    version=about["__version__"],
    description="Agent Framework Plugin for Silero",
    long_description=(here / "README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/livekit/agents",
    # No custom build commands are registered.
    cmdclass={},
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Multimedia :: Sound/Audio",
        "Topic :: Multimedia :: Video",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3 :: Only",
    ],
    keywords=["webrtc", "realtime", "audio", "video", "livekit"],
    license="Apache-2.0",
    # `livekit` is a namespace package shared with other distributions, so
    # only the sub-packages under it are collected here.
    packages=setuptools.find_namespace_packages(include=["livekit.*"]),
    python_requires=">=3.9.0",
    install_requires=[
        "livekit-agents~=0.7",
        "torch >= 2, < 3",
        "torchaudio >= 2",
        "numpy >= 1, < 2",
        "onnxruntime~=1.17.0",
    ],
    package_data={
        # Ship the PEP 561 marker so type checkers use the inline annotations.
        "livekit.plugins.silero": ["py.typed"],
    },
    project_urls={
        "Documentation": "https://docs.livekit.io",
        "Website": "https://livekit.io/",
        "Source": "https://github.com/livekit/agents",
    },
)
|