Upload 42 files
- .gitattributes +1 -0
- snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/bug_report.md +52 -0
- snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/feature_request.md +27 -0
- snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/questions---help---support.md +12 -0
- snakers4_silero-vad_master/CODE_OF_CONDUCT.md +76 -0
- snakers4_silero-vad_master/LICENSE +21 -0
- snakers4_silero-vad_master/README.md +113 -0
- snakers4_silero-vad_master/__pycache__/hubconf.cpython-310.pyc +0 -0
- snakers4_silero-vad_master/__pycache__/utils_vad.cpython-310.pyc +0 -0
- snakers4_silero-vad_master/datasets/README.md +84 -0
- snakers4_silero-vad_master/examples/colab_record_example.ipynb +241 -0
- snakers4_silero-vad_master/examples/cpp/README.md +43 -0
- snakers4_silero-vad_master/examples/cpp/silero-vad-onnx.cpp +486 -0
- snakers4_silero-vad_master/examples/cpp/wav.h +235 -0
- snakers4_silero-vad_master/examples/go/README.md +19 -0
- snakers4_silero-vad_master/examples/go/cmd/main.go +60 -0
- snakers4_silero-vad_master/examples/go/go.mod +13 -0
- snakers4_silero-vad_master/examples/go/go.sum +16 -0
- snakers4_silero-vad_master/examples/java-example/pom.xml +30 -0
- snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/App.java +69 -0
- snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/SlieroVadDetector.java +145 -0
- snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/SlieroVadOnnxModel.java +180 -0
- snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/README.md +28 -0
- snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/microphone_and_webRTC_integration.py +201 -0
- snakers4_silero-vad_master/examples/parallel_example.ipynb +149 -0
- snakers4_silero-vad_master/examples/pyaudio-streaming/README.md +20 -0
- snakers4_silero-vad_master/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb +331 -0
- snakers4_silero-vad_master/examples/rust-example/.gitignore +2 -0
- snakers4_silero-vad_master/examples/rust-example/Cargo.lock +781 -0
- snakers4_silero-vad_master/examples/rust-example/Cargo.toml +9 -0
- snakers4_silero-vad_master/examples/rust-example/README.md +19 -0
- snakers4_silero-vad_master/examples/rust-example/src/main.rs +36 -0
- snakers4_silero-vad_master/examples/rust-example/src/silero.rs +59 -0
- snakers4_silero-vad_master/examples/rust-example/src/utils.rs +60 -0
- snakers4_silero-vad_master/examples/rust-example/src/vad_iter.rs +223 -0
- snakers4_silero-vad_master/files/lang_dict_95.json +1 -0
- snakers4_silero-vad_master/files/lang_group_dict_95.json +1 -0
- snakers4_silero-vad_master/files/silero_logo.jpg +0 -0
- snakers4_silero-vad_master/files/silero_vad.jit +3 -0
- snakers4_silero-vad_master/files/silero_vad.onnx +3 -0
- snakers4_silero-vad_master/hubconf.py +114 -0
- snakers4_silero-vad_master/silero-vad.ipynb +204 -0
- snakers4_silero-vad_master/utils_vad.py +545 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+snakers4_silero-vad_master/files/silero_vad.jit filter=lfs diff=lfs merge=lfs -text
snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/bug_report.md
ADDED
@@ -0,0 +1,52 @@
---
name: Bug report
about: Create a report to help us improve
title: Bug report - [X]
labels: bug
assignees: snakers4

---

## 🐛 Bug

<!-- A clear and concise description of what the bug is. -->

## To Reproduce

Steps to reproduce the behavior:

1.
2.
3.

<!-- If you have a code sample, error messages, stack traces, please provide it here as well -->

## Expected behavior

<!-- A clear and concise description of what you expected to happen. -->

## Environment

Please copy and paste the output from this
[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py)
(or fill out the checklist below manually).

You can get the script and run it with:
```
wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
# For security purposes, please check the contents of collect_env.py before running it.
python collect_env.py
```

- PyTorch Version (e.g., 1.0):
- OS (e.g., Linux):
- How you installed PyTorch (`conda`, `pip`, source):
- Build command you used (if compiling from source):
- Python version:
- CUDA/cuDNN version:
- GPU models and configuration:
- Any other relevant information:

## Additional context

<!-- Add any other context about the problem here. -->
snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/feature_request.md
ADDED
@@ -0,0 +1,27 @@
---
name: Feature request
about: Suggest an idea for this project
title: Feature request - [X]
labels: enhancement
assignees: snakers4

---

## 🚀 Feature
<!-- A clear and concise description of the feature proposal -->

## Motivation

<!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too -->

## Pitch

<!-- A clear and concise description of what you want to happen. -->

## Alternatives

<!-- A clear and concise description of any alternative solutions or features you've considered, if any. -->

## Additional context

<!-- Add any other context or screenshots about the feature request here. -->
snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/questions---help---support.md
ADDED
@@ -0,0 +1,12 @@
---
name: Questions / Help / Support
about: Ask for help, support or ask a question
title: "❓ Questions / Help / Support"
labels: help wanted
assignees: snakers4

---

## ❓ Questions and Help

We have a [wiki](https://github.com/snakers4/silero-models/wiki) available for our users. Please make sure you have checked it out first.
snakers4_silero-vad_master/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,76 @@
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at aveysov@gmail.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq
snakers4_silero-vad_master/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020-present Silero Team

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
snakers4_silero-vad_master/README.md
ADDED
@@ -0,0 +1,113 @@
[![Mailing list : test](http://img.shields.io/badge/Email-gray.svg?style=for-the-badge&logo=gmail)](mailto:hello@silero.ai) [![Mailing list : test](http://img.shields.io/badge/Telegram-blue.svg?style=for-the-badge&logo=telegram)](https://t.me/silero_speech) [![License: MIT](https://img.shields.io/badge/License-MIT-lightgrey.svg?style=for-the-badge)](https://github.com/snakers4/silero-vad/blob/master/LICENSE)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb)

![header](https://user-images.githubusercontent.com/12515440/89997349-b3523080-dc94-11ea-9906-ca2e8bc50535.png)

<br/>
<h1 align="center">Silero VAD</h1>
<br/>

**Silero VAD** - a pre-trained enterprise-grade [Voice Activity Detector](https://en.wikipedia.org/wiki/Voice_activity_detection) (also see our [STT models](https://github.com/snakers4/silero-models)).

<br/>

<p align="center">
<img src="https://user-images.githubusercontent.com/12515440/228639780-876f7801-8ec5-4daf-89f3-b45b22dd1a73.png" />
</p>

<details>
<summary>Real Time Example</summary>

https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-9be7-004c891dd481.mp4

</details>

<br/>
<h2 align="center">Key Features</h2>
<br/>

- **Stellar accuracy**

  Silero VAD has [excellent results](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#vs-other-available-solutions) on speech detection tasks.

- **Fast**

  One audio chunk (30+ ms) [takes](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics#silero-vad-performance-metrics) less than **1 ms** to process on a single CPU thread. Batching or GPU inference can improve performance considerably, and under certain conditions ONNX may even run 4-5x faster.

- **Lightweight**

  The JIT model is around one megabyte in size.

- **General**

  Silero VAD was trained on huge corpora covering over **100** languages, and it performs well on audio from different domains with various levels of background noise and quality.

- **Flexible sampling rate**

  Silero VAD [supports](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#sample-rate-comparison) **8000 Hz** and **16000 Hz** [sampling rates](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate).

- **Flexible chunk size**

  The model was trained on **30 ms** chunks. Longer chunks are supported directly; other durations may work as well.

- **Highly Portable**

  Silero VAD reaps the benefits of the rich ecosystems built around **PyTorch** and **ONNX**, running everywhere these runtimes are available.

- **No Strings Attached**

  Published under a permissive license (MIT), Silero VAD comes with zero strings attached - no telemetry, no keys, no registration, no built-in expiration, no vendor lock-in.

<br/>
<h2 align="center">Typical Use Cases</h2>
<br/>

- Voice activity detection for IoT / edge / mobile use cases
- Data cleaning and preparation, voice detection in general
- Telephony and call-center automation, voice bots
- Voice interfaces

<br/>
<h2 align="center">Links</h2>
<br/>

- [Examples and Dependencies](https://github.com/snakers4/silero-vad/wiki/Examples-and-Dependencies#dependencies)
- [Quality Metrics](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics)
- [Performance Metrics](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics)
- [Versions and Available Models](https://github.com/snakers4/silero-vad/wiki/Version-history-and-Available-Models)
- [Further reading](https://github.com/snakers4/silero-models#further-reading)
- [FAQ](https://github.com/snakers4/silero-vad/wiki/FAQ)

<br/>
<h2 align="center">Get In Touch</h2>
<br/>

Try our models, create an [issue](https://github.com/snakers4/silero-vad/issues/new), start a [discussion](https://github.com/snakers4/silero-vad/discussions/new), join our telegram [chat](https://t.me/silero_speech), [email](mailto:hello@silero.ai) us, read our [news](https://t.me/silero_news).

Please see our [wiki](https://github.com/snakers4/silero-models/wiki) and [tiers](https://github.com/snakers4/silero-models/wiki/Licensing-and-Tiers) for relevant information, and [email](mailto:hello@silero.ai) us directly.

**Citations**

```
@misc{Silero VAD,
  author = {Silero Team},
  title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier},
  year = {2021},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/snakers4/silero-vad}},
  commit = {insert_some_commit_here},
  email = {hello@silero.ai}
}
```

<br/>
<h2 align="center">Examples and VAD-based Community Apps</h2>
<br/>

- Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)

- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
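For quick orientation, here is a minimal usage sketch built on the `hubconf.py` entry point shipped in this upload. The utility-tuple unpacking and the `get_speech_timestamps` signature follow the upstream silero-vad examples, and `example.wav` is a placeholder path:

```python
import torch

# Load the model plus helper utilities defined in hubconf.py
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad')
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils

# 'example.wav' is a placeholder; any 8/16 kHz mono wav works
wav = read_audio('example.wav', sampling_rate=16000)

# Returns a list of {'start': sample_idx, 'end': sample_idx} dicts
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
print(speech_timestamps)
```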
snakers4_silero-vad_master/__pycache__/hubconf.cpython-310.pyc
ADDED
Binary file (2.61 kB).
snakers4_silero-vad_master/__pycache__/utils_vad.cpython-310.pyc
ADDED
Binary file (15.7 kB).
snakers4_silero-vad_master/datasets/README.md
ADDED
@@ -0,0 +1,84 @@
# Silero-VAD Dataset

> The dataset was created with the support of the Innovation Assistance Fund (Фонд содействия инновациям) as part of the federal project "Artificial Intelligence" of the national program "Digital Economy of the Russian Federation".

The links below point to `.feather` files containing open audio datasets annotated with Silero VAD, along with a short description of each dataset and download examples. The `.feather` files can be opened with the `pandas` library:
```python3
import pandas as pd
dataframe = pd.read_feather(PATH_TO_FEATHER_FILE)
```

Each annotation `.feather` file contains the following columns:
- `speech_timings` - the annotation for the given audio. This is a list of dictionaries of the form `{'start': START_SECOND, 'end': END_SECOND}`, where `START_SECOND` and `END_SECOND` are the start and end times of speech in seconds. The number of these dictionaries equals the number of speech segments found in the audio;
- `language` - the ISO code of the audio's language.

The columns describing how to download each audio file differ between datasets and are documented per dataset below.

**All data was annotated with a time step of ~30 milliseconds (`num_samples` - 512)**

| Name | Hours | Languages | Link | License | md5sum |
|----------------------|-------------|-------------|--------|----------|----------|
| **Bible.is** | 53,138 | 1,596 | [URL](https://live.bible.is/) | [Custom](https://live.bible.is/terms) | ea404eeaf2cd283b8223f63002be11f9 |
| **globalrecordings.net** | 9,743 | 6,171[^1] | [URL](https://globalrecordings.net/en) | CC BY-NC-SA 4.0 | 3c5c0f31b0abd9fe94ddbe8b1e2eb326 |
| **VoxLingua107** | 6,628 | 107 | [URL](https://bark.phon.ioc.ee/voxlingua107/) | CC BY 4.0 | 5dfef33b4d091b6d399cfaf3d05f2140 |
| **Common Voice** | 30,329 | 120 | [URL](https://commonvoice.mozilla.org/en/datasets) | CC0 | 5e30a85126adf74a5fd1496e6ac8695d |
| **MLS** | 50,709 | 8 | [URL](https://www.openslr.org/94/) | CC BY 4.0 | a339d0e94bdf41bba3c003756254ac4e |
| **Total** | **150,547** | **6,171+** | | | |

## Bible.is

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/BibleIs.feather)

- The `audio_link` column contains links to the individual audio files.

## globalrecordings.net

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/globalrecordings.feather)

- The `folder_link` column contains download links for the `.zip` archive of a given language. Note: archive links are duplicated across rows, since each archive can contain many audio files.
- The `audio_path` column contains the path to each audio file after unpacking the corresponding archive from the `folder_link` column.

``The number of unique ISO codes in this dataset does not match the actual number of languages represented, since some closely related languages may share the same ISO code.``

## VoxLingua107

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/VoxLingua107.feather)

- The `folder_link` column contains download links for the `.zip` archive of a given language. Note: archive links are duplicated across rows, since each archive can contain many audio files.
- The `audio_path` column contains the path to each audio file after unpacking the corresponding archive from the `folder_link` column.

## Common Voice

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/common_voice.feather)

This dataset cannot be downloaded via static links. To obtain it, follow the [link](https://commonvoice.mozilla.org/en/datasets), request access via the corresponding form, and download the archives for each available language. Note: the provided annotation corresponds to version `Common Voice Corpus 16.1` of the source dataset.

- The `audio_path` column contains the unique names of the `.mp3` files obtained after downloading the dataset.

## MLS

[Link to the annotation `.feather` file](https://models.silero.ai/vad_datasets/MLS.feather)

- The `folder_link` column contains download links for the `.zip` archive of a given language. Note: archive links are duplicated across rows, since each archive can contain many audio files.
- The `audio_path` column contains the path to each audio file after unpacking the corresponding archive from the `folder_link` column.

## License

This dataset is distributed under the `CC BY-NC-SA 4.0` [license](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en).

## Citation

```
@misc{Silero VAD Dataset,
  author = {Silero Team},
  title = {Silero-VAD Dataset: a large public Internet-scale dataset for voice activity detection for 6000+ languages},
  year = {2024},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/snakers4/silero-vad/datasets/README.md}},
  email = {hello@silero.ai}
}
```

[^1]: ``The number of unique ISO codes in this dataset does not match the actual number of languages represented, since some closely related languages may share the same ISO code.``
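As a quick illustration of the annotation format described above, a small sketch that loads one of the `.feather` files with pandas and sums the annotated speech per row; the local filename is a placeholder for a downloaded file:

```python
import pandas as pd

df = pd.read_feather('BibleIs.feather')  # placeholder for a downloaded file

# speech_timings holds a list of {'start': sec, 'end': sec} dicts per audio
df['speech_seconds'] = df['speech_timings'].apply(
    lambda segments: sum(seg['end'] - seg['start'] for seg in segments)
)

print(df[['language', 'speech_seconds']].head())
```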
snakers4_silero-vad_master/examples/colab_record_example.ipynb
ADDED
@@ -0,0 +1,241 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "bccAucKjnPHm"
   },
   "source": [
    "### Dependencies and inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cSih95WFmwgi"
   },
   "outputs": [],
   "source": [
    "!pip -q install pydub\n",
    "from google.colab import output\n",
    "from base64 import b64decode, b64encode\n",
    "from io import BytesIO\n",
    "import numpy as np\n",
    "from pydub import AudioSegment\n",
    "from IPython.display import HTML, display\n",
    "import torch\n",
    "import matplotlib.pyplot as plt\n",
    "import moviepy.editor as mpe\n",
    "from matplotlib.animation import FuncAnimation, FFMpegWriter\n",
    "import matplotlib\n",
    "matplotlib.use('Agg')\n",
    "\n",
    "torch.set_num_threads(1)\n",
    "\n",
    "model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
    "                          model='silero_vad',\n",
    "                          force_reload=True)\n",
    "\n",
    "def int2float(sound):\n",
    "    abs_max = np.abs(sound).max()\n",
    "    sound = sound.astype('float32')\n",
    "    if abs_max > 0:\n",
    "        sound *= 1/32768\n",
    "    sound = sound.squeeze()\n",
    "    return sound\n",
    "\n",
    "AUDIO_HTML = \"\"\"\n",
    "<script>\n",
    "var my_div = document.createElement(\"DIV\");\n",
    "var my_p = document.createElement(\"P\");\n",
    "var my_btn = document.createElement(\"BUTTON\");\n",
    "var t = document.createTextNode(\"Press to start recording\");\n",
    "\n",
    "my_btn.appendChild(t);\n",
    "//my_p.appendChild(my_btn);\n",
    "my_div.appendChild(my_btn);\n",
    "document.body.appendChild(my_div);\n",
    "\n",
    "var base64data = 0;\n",
    "var reader;\n",
    "var recorder, gumStream;\n",
    "var recordButton = my_btn;\n",
    "\n",
    "var handleSuccess = function(stream) {\n",
    "  gumStream = stream;\n",
    "  var options = {\n",
    "    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n",
    "    mimeType : 'audio/webm;codecs=opus'\n",
    "    //mimeType : 'audio/webm;codecs=pcm'\n",
    "  };\n",
    "  //recorder = new MediaRecorder(stream, options);\n",
    "  recorder = new MediaRecorder(stream);\n",
    "  recorder.ondataavailable = function(e) {\n",
    "    var url = URL.createObjectURL(e.data);\n",
    "    // var preview = document.createElement('audio');\n",
    "    // preview.controls = true;\n",
    "    // preview.src = url;\n",
    "    // document.body.appendChild(preview);\n",
    "\n",
    "    reader = new FileReader();\n",
    "    reader.readAsDataURL(e.data);\n",
    "    reader.onloadend = function() {\n",
    "      base64data = reader.result;\n",
    "      //console.log(\"Inside FileReader:\" + base64data);\n",
    "    }\n",
    "  };\n",
    "  recorder.start();\n",
    "};\n",
    "\n",
    "recordButton.innerText = \"Recording... press to stop\";\n",
    "\n",
    "navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);\n",
    "\n",
    "\n",
    "function toggleRecording() {\n",
    "  if (recorder && recorder.state == \"recording\") {\n",
    "    recorder.stop();\n",
    "    gumStream.getAudioTracks()[0].stop();\n",
    "    recordButton.innerText = \"Saving recording...\"\n",
    "  }\n",
    "}\n",
    "\n",
    "// https://stackoverflow.com/a/951057\n",
    "function sleep(ms) {\n",
    "  return new Promise(resolve => setTimeout(resolve, ms));\n",
    "}\n",
    "\n",
    "var data = new Promise(resolve=>{\n",
    "//recordButton.addEventListener(\"click\", toggleRecording);\n",
    "recordButton.onclick = ()=>{\n",
    "toggleRecording()\n",
    "\n",
    "sleep(2000).then(() => {\n",
    "  // wait 2000ms for the data to be available...\n",
    "  // ideally this should use something like await...\n",
    "  //console.log(\"Inside data:\" + base64data)\n",
    "  resolve(base64data.toString())\n",
    "\n",
    "});\n",
    "\n",
    "}\n",
    "});\n",
    "\n",
    "</script>\n",
    "\"\"\"\n",
    "\n",
    "def record(sec=10):\n",
    "    display(HTML(AUDIO_HTML))\n",
    "    s = output.eval_js(\"data\")\n",
    "    b = b64decode(s.split(',')[1])\n",
    "    audio = AudioSegment.from_file(BytesIO(b))\n",
    "    audio.export('test.mp3', format='mp3')\n",
    "    audio = audio.set_channels(1)\n",
    "    audio = audio.set_frame_rate(16000)\n",
    "    audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
    "    audio_tens = torch.tensor(audio_float)\n",
    "    return audio_tens\n",
    "\n",
    "def make_animation(probs, audio_duration, interval=40):\n",
    "    fig = plt.figure(figsize=(16, 9))\n",
    "    ax = plt.axes(xlim=(0, audio_duration), ylim=(0, 1.02))\n",
    "    line, = ax.plot([], [], lw=2)\n",
    "    x = [i / 16000 * 512 for i in range(len(probs))]\n",
    "    plt.xlabel('Time, seconds', fontsize=16)\n",
    "    plt.ylabel('Speech Probability', fontsize=16)\n",
    "\n",
    "    def init():\n",
    "        plt.fill_between(x, probs, color='#064273')\n",
    "        line.set_data([], [])\n",
    "        line.set_color('#990000')\n",
    "        return line,\n",
    "\n",
    "    def animate(i):\n",
    "        x = i * interval / 1000 - 0.04\n",
    "        y = np.linspace(0, 1.02, 2)\n",
    "\n",
    "        line.set_data(x, y)\n",
    "        line.set_color('#990000')\n",
    "        return line,\n",
    "\n",
    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n",
    "\n",
    "    f = r\"animation.mp4\"\n",
    "    writervideo = FFMpegWriter(fps=1000/interval)\n",
    "    anim.save(f, writer=writervideo)\n",
    "    plt.close('all')\n",
    "\n",
    "def combine_audio(vidname, audname, outname, fps=25):\n",
    "    my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
    "    audio_background = mpe.AudioFileClip(audname)\n",
    "    final_clip = my_clip.set_audio(audio_background)\n",
    "    final_clip.write_videofile(outname, fps=fps, verbose=False)\n",
    "\n",
    "def record_make_animation():\n",
    "    tensor = record()\n",
    "\n",
    "    print('Calculating probabilities...')\n",
    "    speech_probs = []\n",
    "    window_size_samples = 512\n",
    "    for i in range(0, len(tensor), window_size_samples):\n",
    "        if len(tensor[i: i + window_size_samples]) < window_size_samples:\n",
    "            break\n",
    "        speech_prob = model(tensor[i: i + window_size_samples], 16000).item()\n",
    "        speech_probs.append(speech_prob)\n",
    "    model.reset_states()\n",
    "    print('Making animation...')\n",
    "    make_animation(speech_probs, len(tensor) / 16000)\n",
    "\n",
    "    print('Merging your voice with animation...')\n",
    "    combine_audio('animation.mp4', 'test.mp3', 'merged.mp4')\n",
    "    print('Done!')\n",
    "    mp4 = open('merged.mp4', 'rb').read()\n",
    "    data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
    "    display(HTML(\"\"\"\n",
    "    <video width=800 controls>\n",
    "      <source src=\"%s\" type=\"video/mp4\">\n",
    "    </video>\n",
    "    \"\"\" % data_url))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "IFVs3GvTnpB1"
   },
   "source": [
    "## Record example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5EBjrTwiqAaQ"
   },
   "outputs": [],
   "source": [
    "record_make_animation()"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [
    "bccAucKjnPHm"
   ],
   "name": "Untitled2.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
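The part of this notebook that is useful outside Colab is the chunk loop in `record_make_animation()`. A minimal file-based sketch of the same loop, assuming the utility-tuple layout from the upstream examples and a placeholder `example.wav`:

```python
import torch

torch.set_num_threads(1)

# Same model load as in the notebook, minus the Colab recording machinery
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad')
read_audio = utils[2]  # (get_speech_timestamps, save_audio, read_audio, ...)

wav = read_audio('example.wav', sampling_rate=16000)  # placeholder path

window_size_samples = 512  # ~30 ms at 16 kHz, as in the notebook
speech_probs = []
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i:i + window_size_samples]
    if len(chunk) < window_size_samples:
        break  # drop the trailing partial chunk, as the notebook does
    speech_probs.append(model(chunk, 16000).item())
model.reset_states()  # clear the RNN state before processing another file

print(speech_probs[:10])
```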
snakers4_silero-vad_master/examples/cpp/README.md
ADDED
@@ -0,0 +1,43 @@
# Stream example in C++

Here's a simple example of the VAD model in C++ with ONNX Runtime.

## Requirements

The code was tested in the environments below; feel free to try others.

- WSL2 + Debian-bullseye (docker)
- gcc 12.2.0
- onnxruntime-linux-x64-1.12.1

## Usage

1. Install gcc 12.2.0, or just pull the docker image with `docker pull gcc:12.2.0-bullseye`

2. Install onnxruntime-linux-x64-1.12.1

   - Download lib onnxruntime:

     `wget https://github.com/microsoft/onnxruntime/releases/download/v1.12.1/onnxruntime-linux-x64-1.12.1.tgz`

   - Unzip. Assume the path is `/root/onnxruntime-linux-x64-1.12.1`

3. Modify the wav path & test configs in the main function

   `wav::WavReader wav_reader("${path_to_your_wav_file}");`

   Test sample rate, frame size in ms, threshold...

4. Build with gcc and run

   ```bash
   # Build
   g++ silero-vad-onnx.cpp -I /root/onnxruntime-linux-x64-1.12.1/include/ -L /root/onnxruntime-linux-x64-1.12.1/lib/ -lonnxruntime -Wl,-rpath,/root/onnxruntime-linux-x64-1.12.1/lib/ -o test

   # Run
   ./test
   ```
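Before wiring up the C++ build, the exported ONNX model can be sanity-checked from Python. A small sketch; the node names and state shapes mirror those hard-coded in `silero-vad-onnx.cpp` below (inputs `input`, `sr`, `h`, `c`; outputs `output`, `hn`, `cn`; state shape 2x1x64), and an installed `onnxruntime` package plus a local `silero_vad.onnx` are assumed:

```python
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession('silero_vad.onnx',
                               providers=['CPUExecutionProvider'])

window = 1024  # 64 ms at 16 kHz, the C++ example's default window size
x = np.zeros((1, window), dtype=np.float32)   # one chunk of silence
sr = np.array([16000], dtype=np.int64)
h = np.zeros((2, 1, 64), dtype=np.float32)    # zeroed recurrent state
c = np.zeros((2, 1, 64), dtype=np.float32)

prob, hn, cn = session.run(['output', 'hn', 'cn'],
                           {'input': x, 'sr': sr, 'h': h, 'c': c})
print('speech probability for silence:', float(prob.squeeze()))
```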
snakers4_silero-vad_master/examples/cpp/silero-vad-onnx.cpp
ADDED
@@ -0,0 +1,486 @@
#include <iostream>
#include <vector>
#include <sstream>
#include <cstring>
#include <limits>
#include <chrono>
#include <memory>
#include <string>
#include <stdexcept>
#include <cstdio>
#include <cstdarg>
#include "onnxruntime_cxx_api.h"
#include "wav.h"

//#define __DEBUG_SPEECH_PROB___

class timestamp_t
{
public:
    int start;
    int end;

    // default + parameterized constructor
    timestamp_t(int start = -1, int end = -1)
        : start(start), end(end)
    {
    }

    // assignment operator modifies object, therefore non-const
    timestamp_t& operator=(const timestamp_t& a)
    {
        start = a.start;
        end = a.end;
        return *this;
    }

    // equality comparison. doesn't modify object. therefore const.
    bool operator==(const timestamp_t& a) const
    {
        return (start == a.start && end == a.end);
    }

    std::string c_str()
    {
        //return std::format("timestamp {:08d}, {:08d}", start, end);
        return format("{start:%08d,end:%08d}", start, end);
    }

private:
    std::string format(const char* fmt, ...)
    {
        char buf[256];

        va_list args;
        va_start(args, fmt);
        const auto r = std::vsnprintf(buf, sizeof buf, fmt, args);
        va_end(args);

        if (r < 0)
            // conversion failed
            return {};

        const size_t len = r;
        if (len < sizeof buf)
            // we fit in the buffer
            return { buf, len };

#if __cplusplus >= 201703L
        // C++17: Create a string and write to its underlying array
        std::string s(len, '\0');
        va_start(args, fmt);
        std::vsnprintf(s.data(), len + 1, fmt, args);
        va_end(args);

        return s;
#else
        // C++11 or C++14: We need to allocate scratch memory
        auto vbuf = std::unique_ptr<char[]>(new char[len + 1]);
        va_start(args, fmt);
        std::vsnprintf(vbuf.get(), len + 1, fmt, args);
        va_end(args);

        return { vbuf.get(), len };
#endif
    }
};

class VadIterator
{
private:
    // OnnxRuntime resources
    Ort::Env env;
    Ort::SessionOptions session_options;
    std::shared_ptr<Ort::Session> session = nullptr;
    Ort::AllocatorWithDefaultOptions allocator;
    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU);

private:
    void init_engine_threads(int inter_threads, int intra_threads)
    {
        // The method should be called in each thread/proc in multi-thread/proc work
        session_options.SetIntraOpNumThreads(intra_threads);
        session_options.SetInterOpNumThreads(inter_threads);
        session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    }

    void init_onnx_model(const std::wstring& model_path)
    {
        // Init threads = 1 for single-threaded inference
        init_engine_threads(1, 1);
        // Load model
        session = std::make_shared<Ort::Session>(env, model_path.c_str(), session_options);
    }

    void reset_states()
    {
        // Call reset before each audio start
        std::memset(_h.data(), 0, _h.size() * sizeof(float));
        std::memset(_c.data(), 0, _c.size() * sizeof(float));
        triggered = false;
        temp_end = 0;
        current_sample = 0;

        prev_end = next_start = 0;

        speeches.clear();
        current_speech = timestamp_t();
    }

    void predict(const std::vector<float>& data)
    {
        // Create ort tensors
        input.assign(data.begin(), data.end());
        Ort::Value input_ort = Ort::Value::CreateTensor<float>(
            memory_info, input.data(), input.size(), input_node_dims, 2);
        Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
            memory_info, sr.data(), sr.size(), sr_node_dims, 1);
        Ort::Value h_ort = Ort::Value::CreateTensor<float>(
            memory_info, _h.data(), _h.size(), hc_node_dims, 3);
        Ort::Value c_ort = Ort::Value::CreateTensor<float>(
            memory_info, _c.data(), _c.size(), hc_node_dims, 3);

        // Clear and add inputs
        ort_inputs.clear();
        ort_inputs.emplace_back(std::move(input_ort));
        ort_inputs.emplace_back(std::move(sr_ort));
        ort_inputs.emplace_back(std::move(h_ort));
        ort_inputs.emplace_back(std::move(c_ort));

        // Infer
        ort_outputs = session->Run(
            Ort::RunOptions{nullptr},
            input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
            output_node_names.data(), output_node_names.size());

        // Output probability & update h,c recursively
        float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
        float* hn = ort_outputs[1].GetTensorMutableData<float>();
        std::memcpy(_h.data(), hn, size_hc * sizeof(float));
        float* cn = ort_outputs[2].GetTensorMutableData<float>();
        std::memcpy(_c.data(), cn, size_hc * sizeof(float));

        // Push forward sample index
        current_sample += window_size_samples;

        // Reset temp_end when > threshold
        if (speech_prob >= threshold)
        {
#ifdef __DEBUG_SPEECH_PROB___
            float speech = current_sample - window_size_samples; // minus window_size_samples to get precise start time point.
            printf("{ start: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
            if (temp_end != 0)
            {
                temp_end = 0;
                if (next_start < prev_end)
                    next_start = current_sample - window_size_samples;
            }
            if (triggered == false)
            {
                triggered = true;

                current_speech.start = current_sample - window_size_samples;
            }
            return;
        }

        if (
            (triggered == true)
            && ((current_sample - current_speech.start) > max_speech_samples)
            ) {
            if (prev_end > 0) {
                current_speech.end = prev_end;
                speeches.push_back(current_speech);
                current_speech = timestamp_t();

                // previously reached silence (< neg_thres) and is still not speech (< thres)
                if (next_start < prev_end)
                    triggered = false;
                else {
                    current_speech.start = next_start;
                }
                prev_end = 0;
                next_start = 0;
                temp_end = 0;
            }
            else {
                current_speech.end = current_sample;
                speeches.push_back(current_speech);
                current_speech = timestamp_t();
                prev_end = 0;
                next_start = 0;
                temp_end = 0;
                triggered = false;
            }
            return;
        }

        if ((speech_prob >= (threshold - 0.15)) && (speech_prob < threshold))
        {
            if (triggered) {
#ifdef __DEBUG_SPEECH_PROB___
                float speech = current_sample - window_size_samples; // minus window_size_samples to get precise start time point.
                printf("{ speaking: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
            }
            else {
#ifdef __DEBUG_SPEECH_PROB___
                float speech = current_sample - window_size_samples; // minus window_size_samples to get precise start time point.
                printf("{ silence: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
            }
            return;
        }

        // 4) End
        if (speech_prob < (threshold - 0.15))
        {
#ifdef __DEBUG_SPEECH_PROB___
            float speech = current_sample - window_size_samples - speech_pad_samples; // minus window_size_samples to get precise start time point.
            printf("{ end: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
            if (triggered == true)
            {
                if (temp_end == 0)
                {
                    temp_end = current_sample;
                }
                if (current_sample - temp_end > min_silence_samples_at_max_speech)
                    prev_end = temp_end;
                // a. silence < min_silence_samples, continue speaking
                if ((current_sample - temp_end) < min_silence_samples)
                {
                }
                // b. silence >= min_silence_samples, end speaking
                else
                {
                    current_speech.end = temp_end;
                    if (current_speech.end - current_speech.start > min_speech_samples)
                    {
                        speeches.push_back(current_speech);
                        current_speech = timestamp_t();
                        prev_end = 0;
                        next_start = 0;
                        temp_end = 0;
                        triggered = false;
                    }
                }
            }
            else {
                // may first windows see end state.
            }
            return;
        }
    }

public:
    void process(const std::vector<float>& input_wav)
    {
        reset_states();

        audio_length_samples = input_wav.size();

        for (int j = 0; j < audio_length_samples; j += window_size_samples)
        {
            if (j + window_size_samples > audio_length_samples)
                break;
            std::vector<float> r{ &input_wav[0] + j, &input_wav[0] + j + window_size_samples };
            predict(r);
        }

        if (current_speech.start >= 0) {
            current_speech.end = audio_length_samples;
            speeches.push_back(current_speech);
            current_speech = timestamp_t();
            prev_end = 0;
            next_start = 0;
            temp_end = 0;
            triggered = false;
        }
    }

    void process(const std::vector<float>& input_wav, std::vector<float>& output_wav)
    {
        process(input_wav);
        collect_chunks(input_wav, output_wav);
    }

    void collect_chunks(const std::vector<float>& input_wav, std::vector<float>& output_wav)
    {
        output_wav.clear();
        for (size_t i = 0; i < speeches.size(); i++) {
#ifdef __DEBUG_SPEECH_PROB___
            std::cout << speeches[i].c_str() << std::endl;
#endif //#ifdef __DEBUG_SPEECH_PROB___
            std::vector<float> slice(&input_wav[speeches[i].start], &input_wav[speeches[i].end]);
            output_wav.insert(output_wav.end(), slice.begin(), slice.end());
        }
    }

    const std::vector<timestamp_t> get_speech_timestamps() const
    {
        return speeches;
    }

    void drop_chunks(const std::vector<float>& input_wav, std::vector<float>& output_wav)
    {
        output_wav.clear();
        int current_start = 0;
        for (size_t i = 0; i < speeches.size(); i++) {
            std::vector<float> slice(&input_wav[current_start], &input_wav[speeches[i].start]);
            output_wav.insert(output_wav.end(), slice.begin(), slice.end());
            current_start = speeches[i].end;
        }

        std::vector<float> slice(&input_wav[current_start], &input_wav[input_wav.size()]);
        output_wav.insert(output_wav.end(), slice.begin(), slice.end());
    }

private:
    // model config
    int64_t window_size_samples;  // Assign when init; supports 256/512/768 for 8 kHz and 512/1024/1536 for 16 kHz
    int sample_rate;              // Assign when init; supports 16000 or 8000
    int sr_per_ms;                // Assign when init; 8 or 16 (samples per millisecond)
    float threshold;
    int min_silence_samples;               // sr_per_ms * #ms
    int min_silence_samples_at_max_speech; // sr_per_ms * #98
    int min_speech_samples;                // sr_per_ms * #ms
    float max_speech_samples;
    int speech_pad_samples; // usually a
    int audio_length_samples;

    // model states
    bool triggered = false;
    unsigned int temp_end = 0;
    unsigned int current_sample = 0;
    // MAX 4294967295 samples / 8 samples per ms / 1000 / 60 = 8947 minutes
    int prev_end;
    int next_start = 0;

    // Output timestamps
    std::vector<timestamp_t> speeches;
    timestamp_t current_speech;

    // Onnx model
    // Inputs
    std::vector<Ort::Value> ort_inputs;

    std::vector<const char*> input_node_names = {"input", "sr", "h", "c"};
    std::vector<float> input;
    std::vector<int64_t> sr;
    unsigned int size_hc = 2 * 1 * 64; // It's FIXED.
    std::vector<float> _h;
    std::vector<float> _c;

    int64_t input_node_dims[2] = {};
    const int64_t sr_node_dims[1] = {1};
    const int64_t hc_node_dims[3] = {2, 1, 64};

    // Outputs
    std::vector<Ort::Value> ort_outputs;
    std::vector<const char*> output_node_names = {"output", "hn", "cn"};

public:
    // Construction
    VadIterator(const std::wstring ModelPath,
                int Sample_rate = 16000, int windows_frame_size = 64,
                float Threshold = 0.5, int min_silence_duration_ms = 0,
                int speech_pad_ms = 64, int min_speech_duration_ms = 64,
                float max_speech_duration_s = std::numeric_limits<float>::infinity())
    {
        init_onnx_model(ModelPath);
        threshold = Threshold;
        sample_rate = Sample_rate;
        sr_per_ms = sample_rate / 1000;

        window_size_samples = windows_frame_size * sr_per_ms;

        min_speech_samples = sr_per_ms * min_speech_duration_ms;
        speech_pad_samples = sr_per_ms * speech_pad_ms;

        max_speech_samples = (
            sample_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
            );

        min_silence_samples = sr_per_ms * min_silence_duration_ms;
        min_silence_samples_at_max_speech = sr_per_ms * 98;

        input.resize(window_size_samples);
        input_node_dims[0] = 1;
        input_node_dims[1] = window_size_samples;

        _h.resize(size_hc);
        _c.resize(size_hc);
        sr.resize(1);
        sr[0] = sample_rate;
    }
};

int main()
{
    std::vector<timestamp_t> stamps;

    // Read wav (16000 Hz, mono, 32-bit float)
    wav::WavReader wav_reader("recorder.wav");
    std::vector<float> input_wav(wav_reader.num_samples());
    std::vector<float> output_wav;

    for (int i = 0; i < wav_reader.num_samples(); i++)
    {
        input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
    }

    // ===== Test configs =====
    std::wstring path = L"silero_vad.onnx";
    VadIterator vad(path);

    // ==============================================
    // ===== Example 1 of full function =====
    // ==============================================
    vad.process(input_wav);

    // 1.a get_speech_timestamps
    stamps = vad.get_speech_timestamps();
    for (size_t i = 0; i < stamps.size(); i++) {
        std::cout << stamps[i].c_str() << std::endl;
    }

    // 1.b collect_chunks output wav
    vad.collect_chunks(input_wav, output_wav);

    // 1.c drop_chunks output wav
    vad.drop_chunks(input_wav, output_wav);

    // ==============================================
    // ===== Example 2 of simple full function =====
    // ==============================================
    vad.process(input_wav, output_wav);

    stamps = vad.get_speech_timestamps();
    for (size_t i = 0; i < stamps.size(); i++) {
        std::cout << stamps[i].c_str() << std::endl;
    }

    // ==============================================
    // ===== Example 3 of full function =====
    // ==============================================
    for (int i = 0; i < 2; i++)
        vad.process(input_wav, output_wav);
}
snakers4_silero-vad_master/examples/cpp/wav.h
ADDED
@@ -0,0 +1,235 @@
// Copyright (c) 2016 Personal (Binbin Zhang)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#ifndef FRONTEND_WAV_H_
#define FRONTEND_WAV_H_

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>
#include <string>

// #include "utils/log.h"

namespace wav {

struct WavHeader {
  char riff[4];  // "RIFF"
  unsigned int size;
  char wav[4];  // "WAVE"
  char fmt[4];  // "fmt "
  unsigned int fmt_size;
  uint16_t format;
  uint16_t channels;
  unsigned int sample_rate;
  unsigned int bytes_per_second;
  uint16_t block_size;
  uint16_t bit;
  char data[4];  // "data"
  unsigned int data_size;
};

class WavReader {
 public:
  WavReader() : data_(nullptr) {}
  explicit WavReader(const std::string& filename) { Open(filename); }

  bool Open(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "rb");  // open the file for reading
    if (NULL == fp) {
      std::cout << "Error in read " << filename;
      return false;
    }

    WavHeader header;
    fread(&header, 1, sizeof(header), fp);
    if (header.fmt_size < 16) {
      printf("WaveData: expect PCM format data "
             "to have fmt chunk of at least size 16.\n");
      return false;
    } else if (header.fmt_size > 16) {
      int offset = 44 - 8 + header.fmt_size - 16;
      fseek(fp, offset, SEEK_SET);
      fread(header.data, 8, sizeof(char), fp);
    }
    // check "riff" "WAVE" "fmt " "data"

    // Skip any sub-chunks between "fmt" and "data". Usually there will
    // be a single "fact" sub chunk, but on Windows there can also be a
    // "list" sub chunk.
    while (0 != strncmp(header.data, "data", 4)) {
      // We will just ignore the data in these chunks.
      fseek(fp, header.data_size, SEEK_CUR);
      // read next sub chunk
      fread(header.data, 8, sizeof(char), fp);
    }

    if (header.data_size == 0) {
      int offset = ftell(fp);
      fseek(fp, 0, SEEK_END);
      header.data_size = ftell(fp) - offset;
      fseek(fp, offset, SEEK_SET);
    }

    num_channel_ = header.channels;
    sample_rate_ = header.sample_rate;
    bits_per_sample_ = header.bit;
    int num_data = header.data_size / (bits_per_sample_ / 8);
    data_ = new float[num_data];  // Create 1-dim array
    num_samples_ = num_data / num_channel_;

    std::cout << "num_channel_    :" << num_channel_ << std::endl;
    std::cout << "sample_rate_    :" << sample_rate_ << std::endl;
    std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
    std::cout << "num_samples     :" << num_data << std::endl;
    std::cout << "num_data_size   :" << header.data_size << std::endl;

    switch (bits_per_sample_) {
      case 8: {
        char sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(char), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 16: {
        int16_t sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(int16_t), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 32: {
        if (header.format == 1) {  // S32
          int sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(int), fp);
            data_[i] = static_cast<float>(sample) / 32768;
          }
        } else if (header.format == 3) {  // IEEE-float
          float sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(float), fp);
            data_[i] = static_cast<float>(sample);
          }
        } else {
          printf("unsupported quantization bits\n");
        }
        break;
      }
      default:
        printf("unsupported quantization bits\n");
        break;
    }

    fclose(fp);
    return true;
  }

  int num_channel() const { return num_channel_; }
  int sample_rate() const { return sample_rate_; }
  int bits_per_sample() const { return bits_per_sample_; }
  int num_samples() const { return num_samples_; }

  ~WavReader() {
    delete[] data_;
  }

  const float* data() const { return data_; }

 private:
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
  int num_samples_;  // sample points per channel
  float* data_;
};

class WavWriter {
 public:
  WavWriter(const float* data, int num_samples, int num_channel,
            int sample_rate, int bits_per_sample)
      : data_(data),
        num_samples_(num_samples),
        num_channel_(num_channel),
        sample_rate_(sample_rate),
        bits_per_sample_(bits_per_sample) {}

  void Write(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "wb");  // binary mode matters on Windows
    // init char 'riff' 'WAVE' 'fmt ' 'data'
    WavHeader header;
    char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
                           0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
                           0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
    memcpy(&header, wav_header, sizeof(header));
    header.channels = num_channel_;
    header.bit = bits_per_sample_;
    header.sample_rate = sample_rate_;
    header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
    header.size = sizeof(header) - 8 + header.data_size;
    header.bytes_per_second =
        sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
    header.block_size = num_channel_ * (bits_per_sample_ / 8);

    fwrite(&header, 1, sizeof(header), fp);

    for (int i = 0; i < num_samples_; ++i) {
      for (int j = 0; j < num_channel_; ++j) {
        switch (bits_per_sample_) {
          case 8: {
            char sample = static_cast<char>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 16: {
            int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 32: {
            int sample = static_cast<int>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
        }
      }
    }
    fclose(fp);
  }

 private:
  const float* data_;
  int num_samples_;  // total float points in data_
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
};

}  // namespace wav

#endif  // FRONTEND_WAV_H_
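`WavHeader` above assumes the canonical 44-byte RIFF layout: a 12-byte RIFF chunk, a 24-byte `fmt ` chunk, and an 8-byte `data` chunk header. A quick way to sanity-check a file against that layout from Python, sketched under the same no-extra-sub-chunks assumption the struct makes (`recorder.wav` is a placeholder path):

```python
# Sketch: unpack the canonical 44-byte RIFF/WAVE header that WavHeader mirrors.
# Assumes no extra sub-chunks between "fmt " and "data"; placeholder file path.
import struct

with open('recorder.wav', 'rb') as f:
    hdr = f.read(44)

riff, riff_size, wave = struct.unpack('<4sI4s', hdr[:12])
fmt, fmt_size, fmt_code, channels, sample_rate, byte_rate, block_align, bits = \
    struct.unpack('<4sIHHIIHH', hdr[12:36])
data_tag, data_size = struct.unpack('<4sI', hdr[36:44])

assert riff == b'RIFF' and wave == b'WAVE' and fmt == b'fmt '
print(channels, sample_rate, bits, data_size)
```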
snakers4_silero-vad_master/examples/go/README.md
ADDED
@@ -0,0 +1,19 @@
## Golang Example

This is a sample program showing how to run speech detection with `silero-vad` from Golang (CGO + ONNX Runtime).

### Requirements

- Golang >= v1.21
- ONNX Runtime

### Usage

```sh
go run ./cmd/main.go test.wav
```

> **_Note_**
>
> Make sure you have the ONNX Runtime library and C headers installed in your path.
snakers4_silero-vad_master/examples/go/cmd/main.go
ADDED
@@ -0,0 +1,60 @@
package main

import (
	"log"
	"os"

	"github.com/streamer45/silero-vad-go/speech"

	"github.com/go-audio/wav"
)

func main() {
	sd, err := speech.NewDetector(speech.DetectorConfig{
		ModelPath:            "../../files/silero_vad.onnx",
		SampleRate:           16000,
		WindowSize:           1536,
		Threshold:            0.5,
		MinSilenceDurationMs: 0,
		SpeechPadMs:          0,
	})
	if err != nil {
		log.Fatalf("failed to create speech detector: %s", err)
	}

	f, err := os.Open(os.Args[1])
	if err != nil {
		log.Fatalf("failed to open sample audio file: %s", err)
	}
	defer f.Close()

	dec := wav.NewDecoder(f)

	if ok := dec.IsValidFile(); !ok {
		log.Fatalf("invalid WAV file")
	}

	buf, err := dec.FullPCMBuffer()
	if err != nil {
		log.Fatalf("failed to get PCM buffer")
	}

	pcmBuf := buf.AsFloat32Buffer()

	segments, err := sd.Detect(pcmBuf.Data)
	if err != nil {
		log.Fatalf("Detect failed: %s", err)
	}

	for _, s := range segments {
		log.Printf("speech starts at %0.2fs", s.SpeechStartAt)
		if s.SpeechEndAt > 0 {
			log.Printf("speech ends at %0.2fs", s.SpeechEndAt)
		}
	}

	err = sd.Destroy()
	if err != nil {
		log.Fatalf("failed to destroy detector: %s", err)
	}
}
snakers4_silero-vad_master/examples/go/go.mod
ADDED
@@ -0,0 +1,13 @@
module silero

go 1.21.4

require (
	github.com/go-audio/wav v1.1.0
	github.com/streamer45/silero-vad-go v0.1.0
)

require (
	github.com/go-audio/audio v1.0.0 // indirect
	github.com/go-audio/riff v1.0.0 // indirect
)
snakers4_silero-vad_master/examples/go/go.sum
ADDED
@@ -0,0 +1,16 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/streamer45/silero-vad-go v0.1.0 h1:0nGZ6VT3LKOkBG/w+4kljIB6brxtgQn6YuNjTVYjOQ4=
github.com/streamer45/silero-vad-go v0.1.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
snakers4_silero-vad_master/examples/java-example/pom.xml
ADDED
@@ -0,0 +1,30 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.example</groupId>
  <artifactId>java-example</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>sliero-vad-example</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.microsoft.onnxruntime</groupId>
      <artifactId>onnxruntime</artifactId>
      <version>1.16.0-rc1</version>
    </dependency>
  </dependencies>
</project>
snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/App.java
ADDED
@@ -0,0 +1,69 @@
package org.example;

import ai.onnxruntime.OrtException;
import javax.sound.sampled.*;
import java.util.Map;

public class App {

    private static final String MODEL_PATH = "src/main/resources/silero_vad.onnx";
    private static final int SAMPLE_RATE = 16000;
    private static final float START_THRESHOLD = 0.6f;
    private static final float END_THRESHOLD = 0.45f;
    private static final int MIN_SILENCE_DURATION_MS = 600;
    private static final int SPEECH_PAD_MS = 500;
    private static final int WINDOW_SIZE_SAMPLES = 2048;

    public static void main(String[] args) {
        // Initialize the Voice Activity Detector
        SlieroVadDetector vadDetector;
        try {
            vadDetector = new SlieroVadDetector(MODEL_PATH, START_THRESHOLD, END_THRESHOLD, SAMPLE_RATE, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);
        } catch (OrtException e) {
            System.err.println("Error initializing the VAD detector: " + e.getMessage());
            return;
        }

        // Set audio format
        AudioFormat format = new AudioFormat(SAMPLE_RATE, 16, 1, true, false);
        DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);

        // Get the target data line and open it with the specified format
        TargetDataLine targetDataLine;
        try {
            targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
            targetDataLine.open(format);
            targetDataLine.start();
        } catch (LineUnavailableException e) {
            System.err.println("Error opening target data line: " + e.getMessage());
            return;
        }

        // Main loop to continuously read data and apply Voice Activity Detection
        while (targetDataLine.isOpen()) {
            byte[] data = new byte[WINDOW_SIZE_SAMPLES];

            int numBytesRead = targetDataLine.read(data, 0, data.length);
            if (numBytesRead <= 0) {
                System.err.println("Error reading data from target data line.");
                continue;
            }

            // Apply the Voice Activity Detector to the data and get the result
            Map<String, Double> detectResult;
            try {
                detectResult = vadDetector.apply(data, true);
            } catch (Exception e) {
                System.err.println("Error applying VAD detector: " + e.getMessage());
                continue;
            }

            if (!detectResult.isEmpty()) {
                System.out.println(detectResult);
            }
        }

        // Close the target data line to release audio resources
        targetDataLine.close();
    }
}
snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/SlieroVadDetector.java
ADDED
@@ -0,0 +1,145 @@
package org.example;

import ai.onnxruntime.OrtException;

import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;


public class SlieroVadDetector {
    // OnnxModel model used for speech processing
    private final SlieroVadOnnxModel model;
    // Threshold for speech start
    private final float startThreshold;
    // Threshold for speech end
    private final float endThreshold;
    // Sampling rate
    private final int samplingRate;
    // Minimum number of silence samples needed to mark the end of a speech segment
    private final float minSilenceSamples;
    // Additional number of samples padded to the speech start or end when computing its time
    private final float speechPadSamples;
    // Whether in the triggered state (i.e. whether speech is being detected)
    private boolean triggered;
    // Temporarily stored number of speech end samples
    private int tempEnd;
    // Number of samples currently being processed
    private int currentSample;


    public SlieroVadDetector(String modelPath,
                             float startThreshold,
                             float endThreshold,
                             int samplingRate,
                             int minSilenceDurationMs,
                             int speechPadMs) throws OrtException {
        // Check if the sampling rate is 8000 or 16000; if not, throw an exception
        if (samplingRate != 8000 && samplingRate != 16000) {
            throw new IllegalArgumentException("does not support sampling rates other than [8000, 16000]");
        }

        // Initialize the parameters
        this.model = new SlieroVadOnnxModel(modelPath);
        this.startThreshold = startThreshold;
        this.endThreshold = endThreshold;
        this.samplingRate = samplingRate;
        this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
        this.speechPadSamples = samplingRate * speechPadMs / 1000f;
        // Reset the state
        reset();
    }

    // Reset the state, including the model state, trigger state, temporary end time, and current sample count
    public void reset() {
        model.resetStates();
        triggered = false;
        tempEnd = 0;
        currentSample = 0;
    }

    // Process one audio window and return a possible speech start or end time
    public Map<String, Double> apply(byte[] data, boolean returnSeconds) {

        // Convert the little-endian 16-bit PCM byte array to a float array
        float[] audioData = new float[data.length / 2];
        for (int i = 0; i < audioData.length; i++) {
            audioData[i] = ((data[i * 2] & 0xff) | (data[i * 2 + 1] << 8)) / 32767.0f;
        }

        // Get the length of the audio array as the window size
        int windowSizeSamples = audioData.length;
        // Update the current sample count
        currentSample += windowSizeSamples;

        // Call the model to get the prediction probability of speech
        float speechProb = 0;
        try {
            speechProb = model.call(new float[][]{audioData}, samplingRate)[0];
        } catch (OrtException e) {
            throw new RuntimeException(e);
        }

        // If the speech probability reaches the start threshold while a tentative end is pending,
        // discard that end: speech has resumed, so the end time must be recalculated later
        if (speechProb >= startThreshold && tempEnd != 0) {
            tempEnd = 0;
        }

        // If the speech probability reaches the start threshold and we are not in the triggered state,
        // switch to the triggered state and compute the speech start time
        if (speechProb >= startThreshold && !triggered) {
            triggered = true;
            int speechStart = (int) (currentSample - speechPadSamples);
            speechStart = Math.max(speechStart, 0);
            Map<String, Double> result = new HashMap<>();
            // Decide whether to return the result in seconds or in samples based on the returnSeconds parameter
            if (returnSeconds) {
                double speechStartSeconds = speechStart / (double) samplingRate;
                double roundedSpeechStart = BigDecimal.valueOf(speechStartSeconds).setScale(1, RoundingMode.HALF_UP).doubleValue();
                result.put("start", roundedSpeechStart);
            } else {
                result.put("start", (double) speechStart);
            }

            return result;
        }

        // If the speech probability falls below the end threshold while in the triggered state,
        // compute the speech end time
        if (speechProb < endThreshold && triggered) {
            // Initialize or keep the tentative end time
            if (tempEnd == 0) {
                tempEnd = currentSample;
            }
            // If the silence between the current sample and the tentative end is still shorter than the
            // minimum, return an empty map: it is not yet possible to tell whether the speech has ended
            if (currentSample - tempEnd < minSilenceSamples) {
                return Collections.emptyMap();
            } else {
                // Compute the speech end time, then reset the trigger state and tentative end time
                int speechEnd = (int) (tempEnd + speechPadSamples);
                tempEnd = 0;
                triggered = false;
                Map<String, Double> result = new HashMap<>();

                if (returnSeconds) {
                    double speechEndSeconds = speechEnd / (double) samplingRate;
                    double roundedSpeechEnd = BigDecimal.valueOf(speechEndSeconds).setScale(1, RoundingMode.HALF_UP).doubleValue();
                    result.put("end", roundedSpeechEnd);
                } else {
                    result.put("end", (double) speechEnd);
                }
                return result;
            }
        }

        // If none of the above conditions are met, return an empty map by default
        return Collections.emptyMap();
    }

    public void close() throws OrtException {
        reset();
        model.close();
    }
}
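The `apply()` method above is a small hysteresis state machine: a window whose probability reaches `startThreshold` opens a segment, and the segment closes only after the probability has stayed below `endThreshold` for at least `minSilenceSamples`. A condensed Python sketch of the same logic follows; padding is omitted, and the thresholds, window size, and silence duration are illustrative values, not the Java defaults.

```python
# Condensed sketch of the start/end hysteresis in SlieroVadDetector.apply().
# Probabilities are supplied one per window; padding is omitted and all
# parameter values here are illustrative.
def stream_events(probs, window=512, sr=16000,
                  start_thr=0.6, end_thr=0.45, min_silence_ms=60):
    min_silence = sr * min_silence_ms / 1000
    triggered, temp_end, current = False, 0, 0
    for p in probs:
        current += window
        if p >= start_thr and temp_end:
            temp_end = 0                      # speech resumed: discard tentative end
        if p >= start_thr and not triggered:
            triggered = True
            yield ('start', current / sr)
        elif p < end_thr and triggered:
            temp_end = temp_end or current    # remember where the silence began
            if current - temp_end >= min_silence:
                end_at = temp_end / sr        # the segment ends where silence began
                triggered, temp_end = False, 0
                yield ('end', end_at)

print(list(stream_events([0.1, 0.9, 0.8, 0.2, 0.1, 0.05, 0.02])))
# -> [('start', 0.064), ('end', 0.128)]
```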
snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/SlieroVadOnnxModel.java
ADDED
@@ -0,0 +1,180 @@
package org.example;

import ai.onnxruntime.OnnxTensor;
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtException;
import ai.onnxruntime.OrtSession;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SlieroVadOnnxModel {
    // The ONNX session used to run the model
    private final OrtSession session;
    private float[][][] h;
    private float[][][] c;
    // The last sample rate seen
    private int lastSr = 0;
    // The last batch size seen
    private int lastBatchSize = 0;
    // List of supported sample rates
    private static final List<Integer> SAMPLE_RATES = Arrays.asList(8000, 16000);

    // Constructor
    public SlieroVadOnnxModel(String modelPath) throws OrtException {
        // Get the ONNX runtime environment
        OrtEnvironment env = OrtEnvironment.getEnvironment();
        // Create an ONNX session options object
        OrtSession.SessionOptions opts = new OrtSession.SessionOptions();
        // Set the InterOp thread count to 1; InterOp threads are used for parallel processing of different computation graph operations
        opts.setInterOpNumThreads(1);
        // Set the IntraOp thread count to 1; IntraOp threads are used for parallel processing within a single operation
        opts.setIntraOpNumThreads(1);
        // Add a CPU device; setting to false disables CPU execution optimization
        opts.addCPU(true);
        // Create an ONNX session using the environment, model path, and options
        session = env.createSession(modelPath, opts);
        // Reset states
        resetStates();
    }

    /**
     * Reset states
     */
    void resetStates() {
        h = new float[2][1][64];
        c = new float[2][1][64];
        lastSr = 0;
        lastBatchSize = 0;
    }

    public void close() throws OrtException {
        session.close();
    }

    /**
     * Inner class holding the validated input
     */
    public static class ValidationResult {
        public final float[][] x;
        public final int sr;

        // Constructor
        public ValidationResult(float[][] x, int sr) {
            this.x = x;
            this.sr = sr;
        }
    }

    /**
     * Validate the input data
     */
    private ValidationResult validateInput(float[][] x, int sr) {
        // Process input data with dimension 1
        if (x.length == 1) {
            x = new float[][]{x[0]};
        }
        // Throw an exception when the input data dimension is greater than 2
        if (x.length > 2) {
            throw new IllegalArgumentException("Incorrect audio data dimension: " + x[0].length);
        }

        // When the sample rate is not 16000 but a multiple of 16000, decimate the input down to 16000
        if (sr != 16000 && (sr % 16000 == 0)) {
            int step = sr / 16000;
            float[][] reducedX = new float[x.length][];

            for (int i = 0; i < x.length; i++) {
                float[] current = x[i];
                float[] newArr = new float[(current.length + step - 1) / step];

                for (int j = 0, index = 0; j < current.length; j += step, index++) {
                    newArr[index] = current[j];
                }

                reducedX[i] = newArr;
            }

            x = reducedX;
            sr = 16000;
        }

        // If the sample rate is not in the list of supported sample rates, throw an exception
        if (!SAMPLE_RATES.contains(sr)) {
            throw new IllegalArgumentException("Only supports sample rates " + SAMPLE_RATES + " (or multiples of 16000)");
        }

        // If the input audio chunk is too short, throw an exception
        if (((float) sr) / x[0].length > 31.25) {
            throw new IllegalArgumentException("Input audio is too short");
        }

        // Return the validated result
        return new ValidationResult(x, sr);
    }

    /**
     * Run the ONNX model on one batch of audio
     */
    public float[] call(float[][] x, int sr) throws OrtException {
        ValidationResult result = validateInput(x, sr);
        x = result.x;
        sr = result.sr;

        int batchSize = x.length;

        if (lastBatchSize == 0 || lastSr != sr || lastBatchSize != batchSize) {
            resetStates();
        }

        OrtEnvironment env = OrtEnvironment.getEnvironment();

        OnnxTensor inputTensor = null;
        OnnxTensor hTensor = null;
        OnnxTensor cTensor = null;
        OnnxTensor srTensor = null;
        OrtSession.Result ortOutputs = null;

        try {
            // Create input tensors
            inputTensor = OnnxTensor.createTensor(env, x);
            hTensor = OnnxTensor.createTensor(env, h);
            cTensor = OnnxTensor.createTensor(env, c);
            srTensor = OnnxTensor.createTensor(env, new long[]{sr});

            Map<String, OnnxTensor> inputs = new HashMap<>();
            inputs.put("input", inputTensor);
            inputs.put("sr", srTensor);
            inputs.put("h", hTensor);
            inputs.put("c", cTensor);

            // Run the ONNX model
            ortOutputs = session.run(inputs);
            // Get the output results
            float[][] output = (float[][]) ortOutputs.get(0).getValue();
            h = (float[][][]) ortOutputs.get(1).getValue();
            c = (float[][][]) ortOutputs.get(2).getValue();

            lastSr = sr;
            lastBatchSize = batchSize;
            return output[0];
        } finally {
            if (inputTensor != null) {
                inputTensor.close();
            }
            if (hTensor != null) {
                hTensor.close();
            }
            if (cTensor != null) {
                cTensor.close();
            }
            if (srTensor != null) {
                srTensor.close();
            }
            if (ortOutputs != null) {
                ortOutputs.close();
            }
        }
    }
}
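`validateInput()` above handles sample rates that are multiples of 16 kHz by plain decimation: it keeps every `step`-th sample and relabels the audio as 16 kHz, with no low-pass filtering. In numpy-flavoured Python, the same reduction is a one-line slice; the array below is synthetic.

```python
# The same stride-based decimation validateInput() performs for sample
# rates that are multiples of 16 kHz; the input array is synthetic.
import numpy as np

sr = 48000
x = np.zeros(sr, dtype=np.float32)   # one second of placeholder audio
if sr != 16000 and sr % 16000 == 0:
    step = sr // 16000               # 3 for 48 kHz input
    x = x[::step]                    # keep every step-th sample
    sr = 16000
print(len(x), sr)                    # -> 16000 16000
```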
snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/README.md
ADDED
@@ -0,0 +1,28 @@
In this example, an integration with the microphone and the webRTC VAD has been done. I used [this](https://github.com/mozilla/DeepSpeech-examples/tree/r0.8/mic_vad_streaming) as a draft.
Here is a short video presenting the results:

https://user-images.githubusercontent.com/28188499/116685087-182ff100-a9b2-11eb-927d-ed9f621226ee.mp4

# Requirements:
The libraries used for the following example are:
```
Python == 3.6.9
webrtcvad >= 2.0.10
torchaudio >= 0.8.1
torch >= 1.8.1
halo >= 0.0.31
Soundfile >= 0.13.3
```
Using pip3:
```
pip3 install webrtcvad
pip3 install torchaudio
pip3 install torch
pip3 install halo
pip3 install soundfile
```
Moreover, to keep the code simple, the default sample_rate is 16 kHz and no resampling is performed.

This example has been tested on ``` ubuntu 18.04.3 LTS```
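webrtcvad only accepts 16-bit mono PCM frames of 10, 20, or 30 ms, so at 16 kHz a 20 ms frame is 320 samples, i.e. the 640 bytes that the script below checks for before handing a frame to the VAD. A minimal sketch of that handshake, using synthetic silence instead of microphone input:

```python
# Minimal sketch: hand one 20 ms frame at 16 kHz to webrtcvad.
# The frame is synthetic silence; the script below reads real
# frames from the microphone via pyaudio instead.
import webrtcvad

SAMPLE_RATE = 16000
FRAME_MS = 20
n_samples = SAMPLE_RATE * FRAME_MS // 1000   # 320 samples
frame = b'\x00\x00' * n_samples              # 640 bytes of int16 silence

vad = webrtcvad.Vad(3)                       # aggressiveness 0..3
print(vad.is_speech(frame, SAMPLE_RATE))     # False for pure silence
```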
snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/microphone_and_webRTC_integration.py
ADDED
@@ -0,0 +1,201 @@
import collections, queue
import numpy as np
import pyaudio
import webrtcvad
from halo import Halo
import torch
import torchaudio

class Audio(object):
    """Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""

    FORMAT = pyaudio.paInt16
    # Network/VAD rate-space
    RATE_PROCESS = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50

    def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS):
        def proxy_callback(in_data, frame_count, time_info, status):
            #pylint: disable=unused-argument
            callback(in_data)
            return (None, pyaudio.paContinue)
        if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
        self.buffer_queue = queue.Queue()
        self.device = device
        self.input_rate = input_rate
        self.sample_rate = self.RATE_PROCESS
        self.block_size = int(self.RATE_PROCESS / float(self.BLOCKS_PER_SECOND))
        self.block_size_input = int(self.input_rate / float(self.BLOCKS_PER_SECOND))
        self.pa = pyaudio.PyAudio()

        kwargs = {
            'format': self.FORMAT,
            'channels': self.CHANNELS,
            'rate': self.input_rate,
            'input': True,
            'frames_per_buffer': self.block_size_input,
            'stream_callback': proxy_callback,
        }

        self.chunk = None
        # if not default device
        if self.device:
            kwargs['input_device_index'] = self.device

        self.stream = self.pa.open(**kwargs)
        self.stream.start_stream()

    def read(self):
        """Return a block of audio data, blocking if necessary."""
        return self.buffer_queue.get()

    def destroy(self):
        self.stream.stop_stream()
        self.stream.close()
        self.pa.terminate()

    frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)


class VADAudio(Audio):
    """Filter & segment audio with voice activity detection."""

    def __init__(self, aggressiveness=3, device=None, input_rate=None):
        super().__init__(device=device, input_rate=input_rate)
        self.vad = webrtcvad.Vad(aggressiveness)

    def frame_generator(self):
        """Generator that yields all audio frames from microphone."""
        if self.input_rate == self.RATE_PROCESS:
            while True:
                yield self.read()
        else:
            raise Exception("Resampling required")

    def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
        """Generator that yields series of consecutive audio frames comprising each utterance, separated by yielding a single None.
        Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
        Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
                  |---utterance---|        |---utterance---|
        """
        if frames is None: frames = self.frame_generator()
        num_padding_frames = padding_ms // self.frame_duration_ms
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False

        for frame in frames:
            if len(frame) < 640:
                return

            is_speech = self.vad.is_speech(frame, self.sample_rate)

            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len([f for f, speech in ring_buffer if speech])
                if num_voiced > ratio * ring_buffer.maxlen:
                    triggered = True
                    for f, s in ring_buffer:
                        yield f
                    ring_buffer.clear()

            else:
                yield frame
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len([f for f, speech in ring_buffer if not speech])
                if num_unvoiced > ratio * ring_buffer.maxlen:
                    triggered = False
                    yield None
                    ring_buffer.clear()

def main(ARGS):
    # Start audio with VAD
    vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness,
                         device=ARGS.device,
                         input_rate=ARGS.rate)

    print("Listening (ctrl-C to exit)...")
    frames = vad_audio.vad_collector()

    # load silero VAD
    torchaudio.set_audio_backend("soundfile")
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model=ARGS.silaro_model_name,
                                  force_reload=ARGS.reload)
    (get_speech_ts, _, _, _, _, _, _) = utils


    # Stream from microphone to DeepSpeech using VAD
    spinner = None
    if not ARGS.nospinner:
        spinner = Halo(spinner='line')
    wav_data = bytearray()
    for frame in frames:
        if frame is not None:
            if spinner: spinner.start()

            wav_data.extend(frame)
        else:
            if spinner: spinner.stop()
            print("webRTC has detected a possible speech")

            newsound = np.frombuffer(wav_data, np.int16)
            audio_float32 = Int2Float(newsound)
            time_stamps = get_speech_ts(audio_float32, model,
                                        num_steps=ARGS.num_steps,
                                        trig_sum=ARGS.trig_sum,
                                        neg_trig_sum=ARGS.neg_trig_sum,
                                        num_samples_per_window=ARGS.num_samples_per_window,
                                        min_speech_samples=ARGS.min_speech_samples,
                                        min_silence_samples=ARGS.min_silence_samples)

            if len(time_stamps) > 0:
                print("silero VAD has detected a possible speech")
            else:
                print("silero VAD has detected a noise")
            print()
            wav_data = bytearray()


def Int2Float(sound):
    _sound = np.copy(sound)
    abs_max = np.abs(_sound).max()
    _sound = _sound.astype('float32')
    if abs_max > 0:
        _sound *= 1/abs_max
    audio_float32 = torch.from_numpy(_sound.squeeze())
    return audio_float32

if __name__ == '__main__':
    DEFAULT_SAMPLE_RATE = 16000

    import argparse
    parser = argparse.ArgumentParser(description="Stream from microphone to webRTC and silero VAD")

    parser.add_argument('-v', '--webRTC_aggressiveness', type=int, default=3,
                        help="Set aggressiveness of webRTC: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
    parser.add_argument('--nospinner', action='store_true',
                        help="Disable spinner")
    parser.add_argument('-d', '--device', type=int, default=None,
                        help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")

    parser.add_argument('-name', '--silaro_model_name', type=str, default="silero_vad",
                        help="select the name of the model. You can select between 'silero_vad', 'silero_vad_micro', 'silero_vad_micro_8k', 'silero_vad_mini', 'silero_vad_mini_8k'")
    parser.add_argument('--reload', action='store_true',
                        help="download the latest version of the silero vad")

    parser.add_argument('-ts', '--trig_sum', type=float, default=0.25,
                        help="overlapping windows are used for each audio chunk, trig sum defines average probability among those windows for switching into triggered state (speech state)")

    parser.add_argument('-nts', '--neg_trig_sum', type=float, default=0.07,
                        help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)")

    parser.add_argument('-N', '--num_steps', type=int, default=8,
                        help="number of overlapping windows to split audio chunk into (we recommend 4 or 8)")

    parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
                        help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is the preferable value (lesser values reduce quality)")

    parser.add_argument('-msps', '--min_speech_samples', type=int, default=10000,
                        help="minimum speech chunk duration in samples")

    parser.add_argument('-msis', '--min_silence_samples', type=int, default=500,
                        help="minimum silence duration in samples between two separate speech chunks")
    ARGS = parser.parse_args()
    ARGS.rate = DEFAULT_SAMPLE_RATE
    main(ARGS)
snakers4_silero-vad_master/examples/parallel_example.ipynb
ADDED
@@ -0,0 +1,149 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Install Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install -q torchaudio\n",
    "SAMPLING_RATE = 16000\n",
    "import torch\n",
    "from pprint import pprint\n",
    "\n",
    "torch.set_num_threads(1)\n",
    "NUM_PROCESS = 4  # set to the number of CPU cores in the machine\n",
    "NUM_COPIES = 8\n",
    "# download wav files, make multiple copies\n",
    "for idx in range(NUM_COPIES):\n",
    "    torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example{idx}.wav\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load VAD model from torch hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
    "                              model='silero_vad',\n",
    "                              force_reload=True,\n",
    "                              onnx=False)\n",
    "\n",
    "(get_speech_timestamps,\n",
    " save_audio,\n",
    " read_audio,\n",
    " VADIterator,\n",
    " collect_chunks) = utils"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define a vad process function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import multiprocessing\n",
    "\n",
    "vad_models = dict()\n",
    "\n",
    "def init_model(model):\n",
    "    pid = multiprocessing.current_process().pid\n",
    "    model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
    "                              model='silero_vad',\n",
    "                              force_reload=False,\n",
    "                              onnx=False)\n",
    "    vad_models[pid] = model\n",
    "\n",
    "def vad_process(audio_file: str):\n",
    "\n",
    "    pid = multiprocessing.current_process().pid\n",
    "\n",
    "    with torch.no_grad():\n",
    "        wav = read_audio(audio_file, sampling_rate=SAMPLING_RATE)\n",
    "        return get_speech_timestamps(\n",
    "            wav,\n",
    "            vad_models[pid],\n",
    "            0.46,   # speech prob threshold\n",
    "            16000,  # sample rate\n",
    "            300,    # min speech duration in ms\n",
    "            20,     # max speech duration in seconds\n",
    "            600,    # min silence duration\n",
    "            512,    # window size\n",
    "            200,    # speech pad ms\n",
    "        )"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parallelization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
    "\n",
    "futures = []\n",
    "\n",
    "with ProcessPoolExecutor(max_workers=NUM_PROCESS, initializer=init_model, initargs=(model,)) as ex:\n",
    "    for i in range(NUM_COPIES):\n",
    "        futures.append(ex.submit(vad_process, f\"en_example{i}.wav\"))\n",
    "\n",
    "for finished in as_completed(futures):\n",
    "    pprint(finished.result())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "diarization",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
snakers4_silero-vad_master/examples/pyaudio-streaming/README.md
ADDED
@@ -0,0 +1,20 @@
# Pyaudio Streaming Example

This example notebook shows how microphone audio fetched by pyaudio can be processed with Silero-VAD.

It has been designed as a low-level example for binary real-time streaming that uses only the model's prediction, processes the binary data, and plots the speech probabilities at the end to visualize them.

Currently, the notebook consists of two examples:
- One that records audio of a predefined length from the microphone, processes it with Silero-VAD, and plots the result afterwards.
- The other one plots the speech probabilities in real time (using jupyterplot) and records the audio until you press enter.

## Example Video for the Real-Time Visualization


https://user-images.githubusercontent.com/8079748/117580455-4622dd00-b0f8-11eb-858d-e6368ed4eada.mp4
snakers4_silero-vad_master/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb
ADDED
@@ -0,0 +1,331 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "62a0cccb",
   "metadata": {},
   "source": [
    "# Pyaudio Microphone Streaming Examples\n",
    "\n",
    "A simple notebook that uses pyaudio to get the microphone audio and then feeds this audio to Silero VAD.\n",
    "\n",
    "I created it as an example of how binary data from a stream can be fed into Silero VAD.\n",
    "\n",
    "\n",
    "Has been tested on Ubuntu 21.04 (x86). After you have installed the dependencies below, no additional setup is required."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64cbe1eb",
   "metadata": {},
   "source": [
    "## Dependencies\n",
    "The cell below lists all used dependencies and the used versions. Uncomment to install them from within the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57bc2aac",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install numpy==1.20.2\n",
    "#!pip install torch==1.9.0\n",
    "#!pip install matplotlib==3.4.2\n",
    "#!pip install torchaudio==0.9.0\n",
    "#!pip install soundfile==0.10.3.post1\n",
    "#!pip install pyaudio==0.2.11"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "110de761",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a647d8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "import numpy as np\n",
    "import torch\n",
    "torch.set_num_threads(1)\n",
    "import torchaudio\n",
    "import matplotlib\n",
    "import matplotlib.pylab as plt\n",
    "torchaudio.set_audio_backend(\"soundfile\")\n",
    "import pyaudio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "725d7066",
   "metadata": {},
   "outputs": [],
   "source": [
    "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
    "                              model='silero_vad',\n",
    "                              force_reload=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c0b2ea7",
   "metadata": {},
   "outputs": [],
   "source": [
    "(get_speech_timestamps,\n",
    " save_audio,\n",
    " read_audio,\n",
    " VADIterator,\n",
    " collect_chunks) = utils"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9112603",
   "metadata": {},
   "source": [
    "### Helper Methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5abc6330",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Taken from utils_vad.py\n",
    "def validate(model,\n",
    "             inputs: torch.Tensor):\n",
    "    with torch.no_grad():\n",
    "        outs = model(inputs)\n",
    "    return outs\n",
    "\n",
    "# Provided by Alexander Veysov\n",
    "def int2float(sound):\n",
    "    abs_max = np.abs(sound).max()\n",
    "    sound = sound.astype('float32')\n",
    "    if abs_max > 0:\n",
    "        sound *= 1/32768\n",
    "    sound = sound.squeeze()  # depends on the use case\n",
    "    return sound"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5124095e",
   "metadata": {},
   "source": [
    "## Pyaudio Set-up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a845356e",
   "metadata": {},
   "outputs": [],
   "source": [
    "FORMAT = pyaudio.paInt16\n",
    "CHANNELS = 1\n",
    "SAMPLE_RATE = 16000\n",
    "CHUNK = int(SAMPLE_RATE / 10)\n",
    "\n",
    "audio = pyaudio.PyAudio()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b910c99",
   "metadata": {},
   "source": [
    "## Simple Example\n",
    "The following example reads the audio as chunks of 1536 samples (96 ms) from the microphone, converts them to a PyTorch tensor, and gets the model's confidence that each chunk is voiced."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d3d2c10",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_samples = 1536\n",
    "frames_to_record = 100  # how many chunks to capture (value assumed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3cb44a4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "stream = audio.open(format=FORMAT,\n",
    "                    channels=CHANNELS,\n",
    "                    rate=SAMPLE_RATE,\n",
    "                    input=True,\n",
    "                    frames_per_buffer=CHUNK)\n",
    "data = []\n",
    "voiced_confidences = []\n",
    "\n",
    "print(\"Started Recording\")\n",
    "for i in range(0, frames_to_record):\n",
    "\n",
    "    audio_chunk = stream.read(num_samples)\n",
    "\n",
    "    # in case you want to save the audio later\n",
    "    data.append(audio_chunk)\n",
    "\n",
    "    audio_int16 = np.frombuffer(audio_chunk, np.int16)\n",
    "\n",
    "    audio_float32 = int2float(audio_int16)\n",
    "\n",
    "    # get the confidences and add them to the list to plot them later\n",
    "    new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n",
    "    voiced_confidences.append(new_confidence)\n",
    "\n",
    "print(\"Stopped the recording\")\n",
    "\n",
    "# plot the confidences for the speech\n",
    "plt.figure(figsize=(20,6))\n",
    "plt.plot(voiced_confidences)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3dda982",
   "metadata": {},
   "source": [
    "## Real Time Visualization\n",
    "\n",
    "As an enhancement to plot the speech probabilities in real time, I added the implementation below.\n",
    "In contrast to the simple one, it records the audio until you stop the recording by pressing enter.\n",
    "While looking into good ways to update matplotlib plots in real time, I found a simple library that does the job: https://github.com/lvwerra/jupyterplot It has some limitations, but works really well for this use case.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05ef4100",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install jupyterplot==0.0.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1d4cdd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from jupyterplot import ProgressPlot\n",
    "import threading\n",
    "\n",
    "continue_recording = True\n",
    "\n",
    "def stop():\n",
    "    input(\"Press Enter to stop the recording:\")\n",
    "    global continue_recording\n",
    "    continue_recording = False\n",
    "\n",
    "def start_recording():\n",
    "\n",
    "    stream = audio.open(format=FORMAT,\n",
    "                        channels=CHANNELS,\n",
    "                        rate=SAMPLE_RATE,\n",
    "                        input=True,\n",
    "                        frames_per_buffer=CHUNK)\n",
    "\n",
    "    data = []\n",
    "    voiced_confidences = []\n",
    "\n",
    "    global continue_recording\n",
    "    continue_recording = True\n",
    "\n",
    "    pp = ProgressPlot(plot_names=[\"Silero VAD\"], line_names=[\"speech probabilities\"], x_label=\"audio chunks\")\n",
    "\n",
    "    stop_listener = threading.Thread(target=stop)\n",
    "    stop_listener.start()\n",
    "\n",
    "    while continue_recording:\n",
    "\n",
    "        audio_chunk = stream.read(num_samples)\n",
    "\n",
    "        # in case you want to save the audio later\n",
    "        data.append(audio_chunk)\n",
    "\n",
    "        audio_int16 = np.frombuffer(audio_chunk, np.int16)\n",
    "\n",
    "        audio_float32 = int2float(audio_int16)\n",
    "\n",
    "        # get the confidences and add them to the list to plot them later\n",
    "        new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n",
    "        voiced_confidences.append(new_confidence)\n",
    "\n",
    "        pp.update(new_confidence)\n",
    "\n",
    "    pp.finalize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e398009",
   "metadata": {},
   "outputs": [],
   "source": [
    "start_recording()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
|
300 |
+
"language": "python",
|
301 |
+
"name": "python3"
|
302 |
+
},
|
303 |
+
"language_info": {
|
304 |
+
"codemirror_mode": {
|
305 |
+
"name": "ipython",
|
306 |
+
"version": 3
|
307 |
+
},
|
308 |
+
"file_extension": ".py",
|
309 |
+
"mimetype": "text/x-python",
|
310 |
+
"name": "python",
|
311 |
+
"nbconvert_exporter": "python",
|
312 |
+
"pygments_lexer": "ipython3",
|
313 |
+
"version": "3.7.10"
|
314 |
+
},
|
315 |
+
"toc": {
|
316 |
+
"base_numbering": 1,
|
317 |
+
"nav_menu": {},
|
318 |
+
"number_sections": true,
|
319 |
+
"sideBar": true,
|
320 |
+
"skip_h1_title": false,
|
321 |
+
"title_cell": "Table of Contents",
|
322 |
+
"title_sidebar": "Contents",
|
323 |
+
"toc_cell": false,
|
324 |
+
"toc_position": {},
|
325 |
+
"toc_section_display": true,
|
326 |
+
"toc_window_display": false
|
327 |
+
}
|
328 |
+
},
|
329 |
+
"nbformat": 4,
|
330 |
+
"nbformat_minor": 5
|
331 |
+
}
|
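A note on the streaming cells above: neither loop ever calls `stop_stream()`/`close()` on the stream or `terminate()` on the `PyAudio` instance, so re-running the cells can leave the input device held open. Below is a minimal script-style sketch of the same loop with explicit cleanup; it assumes the `model` and `int2float` helpers defined in the notebook, and uses a fixed frame count in place of the Enter-to-stop thread.

```python
import numpy as np
import pyaudio
import torch

FORMAT = pyaudio.paInt16
CHANNELS = 1
SAMPLE_RATE = 16000
num_samples = 1536  # ~96 ms per frame at 16 kHz

audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=SAMPLE_RATE, input=True,
                    frames_per_buffer=num_samples)
voiced_confidences = []
try:
    for _ in range(100):  # roughly 10 seconds of audio
        audio_chunk = stream.read(num_samples)
        audio_int16 = np.frombuffer(audio_chunk, np.int16)
        audio_float32 = int2float(audio_int16)  # helper from the cells above
        # model(...) returns the per-frame speech probability
        voiced_confidences.append(
            model(torch.from_numpy(audio_float32), SAMPLE_RATE).item())
finally:
    # the notebook omits this cleanup; without it the device stays open
    stream.stop_stream()
    stream.close()
    audio.terminate()
```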
snakers4_silero-vad_master/examples/rust-example/.gitignore
ADDED
@@ -0,0 +1,2 @@
target/
recorder.wav
snakers4_silero-vad_master/examples/rust-example/Cargo.lock
ADDED
@@ -0,0 +1,781 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "adler"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"

[[package]]
name = "autocfg"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"

[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"

[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

[[package]]
name = "bitflags"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"

[[package]]
name = "block-buffer"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
dependencies = [
 "generic-array",
]

[[package]]
name = "bumpalo"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"

[[package]]
name = "cc"
version = "1.0.98"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f"

[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

[[package]]
name = "cpufeatures"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504"
dependencies = [
 "libc",
]

[[package]]
name = "crc32fast"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
dependencies = [
 "cfg-if",
]

[[package]]
name = "crunchy"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"

[[package]]
name = "crypto-common"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
 "generic-array",
 "typenum",
]

[[package]]
name = "digest"
version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
 "block-buffer",
 "crypto-common",
]

[[package]]
name = "errno"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
dependencies = [
 "libc",
 "windows-sys",
]

[[package]]
name = "filetime"
version = "0.2.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd"
dependencies = [
 "cfg-if",
 "libc",
 "redox_syscall",
 "windows-sys",
]

[[package]]
name = "flate2"
version = "1.0.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
dependencies = [
 "crc32fast",
 "miniz_oxide",
]

[[package]]
name = "form_urlencoded"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
dependencies = [
 "percent-encoding",
]

[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
 "typenum",
 "version_check",
]

[[package]]
name = "getrandom"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
 "cfg-if",
 "libc",
 "wasi",
]

[[package]]
name = "half"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
dependencies = [
 "cfg-if",
 "crunchy",
]

[[package]]
name = "hound"
version = "3.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"

[[package]]
name = "idna"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
dependencies = [
 "unicode-bidi",
 "unicode-normalization",
]

[[package]]
name = "js-sys"
version = "0.3.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
dependencies = [
 "wasm-bindgen",
]

[[package]]
name = "libc"
version = "0.2.155"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"

[[package]]
name = "libloading"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19"
dependencies = [
 "cfg-if",
 "windows-targets",
]

[[package]]
name = "linux-raw-sys"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"

[[package]]
name = "log"
version = "0.4.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"

[[package]]
name = "matrixmultiply"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2"
dependencies = [
 "autocfg",
 "rawpointer",
]

[[package]]
name = "miniz_oxide"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae"
dependencies = [
 "adler",
]

[[package]]
name = "ndarray"
version = "0.15.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
dependencies = [
 "matrixmultiply",
 "num-complex",
 "num-integer",
 "num-traits",
 "rawpointer",
]

[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
 "num-traits",
]

[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
 "num-traits",
]

[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
 "autocfg",
]

[[package]]
name = "once_cell"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"

[[package]]
name = "ort"
version = "2.0.0-rc.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0bc80894094c6a875bfac64415ed456fa661081a278a035e22be661305c87e14"
dependencies = [
 "half",
 "js-sys",
 "libloading",
 "ndarray",
 "ort-sys",
 "thiserror",
 "tracing",
 "web-sys",
]

[[package]]
name = "ort-sys"
version = "2.0.0-rc.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3d9c1373fc813d3f024d394f621f4c6dde0734c79b1c17113c3bb5bf0084bbe"
dependencies = [
 "flate2",
 "sha2",
 "tar",
 "ureq",
]

[[package]]
name = "percent-encoding"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"

[[package]]
name = "pin-project-lite"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"

[[package]]
name = "proc-macro2"
version = "1.0.84"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6"
dependencies = [
 "unicode-ident",
]

[[package]]
name = "quote"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [
 "proc-macro2",
]

[[package]]
name = "rawpointer"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"

[[package]]
name = "redox_syscall"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
 "bitflags 1.3.2",
]

[[package]]
name = "ring"
version = "0.17.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d"
dependencies = [
 "cc",
 "cfg-if",
 "getrandom",
 "libc",
 "spin",
 "untrusted",
 "windows-sys",
]

[[package]]
name = "rust-example"
version = "0.1.0"
dependencies = [
 "hound",
 "ndarray",
 "ort",
]

[[package]]
name = "rustix"
version = "0.38.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
dependencies = [
 "bitflags 2.5.0",
 "errno",
 "libc",
 "linux-raw-sys",
 "windows-sys",
]

[[package]]
name = "rustls"
version = "0.22.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
dependencies = [
 "log",
 "ring",
 "rustls-pki-types",
 "rustls-webpki",
 "subtle",
 "zeroize",
]

[[package]]
name = "rustls-pki-types"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d"

[[package]]
name = "rustls-webpki"
version = "0.102.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff448f7e92e913c4b7d4c6d8e4540a1724b319b4152b8aef6d4cf8339712b33e"
dependencies = [
 "ring",
 "rustls-pki-types",
 "untrusted",
]

[[package]]
name = "sha2"
version = "0.10.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
dependencies = [
 "cfg-if",
 "cpufeatures",
 "digest",
]

[[package]]
name = "spin"
version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"

[[package]]
name = "subtle"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"

[[package]]
name = "syn"
version = "2.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5"
dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
]

[[package]]
name = "tar"
version = "0.4.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
dependencies = [
 "filetime",
 "libc",
 "xattr",
]

[[package]]
name = "thiserror"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
dependencies = [
 "thiserror-impl",
]

[[package]]
name = "thiserror-impl"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "tinyvec"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50"
dependencies = [
 "tinyvec_macros",
]

[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

[[package]]
name = "tracing"
version = "0.1.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
dependencies = [
 "pin-project-lite",
 "tracing-attributes",
 "tracing-core",
]

[[package]]
name = "tracing-attributes"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "tracing-core"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
dependencies = [
 "once_cell",
]

[[package]]
name = "typenum"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"

[[package]]
name = "unicode-bidi"
version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"

[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"

[[package]]
name = "unicode-normalization"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
dependencies = [
 "tinyvec",
]

[[package]]
name = "untrusted"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"

[[package]]
name = "ureq"
version = "2.9.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
dependencies = [
 "base64",
 "log",
 "once_cell",
 "rustls",
 "rustls-pki-types",
 "rustls-webpki",
 "url",
 "webpki-roots",
]

[[package]]
name = "url"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"
dependencies = [
 "form_urlencoded",
 "idna",
 "percent-encoding",
]

[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"

[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

[[package]]
name = "wasm-bindgen"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
dependencies = [
 "cfg-if",
 "wasm-bindgen-macro",
]

[[package]]
name = "wasm-bindgen-backend"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
dependencies = [
 "bumpalo",
 "log",
 "once_cell",
 "proc-macro2",
 "quote",
 "syn",
 "wasm-bindgen-shared",
]

[[package]]
name = "wasm-bindgen-macro"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
]

[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
 "wasm-bindgen-backend",
 "wasm-bindgen-shared",
]

[[package]]
name = "wasm-bindgen-shared"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"

[[package]]
name = "web-sys"
version = "0.3.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
dependencies = [
 "js-sys",
 "wasm-bindgen",
]

[[package]]
name = "webpki-roots"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009"
dependencies = [
 "rustls-pki-types",
]

[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
 "windows-targets",
]

[[package]]
name = "windows-targets"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
dependencies = [
 "windows_aarch64_gnullvm",
 "windows_aarch64_msvc",
 "windows_i686_gnu",
 "windows_i686_gnullvm",
 "windows_i686_msvc",
 "windows_x86_64_gnu",
 "windows_x86_64_gnullvm",
 "windows_x86_64_msvc",
]

[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"

[[package]]
name = "windows_aarch64_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"

[[package]]
name = "windows_i686_gnu"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"

[[package]]
name = "windows_i686_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"

[[package]]
name = "windows_i686_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"

[[package]]
name = "windows_x86_64_gnu"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"

[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"

[[package]]
name = "windows_x86_64_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"

[[package]]
name = "xattr"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f"
dependencies = [
 "libc",
 "linux-raw-sys",
 "rustix",
]

[[package]]
name = "zeroize"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
snakers4_silero-vad_master/examples/rust-example/Cargo.toml
ADDED
@@ -0,0 +1,9 @@
[package]
name = "rust-example"
version = "0.1.0"
edition = "2021"

[dependencies]
ort = { version = "2.0.0-rc.2", features = ["load-dynamic", "ndarray"] }
ndarray = "0.15"
hound = "3"
snakers4_silero-vad_master/examples/rust-example/README.md
ADDED
@@ -0,0 +1,19 @@
# Stream example in Rust
Made after the [C++ stream example](https://github.com/snakers4/silero-vad/tree/master/examples/cpp).

## Dependencies
- To build the Rust crate `ort` you need `cc` installed.

## Usage
Just run
```
cargo run
```
If you run the example outside of this repo, point it at the model via an environment variable:
```
SILERO_MODEL_PATH=/path/to/silero_vad.onnx cargo run
```
To test against a wav file other than `recorder.wav`, pass it as the first argument:
```
cargo run -- /path/to/audio/file.wav
```
snakers4_silero-vad_master/examples/rust-example/src/main.rs
ADDED
@@ -0,0 +1,36 @@
mod silero;
mod utils;
mod vad_iter;

fn main() {
    let model_path = std::env::var("SILERO_MODEL_PATH")
        .unwrap_or_else(|_| String::from("../../files/silero_vad.onnx"));
    let audio_path = std::env::args()
        .nth(1)
        .unwrap_or_else(|| String::from("recorder.wav"));
    let mut wav_reader = hound::WavReader::open(audio_path).unwrap();
    let sample_rate = match wav_reader.spec().sample_rate {
        8000 => utils::SampleRate::EightkHz,
        16000 => utils::SampleRate::SixteenkHz,
        _ => panic!("Unsupported sample rate. Expect 8 kHz or 16 kHz."),
    };
    if wav_reader.spec().sample_format != hound::SampleFormat::Int {
        panic!("Unsupported sample format. Expect Int.");
    }
    let content = wav_reader
        .samples()
        .filter_map(|x| x.ok())
        .collect::<Vec<i16>>();
    assert!(!content.is_empty());
    let silero = silero::Silero::new(sample_rate, model_path).unwrap();
    let vad_params = utils::VadParams {
        sample_rate: sample_rate.into(),
        ..Default::default()
    };
    let mut vad_iterator = vad_iter::VadIter::new(silero, vad_params);
    vad_iterator.process(&content).unwrap();
    for timestamp in vad_iterator.speeches() {
        println!("{}", timestamp);
    }
    println!("Finished.");
}
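For readers coming from the Python side, the equivalent end-to-end flow (open a wav, run VAD, print speech timestamps) in this repo's own Python API looks roughly like the sketch below; it assumes `torch`/`torchaudio` are installed and fetches the model through the `silero_vad` entry point defined in `hubconf.py` later in this diff.

```python
import torch

# hubconf.py exposes 'silero_vad' via torch.hub
model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils

wav = read_audio('recorder.wav', sampling_rate=16000)
# timestamps come back as {'start': ..., 'end': ...} dicts in samples
for ts in get_speech_timestamps(wav, model, sampling_rate=16000):
    print(ts)
```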
snakers4_silero-vad_master/examples/rust-example/src/silero.rs
ADDED
@@ -0,0 +1,59 @@
use crate::utils;
use ndarray::{Array, Array2, ArrayBase, ArrayD, Dim, IxDynImpl, OwnedRepr};
use std::path::Path;

#[derive(Debug)]
pub struct Silero {
    session: ort::Session,
    sample_rate: ArrayBase<OwnedRepr<i64>, Dim<[usize; 1]>>,
    h: ArrayBase<OwnedRepr<f32>, Dim<IxDynImpl>>,
    c: ArrayBase<OwnedRepr<f32>, Dim<IxDynImpl>>,
}

impl Silero {
    pub fn new(
        sample_rate: utils::SampleRate,
        model_path: impl AsRef<Path>,
    ) -> Result<Self, ort::Error> {
        let session = ort::Session::builder()?.commit_from_file(model_path)?;
        let h = ArrayD::<f32>::zeros([2, 1, 64].as_slice());
        let c = ArrayD::<f32>::zeros([2, 1, 64].as_slice());
        let sample_rate = Array::from_shape_vec([1], vec![sample_rate.into()]).unwrap();
        Ok(Self {
            session,
            sample_rate,
            h,
            c,
        })
    }

    pub fn reset(&mut self) {
        self.h = ArrayD::<f32>::zeros([2, 1, 64].as_slice());
        self.c = ArrayD::<f32>::zeros([2, 1, 64].as_slice());
    }

    pub fn calc_level(&mut self, audio_frame: &[i16]) -> Result<f32, ort::Error> {
        let data = audio_frame
            .iter()
            .map(|x| (*x as f32) / (i16::MAX as f32))
            .collect::<Vec<_>>();
        let frame = Array2::<f32>::from_shape_vec([1, data.len()], data).unwrap();
        let inps = ort::inputs![
            frame,
            self.sample_rate.clone(),
            std::mem::take(&mut self.h),
            std::mem::take(&mut self.c)
        ]?;
        let res = self
            .session
            .run(ort::SessionInputs::ValueSlice::<4>(&inps))?;
        self.h = res["hn"].try_extract_tensor().unwrap().to_owned();
        self.c = res["cn"].try_extract_tensor().unwrap().to_owned();
        Ok(*res["output"]
            .try_extract_raw_tensor::<f32>()
            .unwrap()
            .1
            .first()
            .unwrap())
    }
}
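The output names used above (`output`, `hn`, `cn`) come from the ONNX graph in `files/silero_vad.onnx`. The same stateful call can be sketched in Python with onnxruntime; the input names `input`/`sr`/`h`/`c` below are assumed from the positional order in `calc_level` and from the repo's `utils_vad.py` OnnxWrapper, not re-verified against the graph here.

```python
import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession('files/silero_vad.onnx')
sr = np.array([16000], dtype=np.int64)
h = np.zeros((2, 1, 64), dtype=np.float32)  # recurrent hidden state
c = np.zeros((2, 1, 64), dtype=np.float32)  # recurrent cell state

def calc_level(frame_i16, h, c):
    """Mirror of Silero::calc_level: normalize by i16::MAX, run, return prob + new state."""
    x = (frame_i16.astype(np.float32) / 32767.0)[np.newaxis, :]
    out, hn, cn = sess.run(['output', 'hn', 'cn'],
                           {'input': x, 'sr': sr, 'h': h, 'c': c})
    return float(out.flatten()[0]), hn, cn
```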
snakers4_silero-vad_master/examples/rust-example/src/utils.rs
ADDED
@@ -0,0 +1,60 @@
#[derive(Debug, Clone, Copy)]
pub enum SampleRate {
    EightkHz,
    SixteenkHz,
}

impl From<SampleRate> for i64 {
    fn from(value: SampleRate) -> Self {
        match value {
            SampleRate::EightkHz => 8000,
            SampleRate::SixteenkHz => 16000,
        }
    }
}

impl From<SampleRate> for usize {
    fn from(value: SampleRate) -> Self {
        match value {
            SampleRate::EightkHz => 8000,
            SampleRate::SixteenkHz => 16000,
        }
    }
}

#[derive(Debug)]
pub struct VadParams {
    pub frame_size: usize,
    pub threshold: f32,
    pub min_silence_duration_ms: usize,
    pub speech_pad_ms: usize,
    pub min_speech_duration_ms: usize,
    pub max_speech_duration_s: f32,
    pub sample_rate: usize,
}

impl Default for VadParams {
    fn default() -> Self {
        Self {
            frame_size: 64,
            threshold: 0.5,
            min_silence_duration_ms: 0,
            speech_pad_ms: 64,
            min_speech_duration_ms: 64,
            max_speech_duration_s: f32::INFINITY,
            sample_rate: 16000,
        }
    }
}

#[derive(Debug, Default)]
pub struct TimeStamp {
    pub start: i64,
    pub end: i64,
}

impl std::fmt::Display for TimeStamp {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[start:{:08}, end:{:08}]", self.start, self.end)
    }
}
snakers4_silero-vad_master/examples/rust-example/src/vad_iter.rs
ADDED
@@ -0,0 +1,223 @@
use crate::{silero, utils};

const DEBUG_SPEECH_PROB: bool = true;
#[derive(Debug)]
pub struct VadIter {
    silero: silero::Silero,
    params: Params,
    state: State,
}

impl VadIter {
    pub fn new(silero: silero::Silero, params: utils::VadParams) -> Self {
        Self {
            silero,
            params: Params::from(params),
            state: State::new(),
        }
    }

    pub fn process(&mut self, samples: &[i16]) -> Result<(), ort::Error> {
        self.reset_states();
        for audio_frame in samples.chunks_exact(self.params.frame_size_samples) {
            let speech_prob = self.silero.calc_level(audio_frame)?;
            self.state.update(&self.params, speech_prob);
        }
        self.state.check_for_last_speech(samples.len());
        Ok(())
    }

    pub fn speeches(&self) -> &[utils::TimeStamp] {
        &self.state.speeches
    }
}

impl VadIter {
    fn reset_states(&mut self) {
        self.silero.reset();
        self.state = State::new()
    }
}

#[allow(unused)]
#[derive(Debug)]
struct Params {
    frame_size: usize,
    threshold: f32,
    min_silence_duration_ms: usize,
    speech_pad_ms: usize,
    min_speech_duration_ms: usize,
    max_speech_duration_s: f32,
    sample_rate: usize,
    sr_per_ms: usize,
    frame_size_samples: usize,
    min_speech_samples: usize,
    speech_pad_samples: usize,
    max_speech_samples: f32,
    min_silence_samples: usize,
    min_silence_samples_at_max_speech: usize,
}

impl From<utils::VadParams> for Params {
    fn from(value: utils::VadParams) -> Self {
        let frame_size = value.frame_size;
        let threshold = value.threshold;
        let min_silence_duration_ms = value.min_silence_duration_ms;
        let speech_pad_ms = value.speech_pad_ms;
        let min_speech_duration_ms = value.min_speech_duration_ms;
        let max_speech_duration_s = value.max_speech_duration_s;
        let sample_rate = value.sample_rate;
        let sr_per_ms = sample_rate / 1000;
        let frame_size_samples = frame_size * sr_per_ms;
        let min_speech_samples = sr_per_ms * min_speech_duration_ms;
        let speech_pad_samples = sr_per_ms * speech_pad_ms;
        let max_speech_samples = sample_rate as f32 * max_speech_duration_s
            - frame_size_samples as f32
            - 2.0 * speech_pad_samples as f32;
        let min_silence_samples = sr_per_ms * min_silence_duration_ms;
        let min_silence_samples_at_max_speech = sr_per_ms * 98;
        Self {
            frame_size,
            threshold,
            min_silence_duration_ms,
            speech_pad_ms,
            min_speech_duration_ms,
            max_speech_duration_s,
            sample_rate,
            sr_per_ms,
            frame_size_samples,
            min_speech_samples,
            speech_pad_samples,
            max_speech_samples,
            min_silence_samples,
            min_silence_samples_at_max_speech,
        }
    }
}

#[derive(Debug, Default)]
struct State {
    current_sample: usize,
    temp_end: usize,
    next_start: usize,
    prev_end: usize,
    triggered: bool,
    current_speech: utils::TimeStamp,
    speeches: Vec<utils::TimeStamp>,
}

impl State {
    fn new() -> Self {
        Default::default()
    }

    fn update(&mut self, params: &Params, speech_prob: f32) {
        self.current_sample += params.frame_size_samples;
        if speech_prob > params.threshold {
            if self.temp_end != 0 {
                self.temp_end = 0;
                if self.next_start < self.prev_end {
                    self.next_start = self
                        .current_sample
                        .saturating_sub(params.frame_size_samples)
                }
            }
            if !self.triggered {
                self.debug(speech_prob, params, "start");
                self.triggered = true;
                self.current_speech.start =
                    self.current_sample as i64 - params.frame_size_samples as i64;
            }
            return;
        }
        if self.triggered
            && (self.current_sample as i64 - self.current_speech.start) as f32
                > params.max_speech_samples
        {
            if self.prev_end > 0 {
                self.current_speech.end = self.prev_end as _;
                self.take_speech();
                if self.next_start < self.prev_end {
                    self.triggered = false
                } else {
                    self.current_speech.start = self.next_start as _;
                }
                self.prev_end = 0;
                self.next_start = 0;
                self.temp_end = 0;
            } else {
                self.current_speech.end = self.current_sample as _;
                self.take_speech();
                self.prev_end = 0;
                self.next_start = 0;
                self.temp_end = 0;
                self.triggered = false;
            }
            return;
        }
        if speech_prob >= (params.threshold - 0.15) && (speech_prob < params.threshold) {
            if self.triggered {
                self.debug(speech_prob, params, "speaking")
            } else {
                self.debug(speech_prob, params, "silence")
            }
        }
        if self.triggered && speech_prob < (params.threshold - 0.15) {
            self.debug(speech_prob, params, "end");
            if self.temp_end == 0 {
                self.temp_end = self.current_sample;
            }
            if self.current_sample.saturating_sub(self.temp_end)
                > params.min_silence_samples_at_max_speech
            {
                self.prev_end = self.temp_end;
            }
            if self.current_sample.saturating_sub(self.temp_end) >= params.min_silence_samples {
                self.current_speech.end = self.temp_end as _;
                if self.current_speech.end - self.current_speech.start
                    > params.min_speech_samples as _
                {
                    self.take_speech();
                    self.prev_end = 0;
                    self.next_start = 0;
                    self.temp_end = 0;
                    self.triggered = false;
                }
            }
        }
    }

    fn take_speech(&mut self) {
        self.speeches.push(std::mem::take(&mut self.current_speech)); // current speech becomes TimeStamp::default() due to take()
    }

    fn check_for_last_speech(&mut self, last_sample: usize) {
        if self.current_speech.start > 0 {
            self.current_speech.end = last_sample as _;
            self.take_speech();
            self.prev_end = 0;
            self.next_start = 0;
            self.temp_end = 0;
            self.triggered = false;
        }
    }

    fn debug(&self, speech_prob: f32, params: &Params, title: &str) {
        if DEBUG_SPEECH_PROB {
            let speech = self.current_sample as f32
                - params.frame_size_samples as f32
                - if title == "end" {
                    params.speech_pad_samples
                } else {
                    0
                } as f32; // minus window_size_samples to get precise start time point.
            println!(
                "[{:10}: {:.3} s ({:.3}) {:8}]",
                title,
                speech / params.sample_rate as f32,
                speech_prob,
                self.current_sample - params.frame_size_samples,
            );
        }
    }
}
snakers4_silero-vad_master/files/lang_dict_95.json
ADDED
@@ -0,0 +1 @@
{"59": "mg, Malagasy", "76": "tk, Turkmen", "20": "lb, Luxembourgish, Letzeburgesch", "62": "or, Oriya", "30": "en, English", "26": "oc, Occitan", "69": "no, Norwegian", "77": "sr, Serbian", "90": "bs, Bosnian", "71": "el, Greek, Modern (1453\u2013)", "15": "az, Azerbaijani", "12": "lo, Lao", "85": "zh-HK, Chinese", "79": "cs, Czech", "43": "sv, Swedish", "37": "mn, Mongolian", "32": "fi, Finnish", "51": "tg, Tajik", "46": "am, Amharic", "17": "nn, Norwegian Nynorsk", "40": "ja, Japanese", "8": "it, Italian", "21": "ha, Hausa", "11": "as, Assamese", "29": "fa, Persian", "82": "bn, Bengali", "54": "mk, Macedonian", "31": "sw, Swahili", "45": "vi, Vietnamese", "41": "ur, Urdu", "74": "bo, Tibetan", "4": "hi, Hindi", "86": "mr, Marathi", "3": "fy-NL, Western Frisian", "65": "sk, Slovak", "2": "ln, Lingala", "92": "gl, Galician", "53": "sn, Shona", "87": "su, Sundanese", "35": "tt, Tatar", "93": "kn, Kannada", "6": "yo, Yoruba", "27": "ps, Pashto, Pushto", "34": "hy, Armenian", "25": "pa-IN, Punjabi, Panjabi", "23": "nl, Dutch, Flemish", "48": "th, Thai", "73": "mt, Maltese", "55": "ar, Arabic", "89": "ba, Bashkir", "78": "bg, Bulgarian", "42": "yi, Yiddish", "5": "ru, Russian", "84": "sv-SE, Swedish", "80": "tr, Turkish", "33": "sq, Albanian", "38": "kk, Kazakh", "50": "pl, Polish", "9": "hr, Croatian", "66": "ky, Kirghiz, Kyrgyz", "49": "hu, Hungarian", "10": "si, Sinhala, Sinhalese", "56": "la, Latin", "75": "de, German", "14": "ko, Korean", "22": "id, Indonesian", "47": "sl, Slovenian", "57": "be, Belarusian", "36": "ta, Tamil", "7": "da, Danish", "91": "sd, Sindhi", "28": "et, Estonian", "63": "pt, Portuguese", "60": "ne, Nepali", "94": "zh-TW, Chinese", "18": "zh-CN, Chinese", "88": "rw, Kinyarwanda", "19": "es, Spanish, Castilian", "39": "ht, Haitian, Haitian Creole", "64": "tl, Tagalog", "83": "ms, Malay", "70": "ro, Romanian, Moldavian, Moldovan", "68": "pa, Punjabi, Panjabi", "52": "uz, Uzbek", "58": "km, Central Khmer", "67": "my, Burmese", "0": "fr, French", "24": "af, Afrikaans", "16": "gu, Gujarati", "81": "so, Somali", "13": "uk, Ukrainian", "44": "ca, Catalan, Valencian", "72": "ml, Malayalam", "61": "te, Telugu", "1": "zh, Chinese"}
snakers4_silero-vad_master/files/lang_group_dict_95.json
ADDED
@@ -0,0 +1 @@
{"0": ["Afrikaans", "Dutch, Flemish", "Western Frisian"], "1": ["Turkish", "Azerbaijani"], "2": ["Russian", "Slovak", "Ukrainian", "Czech", "Polish", "Belarusian"], "3": ["Bulgarian", "Macedonian", "Serbian", "Croatian", "Bosnian", "Slovenian"], "4": ["Norwegian Nynorsk", "Swedish", "Danish", "Norwegian"], "5": ["English"], "6": ["Finnish", "Estonian"], "7": ["Yiddish", "Luxembourgish, Letzeburgesch", "German"], "8": ["Spanish", "Occitan", "Portuguese", "Catalan, Valencian", "Galician", "Spanish, Castilian", "Italian"], "9": ["Maltese", "Arabic"], "10": ["Marathi"], "11": ["Hindi", "Urdu"], "12": ["Lao", "Thai"], "13": ["Malay", "Indonesian"], "14": ["Romanian, Moldavian, Moldovan"], "15": ["Tagalog"], "16": ["Tajik", "Persian"], "17": ["Kazakh", "Uzbek", "Kirghiz, Kyrgyz"], "18": ["Kinyarwanda"], "19": ["Tatar", "Bashkir"], "20": ["French"], "21": ["Chinese"], "22": ["Lingala"], "23": ["Yoruba"], "24": ["Sinhala, Sinhalese"], "25": ["Assamese"], "26": ["Korean"], "27": ["Gujarati"], "28": ["Hausa"], "29": ["Punjabi, Panjabi"], "30": ["Pashto, Pushto"], "31": ["Swahili"], "32": ["Albanian"], "33": ["Armenian"], "34": ["Mongolian"], "35": ["Tamil"], "36": ["Haitian, Haitian Creole"], "37": ["Japanese"], "38": ["Vietnamese"], "39": ["Amharic"], "40": ["Hungarian"], "41": ["Shona"], "42": ["Latin"], "43": ["Central Khmer"], "44": ["Malagasy"], "45": ["Nepali"], "46": ["Telugu"], "47": ["Oriya"], "48": ["Burmese"], "49": ["Greek, Modern (1453\u2013)"], "50": ["Malayalam"], "51": ["Tibetan"], "52": ["Turkmen"], "53": ["Somali"], "54": ["Bengali"], "55": ["Sundanese"], "56": ["Sindhi"], "57": ["Kannada"]}
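Both JSON files map stringified integer indices to labels. A quick sketch of resolving a classifier index against them (index 30 is used purely as an example value):

```python
import json

with open('files/lang_dict_95.json') as f:
    lang_dict = json.load(f)
with open('files/lang_group_dict_95.json') as f:
    lang_group_dict = json.load(f)

predicted_idx = 30                    # hypothetical classifier output
print(lang_dict[str(predicted_idx)])  # keys are strings -> 'en, English'
print(lang_group_dict['5'])           # -> ['English']
```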
snakers4_silero-vad_master/files/silero_logo.jpg
ADDED
snakers4_silero-vad_master/files/silero_vad.jit
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:99033608562094bbb44e2363198cd47647a668f846c4c9a9edde68b4800b5fd4
size 1439299
snakers4_silero-vad_master/files/silero_vad.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a35ebf52fd3ce5f1469b2a36158dba761bc47b973ea3382b3186ca15b1f5af28
size 1807522
snakers4_silero-vad_master/hubconf.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+dependencies = ['torch', 'torchaudio']
+import torch
+import json
+import os
+from utils_vad import (init_jit_model,
+                       get_speech_timestamps,
+                       get_number_ts,
+                       get_language,
+                       get_language_and_group,
+                       save_audio,
+                       read_audio,
+                       VADIterator,
+                       collect_chunks,
+                       drop_chunks,
+                       Validator,
+                       OnnxWrapper)
+
+
+def versiontuple(v):
+    splitted = v.split('+')[0].split(".")
+    version_list = []
+    for i in splitted:
+        try:
+            version_list.append(int(i))
+        except:
+            version_list.append(0)
+    return tuple(version_list)
+
+
+def silero_vad(onnx=False, force_onnx_cpu=False):
+    """Silero Voice Activity Detector
+    Returns a model with a set of utils
+    Please see https://github.com/snakers4/silero-vad for usage examples
+    """
+
+    if not onnx:
+        installed_version = torch.__version__
+        supported_version = '1.12.0'
+        if versiontuple(installed_version) < versiontuple(supported_version):
+            raise Exception(f'Please install torch {supported_version} or greater ({installed_version} installed)')
+
+    model_dir = os.path.join(os.path.dirname(__file__), 'files')
+    if onnx:
+        model = OnnxWrapper(os.path.join(model_dir, 'silero_vad.onnx'), force_onnx_cpu)
+    else:
+        model = init_jit_model(os.path.join(model_dir, 'silero_vad.jit'))
+    utils = (get_speech_timestamps,
+             save_audio,
+             read_audio,
+             VADIterator,
+             collect_chunks)
+
+    return model, utils
+
+
+def silero_number_detector(onnx=False, force_onnx_cpu=False):
+    """Silero Number Detector
+    Returns a model with a set of utils
+    Please see https://github.com/snakers4/silero-vad for usage examples
+    """
+    raise NotImplementedError('This model has been deprecated and is not supported anymore.')
+    if onnx:
+        url = 'https://models.silero.ai/vad_models/number_detector.onnx'
+    else:
+        url = 'https://models.silero.ai/vad_models/number_detector.jit'
+    model = Validator(url, force_onnx_cpu)
+    utils = (get_number_ts,
+             save_audio,
+             read_audio,
+             collect_chunks,
+             drop_chunks)
+
+    return model, utils
+
+
+def silero_lang_detector(onnx=False, force_onnx_cpu=False):
+    """Silero Language Classifier
+    Returns a model with a set of utils
+    Please see https://github.com/snakers4/silero-vad for usage examples
+    """
+    raise NotImplementedError('This model has been deprecated and is not supported anymore.')
+    if onnx:
+        url = 'https://models.silero.ai/vad_models/number_detector.onnx'
+    else:
+        url = 'https://models.silero.ai/vad_models/number_detector.jit'
+    model = Validator(url, force_onnx_cpu)
+    utils = (get_language,
+             read_audio)
+
+    return model, utils
+
+
+def silero_lang_detector_95(onnx=False, force_onnx_cpu=False):
+    """Silero Language Classifier (95 languages)
+    Returns a model with a set of utils
+    Please see https://github.com/snakers4/silero-vad for usage examples
+    """
+    raise NotImplementedError('This model has been deprecated and is not supported anymore.')
+    if onnx:
+        url = 'https://models.silero.ai/vad_models/lang_classifier_95.onnx'
+    else:
+        url = 'https://models.silero.ai/vad_models/lang_classifier_95.jit'
+    model = Validator(url, force_onnx_cpu)
+
+    model_dir = os.path.join(os.path.dirname(__file__), 'files')
+    with open(os.path.join(model_dir, 'lang_dict_95.json'), 'r') as f:
+        lang_dict = json.load(f)
+
+    with open(os.path.join(model_dir, 'lang_group_dict_95.json'), 'r') as f:
+        lang_group_dict = json.load(f)
+
+    utils = (get_language_and_group, read_audio)
+
+    return model, lang_dict, lang_group_dict, utils
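Since `hubconf.py` resolves the model paths relative to itself, the `silero_vad` entry point can be loaded straight from this directory without touching the network. A minimal sketch, assuming the checkout lives at `snakers4_silero-vad_master/` relative to the working directory:

```python
import torch

# source='local' makes torch.hub use the hubconf.py in this folder
# instead of fetching snakers4/silero-vad from GitHub
model, utils = torch.hub.load(repo_or_dir='snakers4_silero-vad_master',
                              model='silero_vad',
                              source='local',
                              onnx=False)
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils
```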
snakers4_silero-vad_master/silero-vad.ipynb
ADDED
@@ -0,0 +1,204 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "heading_collapsed": true,
+    "id": "62A6F_072Fwq"
+   },
+   "source": [
+    "## Install Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "hidden": true,
+    "id": "5w5AkskZ2Fwr"
+   },
+   "outputs": [],
+   "source": [
+    "#@title Install and Import Dependencies\n",
+    "\n",
+    "# this assumes that you have a relevant version of PyTorch installed\n",
+    "!pip install -q torchaudio\n",
+    "\n",
+    "SAMPLING_RATE = 16000\n",
+    "\n",
+    "import torch\n",
+    "torch.set_num_threads(1)\n",
+    "\n",
+    "from IPython.display import Audio\n",
+    "from pprint import pprint\n",
+    "# download example\n",
+    "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "pSifus5IilRp"
+   },
+   "outputs": [],
+   "source": [
+    "USE_ONNX = False # change this to True if you want to test onnx model\n",
+    "if USE_ONNX:\n",
+    "  !pip install -q onnxruntime\n",
+    "  \n",
+    "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
+    "                              model='silero_vad',\n",
+    "                              force_reload=True,\n",
+    "                              onnx=USE_ONNX)\n",
+    "\n",
+    "(get_speech_timestamps,\n",
+    " save_audio,\n",
+    " read_audio,\n",
+    " VADIterator,\n",
+    " collect_chunks) = utils"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "fXbbaUO3jsrw"
+   },
+   "source": [
+    "## Full Audio"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RAfJPb_a-Auj"
+   },
+   "source": [
+    "**Speech timestamps from full audio**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "aI_eydBPjsrx"
+   },
+   "outputs": [],
+   "source": [
+    "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
+    "# get speech timestamps from full audio file\n",
+    "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)\n",
+    "pprint(speech_timestamps)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "OuEobLchjsry"
+   },
+   "outputs": [],
+   "source": [
+    "# merge all speech chunks to one audio\n",
+    "save_audio('only_speech.wav',\n",
+    "           collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE) \n",
+    "Audio('only_speech.wav')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "iDKQbVr8jsry"
+   },
+   "source": [
+    "## Stream imitation example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "q-lql_2Wjsry"
+   },
+   "outputs": [],
+   "source": [
+    "## using VADIterator class\n",
+    "\n",
+    "vad_iterator = VADIterator(model)\n",
+    "wav = read_audio(f'en_example.wav', sampling_rate=SAMPLING_RATE)\n",
+    "\n",
+    "window_size_samples = 1536 # number of samples in a single audio chunk\n",
+    "for i in range(0, len(wav), window_size_samples):\n",
+    "    chunk = wav[i: i+ window_size_samples]\n",
+    "    if len(chunk) < window_size_samples:\n",
+    "        break\n",
+    "    speech_dict = vad_iterator(chunk, return_seconds=True)\n",
+    "    if speech_dict:\n",
+    "        print(speech_dict, end=' ')\n",
+    "vad_iterator.reset_states() # reset model states after each audio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "BX3UgwwB2Fwv"
+   },
+   "outputs": [],
+   "source": [
+    "## just probabilities\n",
+    "\n",
+    "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
+    "speech_probs = []\n",
+    "window_size_samples = 1536\n",
+    "for i in range(0, len(wav), window_size_samples):\n",
+    "    chunk = wav[i: i+ window_size_samples]\n",
+    "    if len(chunk) < window_size_samples:\n",
+    "        break\n",
+    "    speech_prob = model(chunk, SAMPLING_RATE).item()\n",
+    "    speech_probs.append(speech_prob)\n",
+    "vad_iterator.reset_states() # reset model states after each audio\n",
+    "\n",
+    "print(speech_probs[:10]) # speech probabilities for the first 10 chunks"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "name": "silero-vad.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
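The notebook keeps only the speech with `collect_chunks`; the complementary helper `drop_chunks` (defined in `utils_vad.py` below) cuts the detected speech out instead. A sketch of that inverse, assuming it runs after the notebook's cells so that `wav`, `speech_timestamps`, `save_audio`, `SAMPLING_RATE`, and `Audio` already exist:

```python
# drop_chunks is not part of the hub utils tuple, so import it directly
# (assumes utils_vad.py is on the Python path)
from utils_vad import drop_chunks

# inverse of the collect_chunks cell: keep everything except speech
save_audio('only_silence.wav',
           drop_chunks(speech_timestamps, wav),
           sampling_rate=SAMPLING_RATE)
Audio('only_silence.wav')
```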
snakers4_silero-vad_master/utils_vad.py
ADDED
@@ -0,0 +1,545 @@
+import torch
+import torchaudio
+from typing import Callable, List
+import torch.nn.functional as F
+import warnings
+
+languages = ['ru', 'en', 'de', 'es']
+
+
+class OnnxWrapper():
+
+    def __init__(self, path, force_onnx_cpu=False):
+        import numpy as np
+        global np
+        import onnxruntime
+
+        opts = onnxruntime.SessionOptions()
+        opts.inter_op_num_threads = 1
+        opts.intra_op_num_threads = 1
+
+        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
+            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
+        else:
+            self.session = onnxruntime.InferenceSession(path, sess_options=opts)
+
+        self.reset_states()
+        self.sample_rates = [8000, 16000]
+
+    def _validate_input(self, x, sr: int):
+        if x.dim() == 1:
+            x = x.unsqueeze(0)
+        if x.dim() > 2:
+            raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}")
+
+        if sr != 16000 and (sr % 16000 == 0):
+            step = sr // 16000
+            x = x[:,::step]
+            sr = 16000
+
+        if sr not in self.sample_rates:
+            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or a multiple of 16000)")
+
+        if sr / x.shape[1] > 31.25:
+            raise ValueError("Input audio chunk is too short")
+
+        return x, sr
+
+    def reset_states(self, batch_size=1):
+        self._h = np.zeros((2, batch_size, 64)).astype('float32')
+        self._c = np.zeros((2, batch_size, 64)).astype('float32')
+        self._last_sr = 0
+        self._last_batch_size = 0
+
+    def __call__(self, x, sr: int):
+
+        x, sr = self._validate_input(x, sr)
+        batch_size = x.shape[0]
+
+        if not self._last_batch_size:
+            self.reset_states(batch_size)
+        if (self._last_sr) and (self._last_sr != sr):
+            self.reset_states(batch_size)
+        if (self._last_batch_size) and (self._last_batch_size != batch_size):
+            self.reset_states(batch_size)
+
+        if sr in [8000, 16000]:
+            ort_inputs = {'input': x.numpy(), 'h': self._h, 'c': self._c, 'sr': np.array(sr, dtype='int64')}
+            ort_outs = self.session.run(None, ort_inputs)
+            out, self._h, self._c = ort_outs
+        else:
+            raise ValueError()
+
+        self._last_sr = sr
+        self._last_batch_size = batch_size
+
+        out = torch.tensor(out)
+        return out
+
+    def audio_forward(self, x, sr: int, num_samples: int = 512):
+        outs = []
+        x, sr = self._validate_input(x, sr)
+
+        if x.shape[1] % num_samples:
+            pad_num = num_samples - (x.shape[1] % num_samples)
+            x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0)
+
+        self.reset_states(x.shape[0])
+        for i in range(0, x.shape[1], num_samples):
+            wavs_batch = x[:, i:i+num_samples]
+            out_chunk = self.__call__(wavs_batch, sr)
+            outs.append(out_chunk)
+
+        stacked = torch.cat(outs, dim=1)
+        return stacked.cpu()
+
+
+class Validator():
+    def __init__(self, url, force_onnx_cpu):
+        self.onnx = True if url.endswith('.onnx') else False
+        torch.hub.download_url_to_file(url, 'inf.model')
+        if self.onnx:
+            import onnxruntime
+            if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
+                self.model = onnxruntime.InferenceSession('inf.model', providers=['CPUExecutionProvider'])
+            else:
+                self.model = onnxruntime.InferenceSession('inf.model')
+        else:
+            self.model = init_jit_model(model_path='inf.model')
+
+    def __call__(self, inputs: torch.Tensor):
+        with torch.no_grad():
+            if self.onnx:
+                ort_inputs = {'input': inputs.cpu().numpy()}
+                outs = self.model.run(None, ort_inputs)
+                outs = [torch.Tensor(x) for x in outs]
+            else:
+                outs = self.model(inputs)
+
+        return outs
+
+
+def read_audio(path: str,
+               sampling_rate: int = 16000):
+
+    sox_backends = set(['sox', 'sox_io'])
+    audio_backends = torchaudio.list_audio_backends()
+
+    if len(sox_backends.intersection(audio_backends)) > 0:
+        effects = [
+            ['channels', '1'],
+            ['rate', str(sampling_rate)]
+        ]
+
+        wav, sr = torchaudio.sox_effects.apply_effects_file(path, effects=effects)
+    else:
+        wav, sr = torchaudio.load(path)
+
+        if wav.size(0) > 1:
+            wav = wav.mean(dim=0, keepdim=True)
+
+        if sr != sampling_rate:
+            transform = torchaudio.transforms.Resample(orig_freq=sr,
+                                                       new_freq=sampling_rate)
+            wav = transform(wav)
+            sr = sampling_rate
+
+    assert sr == sampling_rate
+    return wav.squeeze(0)
+
+
+def save_audio(path: str,
+               tensor: torch.Tensor,
+               sampling_rate: int = 16000):
+    torchaudio.save(path, tensor.unsqueeze(0), sampling_rate, bits_per_sample=16)
+
+
+def init_jit_model(model_path: str,
+                   device=torch.device('cpu')):
+    model = torch.jit.load(model_path, map_location=device)
+    model.eval()
+    return model
+
+
+def make_visualization(probs, step):
+    import pandas as pd
+    pd.DataFrame({'probs': probs},
+                 index=[x * step for x in range(len(probs))]).plot(figsize=(16, 8),
+                                                                   kind='area', ylim=[0, 1.05], xlim=[0, len(probs) * step],
+                                                                   xlabel='seconds',
+                                                                   ylabel='speech probability',
+                                                                   colormap='tab20')
+
+
+@torch.no_grad()
+def get_speech_timestamps(audio: torch.Tensor,
+                          model,
+                          threshold: float = 0.5,
+                          sampling_rate: int = 16000,
+                          min_speech_duration_ms: int = 250,
+                          max_speech_duration_s: float = float('inf'),
+                          min_silence_duration_ms: int = 100,
+                          window_size_samples: int = 512,
+                          speech_pad_ms: int = 30,
+                          return_seconds: bool = False,
+                          visualize_probs: bool = False,
+                          progress_tracking_callback: Callable[[float], None] = None):
+
+    """
+    This method is used for splitting long audios into speech chunks using silero VAD
+
+    Parameters
+    ----------
+    audio: torch.Tensor, one dimensional
+        One dimensional float torch.Tensor, other types are cast to torch if possible
+
+    model: preloaded .jit silero VAD model
+
+    threshold: float (default - 0.5)
+        Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
+        It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
+
+    sampling_rate: int (default - 16000)
+        Currently silero VAD models support 8000 and 16000 sample rates
+
+    min_speech_duration_ms: int (default - 250 milliseconds)
+        Final speech chunks shorter than min_speech_duration_ms are thrown out
+
+    max_speech_duration_s: int (default - inf)
+        Maximum duration of speech chunks in seconds
+        Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent aggressive cutting.
+        Otherwise, they will be split aggressively just before max_speech_duration_s.
+
+    min_silence_duration_ms: int (default - 100 milliseconds)
+        At the end of each speech chunk, wait for min_silence_duration_ms before separating it
+
+    window_size_samples: int (default - 512 samples)
+        Audio chunks of window_size_samples size are fed to the silero VAD model.
+        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples for 8000 sample rate.
+        Values other than these may affect model performance!!
+
+    speech_pad_ms: int (default - 30 milliseconds)
+        Final speech chunks are padded by speech_pad_ms on each side
+
+    return_seconds: bool (default - False)
+        whether to return timestamps in seconds (default - samples)
+
+    visualize_probs: bool (default - False)
+        whether to draw the probability plot or not
+
+    progress_tracking_callback: Callable[[float], None] (default - None)
+        callback function taking progress in percent as an argument
+
+    Returns
+    ----------
+    speeches: list of dicts
+        list containing beginnings and ends of speech chunks (samples or seconds based on return_seconds)
+    """
+
+    if not torch.is_tensor(audio):
+        try:
+            audio = torch.Tensor(audio)
+        except:
+            raise TypeError("Audio cannot be cast to tensor. Cast it manually")
+
+    if len(audio.shape) > 1:
+        for i in range(len(audio.shape)):  # trying to squeeze empty dimensions
+            audio = audio.squeeze(0)
+        if len(audio.shape) > 1:
+            raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?")
+
+    if sampling_rate > 16000 and (sampling_rate % 16000 == 0):
+        step = sampling_rate // 16000
+        sampling_rate = 16000
+        audio = audio[::step]
+        warnings.warn('Sampling rate is a multiple of 16000, casting to 16000 manually!')
+    else:
+        step = 1
+
+    if sampling_rate == 8000 and window_size_samples > 768:
+        warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 768 for 8000 sample rate!')
+    if window_size_samples not in [256, 512, 768, 1024, 1536]:
+        warnings.warn('Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate')
+
+    model.reset_states()
+    min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
+    speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+    max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples
+    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+    min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+
+    audio_length_samples = len(audio)
+
+    speech_probs = []
+    for current_start_sample in range(0, audio_length_samples, window_size_samples):
+        chunk = audio[current_start_sample: current_start_sample + window_size_samples]
+        if len(chunk) < window_size_samples:
+            chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
+        speech_prob = model(chunk, sampling_rate).item()
+        speech_probs.append(speech_prob)
+        # calculate progress and send it to the callback function
+        progress = current_start_sample + window_size_samples
+        if progress > audio_length_samples:
+            progress = audio_length_samples
+        progress_percent = (progress / audio_length_samples) * 100
+        if progress_tracking_callback:
+            progress_tracking_callback(progress_percent)
+
+    triggered = False
+    speeches = []
+    current_speech = {}
+    neg_threshold = threshold - 0.15
+    temp_end = 0  # to save potential segment end (and tolerate some silence)
+    prev_end = next_start = 0  # to save potential segment limits in case of maximum segment size reached
+
+    for i, speech_prob in enumerate(speech_probs):
+        if (speech_prob >= threshold) and temp_end:
+            temp_end = 0
+            if next_start < prev_end:
+                next_start = window_size_samples * i
+
+        if (speech_prob >= threshold) and not triggered:
+            triggered = True
+            current_speech['start'] = window_size_samples * i
+            continue
+
+        if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples:
+            if prev_end:
+                current_speech['end'] = prev_end
+                speeches.append(current_speech)
+                current_speech = {}
+                if next_start < prev_end:  # previously reached silence (< neg_thres) and is still not speech (< thres)
+                    triggered = False
+                else:
+                    current_speech['start'] = next_start
+                prev_end = next_start = temp_end = 0
+            else:
+                current_speech['end'] = window_size_samples * i
+                speeches.append(current_speech)
+                current_speech = {}
+                prev_end = next_start = temp_end = 0
+                triggered = False
+                continue
+
+        if (speech_prob < neg_threshold) and triggered:
+            if not temp_end:
+                temp_end = window_size_samples * i
+            if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech:  # condition to avoid cutting in very short silence
+                prev_end = temp_end
+            if (window_size_samples * i) - temp_end < min_silence_samples:
+                continue
+            else:
+                current_speech['end'] = temp_end
+                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
+                    speeches.append(current_speech)
+                current_speech = {}
+                prev_end = next_start = temp_end = 0
+                triggered = False
+                continue
+
+    if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
+        current_speech['end'] = audio_length_samples
+        speeches.append(current_speech)
+
+    for i, speech in enumerate(speeches):
+        if i == 0:
+            speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
+        if i != len(speeches) - 1:
+            silence_duration = speeches[i+1]['start'] - speech['end']
+            if silence_duration < 2 * speech_pad_samples:
+                speech['end'] += int(silence_duration // 2)
+                speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - silence_duration // 2))
+            else:
+                speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
+                speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - speech_pad_samples))
+        else:
+            speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
+
+    if return_seconds:
+        for speech_dict in speeches:
+            speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
+            speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)
+    elif step > 1:
+        for speech_dict in speeches:
+            speech_dict['start'] *= step
+            speech_dict['end'] *= step
+
+    if visualize_probs:
+        make_visualization(speech_probs, window_size_samples / sampling_rate)
+
+    return speeches
+
+
+def get_number_ts(wav: torch.Tensor,
+                  model,
+                  model_stride=8,
+                  hop_length=160,
+                  sample_rate=16000):
+    wav = torch.unsqueeze(wav, dim=0)
+    perframe_logits = model(wav)[0]
+    perframe_preds = torch.argmax(torch.softmax(perframe_logits, dim=1), dim=1).squeeze()  # (1, num_frames_strided)
+    extended_preds = []
+    for i in perframe_preds:
+        extended_preds.extend([i.item()] * model_stride)
+    # len(extended_preds) is *num_frames_real*; for each frame of audio we know if it has a number in it.
+    triggered = False
+    timings = []
+    cur_timing = {}
+    for i, pred in enumerate(extended_preds):
+        if pred == 1:
+            if not triggered:
+                cur_timing['start'] = int((i * hop_length) / (sample_rate / 1000))
+                triggered = True
+        elif pred == 0:
+            if triggered:
+                cur_timing['end'] = int((i * hop_length) / (sample_rate / 1000))
+                timings.append(cur_timing)
+                cur_timing = {}
+                triggered = False
+    if cur_timing:
+        cur_timing['end'] = int(len(wav) / (sample_rate / 1000))
+        timings.append(cur_timing)
+    return timings
+
+
+def get_language(wav: torch.Tensor,
+                 model):
+    wav = torch.unsqueeze(wav, dim=0)
+    lang_logits = model(wav)[2]
+    lang_pred = torch.argmax(torch.softmax(lang_logits, dim=1), dim=1).item()  # from 0 to len(languages) - 1
+    assert lang_pred < len(languages)
+    return languages[lang_pred]
+
+
+def get_language_and_group(wav: torch.Tensor,
+                           model,
+                           lang_dict: dict,
+                           lang_group_dict: dict,
+                           top_n=1):
+    wav = torch.unsqueeze(wav, dim=0)
+    lang_logits, lang_group_logits = model(wav)
+
+    softm = torch.softmax(lang_logits, dim=1).squeeze()
+    softm_group = torch.softmax(lang_group_logits, dim=1).squeeze()
+
+    srtd = torch.argsort(softm, descending=True)
+    srtd_group = torch.argsort(softm_group, descending=True)
+
+    outs = []
+    outs_group = []
+    for i in range(top_n):
+        prob = round(softm[srtd[i]].item(), 2)
+        prob_group = round(softm_group[srtd_group[i]].item(), 2)
+        outs.append((lang_dict[str(srtd[i].item())], prob))
+        outs_group.append((lang_group_dict[str(srtd_group[i].item())], prob_group))
+
+    return outs, outs_group
+
+
+class VADIterator:
+    def __init__(self,
+                 model,
+                 threshold: float = 0.5,
+                 sampling_rate: int = 16000,
+                 min_silence_duration_ms: int = 100,
+                 speech_pad_ms: int = 30
+                 ):
+
+        """
+        Class for stream imitation
+
+        Parameters
+        ----------
+        model: preloaded .jit silero VAD model
+
+        threshold: float (default - 0.5)
+            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
+            It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
+
+        sampling_rate: int (default - 16000)
+            Currently silero VAD models support 8000 and 16000 sample rates
+
+        min_silence_duration_ms: int (default - 100 milliseconds)
+            At the end of each speech chunk, wait for min_silence_duration_ms before separating it
+
+        speech_pad_ms: int (default - 30 milliseconds)
+            Final speech chunks are padded by speech_pad_ms on each side
+        """
+
+        self.model = model
+        self.threshold = threshold
+        self.sampling_rate = sampling_rate
+
+        if sampling_rate not in [8000, 16000]:
+            raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
+
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        self.reset_states()
+
+    def reset_states(self):
+
+        self.model.reset_states()
+        self.triggered = False
+        self.temp_end = 0
+        self.current_sample = 0
+
+    @torch.no_grad()
+    def __call__(self, x, return_seconds=False):
+        """
+        x: torch.Tensor
+            audio chunk (see examples in repo)
+
+        return_seconds: bool (default - False)
+            whether to return timestamps in seconds (default - samples)
+        """
+
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except:
+                raise TypeError("Audio cannot be cast to tensor. Cast it manually")
+
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples
+
+        speech_prob = self.model(x, self.sampling_rate).item()
+
+        if (speech_prob >= self.threshold) and self.temp_end:
+            self.temp_end = 0
+
+        if (speech_prob >= self.threshold) and not self.triggered:
+            self.triggered = True
+            speech_start = self.current_sample - self.speech_pad_samples - window_size_samples
+            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
+
+        if (speech_prob < self.threshold - 0.15) and self.triggered:
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return None
+            else:
+                speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
+                self.temp_end = 0
+                self.triggered = False
+                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
+
+        return None
+
+
+def collect_chunks(tss: List[dict],
+                   wav: torch.Tensor):
+    chunks = []
+    for i in tss:
+        chunks.append(wav[i['start']: i['end']])
+    return torch.cat(chunks)
+
+
+def drop_chunks(tss: List[dict],
+                wav: torch.Tensor):
+    chunks = []
+    cur_start = 0
+    for i in tss:
+        chunks.append((wav[cur_start: i['start']]))
+        cur_start = i['end']
+    return torch.cat(chunks)
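For bulk scoring, `OnnxWrapper.audio_forward` pads the waveform to a multiple of `num_samples` and returns one speech probability per fixed-size window, without any of the hysteresis logic in `get_speech_timestamps`. A minimal sketch, assuming onnxruntime is installed and the bundled ONNX model plus the notebook's example file are on disk at the paths shown:

```python
from utils_vad import OnnxWrapper, read_audio

# Paths assume the repo's files/ directory and the downloaded en_example.wav
model = OnnxWrapper('files/silero_vad.onnx', force_onnx_cpu=True)
wav = read_audio('en_example.wav', sampling_rate=16000)

# One probability per 512-sample window, batched over the whole waveform
probs = model.audio_forward(wav, sr=16000, num_samples=512)
print(probs.shape)         # (1, number_of_windows)
print(float(probs.max()))  # peak speech probability
```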