lucy1118 committed on
Commit 8bac14e
1 Parent(s): 72e2d67

Upload 42 files

Files changed (43)
  1. .gitattributes +1 -0
  2. snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/bug_report.md +52 -0
  3. snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/feature_request.md +27 -0
  4. snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/questions---help---support.md +12 -0
  5. snakers4_silero-vad_master/CODE_OF_CONDUCT.md +76 -0
  6. snakers4_silero-vad_master/LICENSE +21 -0
  7. snakers4_silero-vad_master/README.md +113 -0
  8. snakers4_silero-vad_master/__pycache__/hubconf.cpython-310.pyc +0 -0
  9. snakers4_silero-vad_master/__pycache__/utils_vad.cpython-310.pyc +0 -0
  10. snakers4_silero-vad_master/datasets/README.md +84 -0
  11. snakers4_silero-vad_master/examples/colab_record_example.ipynb +241 -0
  12. snakers4_silero-vad_master/examples/cpp/README.md +43 -0
  13. snakers4_silero-vad_master/examples/cpp/silero-vad-onnx.cpp +486 -0
  14. snakers4_silero-vad_master/examples/cpp/wav.h +235 -0
  15. snakers4_silero-vad_master/examples/go/README.md +19 -0
  16. snakers4_silero-vad_master/examples/go/cmd/main.go +60 -0
  17. snakers4_silero-vad_master/examples/go/go.mod +13 -0
  18. snakers4_silero-vad_master/examples/go/go.sum +16 -0
  19. snakers4_silero-vad_master/examples/java-example/pom.xml +30 -0
  20. snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/App.java +69 -0
  21. snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/SlieroVadDetector.java +145 -0
  22. snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/SlieroVadOnnxModel.java +180 -0
  23. snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/README.md +28 -0
  24. snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/microphone_and_webRTC_integration.py +201 -0
  25. snakers4_silero-vad_master/examples/parallel_example.ipynb +149 -0
  26. snakers4_silero-vad_master/examples/pyaudio-streaming/README.md +20 -0
  27. snakers4_silero-vad_master/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb +331 -0
  28. snakers4_silero-vad_master/examples/rust-example/.gitignore +2 -0
  29. snakers4_silero-vad_master/examples/rust-example/Cargo.lock +781 -0
  30. snakers4_silero-vad_master/examples/rust-example/Cargo.toml +9 -0
  31. snakers4_silero-vad_master/examples/rust-example/README.md +19 -0
  32. snakers4_silero-vad_master/examples/rust-example/src/main.rs +36 -0
  33. snakers4_silero-vad_master/examples/rust-example/src/silero.rs +59 -0
  34. snakers4_silero-vad_master/examples/rust-example/src/utils.rs +60 -0
  35. snakers4_silero-vad_master/examples/rust-example/src/vad_iter.rs +223 -0
  36. snakers4_silero-vad_master/files/lang_dict_95.json +1 -0
  37. snakers4_silero-vad_master/files/lang_group_dict_95.json +1 -0
  38. snakers4_silero-vad_master/files/silero_logo.jpg +0 -0
  39. snakers4_silero-vad_master/files/silero_vad.jit +3 -0
  40. snakers4_silero-vad_master/files/silero_vad.onnx +3 -0
  41. snakers4_silero-vad_master/hubconf.py +114 -0
  42. snakers4_silero-vad_master/silero-vad.ipynb +204 -0
  43. snakers4_silero-vad_master/utils_vad.py +545 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+snakers4_silero-vad_master/files/silero_vad.jit filter=lfs diff=lfs merge=lfs -text
snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,52 @@
---
name: Bug report
about: Create a report to help us improve
title: Bug report - [X]
labels: bug
assignees: snakers4

---

## 🐛 Bug

<!-- A clear and concise description of what the bug is. -->

## To Reproduce

Steps to reproduce the behavior:

1.
2.
3.

<!-- If you have a code sample, error messages, or stack traces, please provide them here as well -->

## Expected behavior

<!-- A clear and concise description of what you expected to happen. -->

## Environment

Please copy and paste the output from this
[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py)
(or fill out the checklist below manually).

You can get the script and run it with:
```
wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
# For security purposes, please check the contents of collect_env.py before running it.
python collect_env.py
```

- PyTorch Version (e.g., 1.0):
- OS (e.g., Linux):
- How you installed PyTorch (`conda`, `pip`, source):
- Build command you used (if compiling from source):
- Python version:
- CUDA/cuDNN version:
- GPU models and configuration:
- Any other relevant information:

## Additional context

<!-- Add any other context about the problem here. -->
snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,27 @@
---
name: Feature request
about: Suggest an idea for this project
title: Feature request - [X]
labels: enhancement
assignees: snakers4

---

## 🚀 Feature
<!-- A clear and concise description of the feature proposal -->

## Motivation

<!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link it here too -->

## Pitch

<!-- A clear and concise description of what you want to happen. -->

## Alternatives

<!-- A clear and concise description of any alternative solutions or features you've considered, if any. -->

## Additional context

<!-- Add any other context or screenshots about the feature request here. -->
snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/questions---help---support.md ADDED
@@ -0,0 +1,12 @@
---
name: Questions / Help / Support
about: Ask for help, support or ask a question
title: "❓ Questions / Help / Support"
labels: help wanted
assignees: snakers4

---

## ❓ Questions and Help

We have a [wiki](https://github.com/snakers4/silero-models/wiki) available for our users. Please make sure you have checked it out first.
snakers4_silero-vad_master/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,76 @@
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
  advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
  address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at aveysov@gmail.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
snakers4_silero-vad_master/LICENSE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020-present Silero Team

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
snakers4_silero-vad_master/README.md ADDED
@@ -0,0 +1,113 @@
[![Email](http://img.shields.io/badge/Email-gray.svg?style=for-the-badge&logo=gmail)](mailto:hello@silero.ai) [![Telegram](http://img.shields.io/badge/Telegram-blue.svg?style=for-the-badge&logo=telegram)](https://t.me/silero_speech) [![License: MIT](https://img.shields.io/badge/License-MIT-lightgrey.svg?style=for-the-badge)](https://github.com/snakers4/silero-vad/blob/master/LICENSE)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb)

![header](https://user-images.githubusercontent.com/12515440/89997349-b3523080-dc94-11ea-9906-ca2e8bc50535.png)

<br/>
<h1 align="center">Silero VAD</h1>
<br/>

**Silero VAD** - a pre-trained enterprise-grade [Voice Activity Detector](https://en.wikipedia.org/wiki/Voice_activity_detection) (also see our [STT models](https://github.com/snakers4/silero-models)).

<br/>

<p align="center">
<img src="https://user-images.githubusercontent.com/12515440/228639780-876f7801-8ec5-4daf-89f3-b45b22dd1a73.png" />
</p>

<details>
<summary>Real Time Example</summary>

https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-9be7-004c891dd481.mp4

</details>

<br/>
<h2 align="center">Key Features</h2>
<br/>

- **Stellar accuracy**

  Silero VAD has [excellent results](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#vs-other-available-solutions) on speech detection tasks.

- **Fast**

  One audio chunk (30+ ms) [takes](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics#silero-vad-performance-metrics) less than **1 ms** to process on a single CPU thread. Batching or a GPU can improve performance considerably, and under certain conditions ONNX may even run 4-5x faster.

- **Lightweight**

  The JIT model is around one megabyte in size.

- **General**

  Silero VAD was trained on huge corpora covering over **100** languages, and it performs well on audio from different domains with various levels of background noise and quality.

- **Flexible sampling rate**

  Silero VAD [supports](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#sample-rate-comparison) **8000 Hz** and **16000 Hz** [sampling rates](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate).

- **Flexible chunk size**

  The model was trained on **30 ms** chunks. Longer chunks are supported directly; other sizes may work as well.

- **Highly Portable**

  Silero VAD reaps the benefits of the rich ecosystems built around **PyTorch** and **ONNX**, running everywhere these runtimes are available.

- **No Strings Attached**

  Published under the permissive MIT license, Silero VAD has zero strings attached - no telemetry, no registration, no built-in expiration, no keys, no vendor lock-in.

<br/>
<h2 align="center">Typical Use Cases</h2>
<br/>

- Voice activity detection for IoT / edge / mobile use cases
- Data cleaning and preparation, voice detection in general
- Telephony and call-center automation, voice bots
- Voice interfaces
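<br/>
<h2 align="center">Quick Start</h2>
<br/>

A minimal sketch of typical usage via `torch.hub`, in the spirit of the bundled `silero-vad.ipynb`; here `example.wav` is a placeholder for your own audio file:

```python
import torch

torch.set_num_threads(1)

# Load the model together with the helper utilities from utils_vad.py.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad')
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils

wav = read_audio('example.wav', sampling_rate=16000)  # placeholder path
# A list of {'start': ..., 'end': ...} dicts, in samples by default.
speech_timestamps = get_speech_timestamps(model, wav, sampling_rate=16000)
print(speech_timestamps)
```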
<br/>
<h2 align="center">Links</h2>
<br/>

- [Examples and Dependencies](https://github.com/snakers4/silero-vad/wiki/Examples-and-Dependencies#dependencies)
- [Quality Metrics](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics)
- [Performance Metrics](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics)
- [Versions and Available Models](https://github.com/snakers4/silero-vad/wiki/Version-history-and-Available-Models)
- [Further reading](https://github.com/snakers4/silero-models#further-reading)
- [FAQ](https://github.com/snakers4/silero-vad/wiki/FAQ)

<br/>
<h2 align="center">Get In Touch</h2>
<br/>

Try our models, create an [issue](https://github.com/snakers4/silero-vad/issues/new), start a [discussion](https://github.com/snakers4/silero-vad/discussions/new), join our Telegram [chat](https://t.me/silero_speech), [email](mailto:hello@silero.ai) us, or read our [news](https://t.me/silero_news).

Please see our [wiki](https://github.com/snakers4/silero-models/wiki) and [tiers](https://github.com/snakers4/silero-models/wiki/Licensing-and-Tiers) for relevant information, or [email](mailto:hello@silero.ai) us directly.

**Citations**

```
@misc{Silero VAD,
  author = {Silero Team},
  title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier},
  year = {2021},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/snakers4/silero-vad}},
  commit = {insert_some_commit_here},
  email = {hello@silero.ai}
}
```

<br/>
<h2 align="center">Examples and VAD-based Community Apps</h2>
<br/>

- Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)

- Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web
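
To complement the examples above, a short streaming sketch built on the `VADIterator` helper from `utils_vad.py`; it assumes `model`, `read_audio` and `VADIterator` were obtained via `torch.hub.load` as in the Quick Start section, and `example.wav` is again a placeholder:

```python
# Streaming sketch: feed fixed-size chunks and get start/end events.
vad_iterator = VADIterator(model)
wav = read_audio('example.wav', sampling_rate=16000)  # placeholder path

window_size_samples = 512  # 512/1024/1536 samples are supported at 16 kHz
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    if len(chunk) < window_size_samples:
        break
    speech_dict = vad_iterator(chunk, return_seconds=True)
    if speech_dict:
        print(speech_dict)  # {'start': ...} or {'end': ...} in seconds
vad_iterator.reset_states()  # reset model states after each audio file
```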
snakers4_silero-vad_master/__pycache__/hubconf.cpython-310.pyc ADDED
Binary file (2.61 kB).
 
snakers4_silero-vad_master/__pycache__/utils_vad.cpython-310.pyc ADDED
Binary file (15.7 kB).
 
snakers4_silero-vad_master/datasets/README.md ADDED
@@ -0,0 +1,84 @@
# Silero-VAD Dataset

> The dataset was created with the support of the Foundation for Assistance to Small Innovative Enterprises as part of the federal project "Artificial Intelligence" of the national program "Digital Economy of the Russian Federation".

The links below point to `.feather` files containing open audio datasets annotated with Silero VAD, together with a short description of each dataset and loading examples. The `.feather` files can be opened with the `pandas` library:
```python3
import pandas as pd
dataframe = pd.read_feather(PATH_TO_FEATHER_FILE)
```

Each annotated `.feather` file contains the following columns:
- `speech_timings` - the annotation of the given audio file. It is a list of dictionaries of the form `{'start': START_SECOND, 'end': END_SECOND}`, where `START_SECOND` and `END_SECOND` are the start and end times of speech in seconds. The number of such dictionaries equals the number of speech segments found in the audio;
- `language` - the ISO code of the audio's language.

The columns describing how to download the audio files differ between datasets and are described for each dataset below.
+ **Все данные размечены при временной дискретизации в ~30 миллисекунд (`num_samples` - 512)**
19
+
20
+ | Название | Число часов | Число языков | Ссылка | Лицензия | md5sum |
21
+ |----------------------|-------------|-------------|--------|----------|----------|
22
+ | **Bible.is** | 53,138 | 1,596 | [URL](https://live.bible.is/) | [Уникальная](https://live.bible.is/terms) | ea404eeaf2cd283b8223f63002be11f9 |
23
+ | **globalrecordings.net** | 9,743 | 6,171[^1] | [URL](https://globalrecordings.net/en) | CC BY-NC-SA 4.0 | 3c5c0f31b0abd9fe94ddbe8b1e2eb326 |
24
+ | **VoxLingua107** | 6,628 | 107 | [URL](https://bark.phon.ioc.ee/voxlingua107/) | CC BY 4.0 | 5dfef33b4d091b6d399cfaf3d05f2140 |
25
+ | **Common Voice** | 30,329 | 120 | [URL](https://commonvoice.mozilla.org/en/datasets) | CC0 | 5e30a85126adf74a5fd1496e6ac8695d |
26
+ | **MLS** | 50,709 | 8 | [URL](https://www.openslr.org/94/) | CC BY 4.0 | a339d0e94bdf41bba3c003756254ac4e |
27
+ | **Итого** | **150,547** | **6,171+** | | | |
28
+
29
+ ## Bible.is
30
+
31
+ [Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/BibleIs.feather)
32
+
33
+ - Колонка `audio_link` содержит ссылки на конкретные аудио файлы.
34
+
35
+ ## globalrecordings.net
36
+
37
+ [Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/globalrecordings.feather)
38
+
39
+ - Колонка `folder_link` содержит ссылки на скачивание `.zip` архива для конкретного языка. Внимание! Ссылки на архивы дублируются, т.к каждый архив может содержать множество аудио.
40
+ - Колонка `audio_path` содержит пути до конкретного аудио после распаковки соответствующего архива из колонки `folder_link`
41
+
42
+ ``Количество уникальных ISO кодов данного датасета не совпадает с фактическим количеством представленных языков, т.к некоторые близкие языки могут кодироваться одним и тем же ISO кодом.``
43
+
44
+ ## VoxLingua107
45
+
46
+ [Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/VoxLingua107.feather)
47
+
48
+ - Колонка `folder_link` содержит ссылки на скачивание `.zip` архива для конкретного языка. Внимание! Ссылки на архивы дублируются, т.к каждый архив может содержать множество аудио.
49
+ - Колонка `audio_path` содержит пути до конкретного аудио после распаковки соответствующего архива из колонки `folder_link`
50
+
51
+ ## Common Voice
52
+
53
+ [Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/common_voice.feather)
54
+
55
+ Этот датасет невозможно скачать по статичным ссылкам. Для загрузки необходимо перейти по [ссылке](https://commonvoice.mozilla.org/en/datasets) и, получив доступ в соответствующей форме, скачать архивы для каждого доступного языка. Внимание! Представленная разметка актуальна для версии исходного датасета `Common Voice Corpus 16.1`.
56
+
57
+ - Колонка `audio_path` содержит уникальные названия `.mp3` файлов, полученных после скачивания соответствующего датасета.
58
+
59
+ ## MLS
60
+
61
+ [Ссылка на `.feather` файл с разметкой](https://models.silero.ai/vad_datasets/MLS.feather)
62
+
63
+ - Колонка `folder_link` содержит ссылки на скачивание `.zip` архива для конкретного языка. Внимание! Ссылки на архивы дублируются, т.к каждый архив может содержать множество аудио.
64
+ - Колонка `audio_path` содержит пути до конкретного аудио после распаковки соответствующего архива из колонки `folder_link`
65
+
66
+ ## Лицензия
67
+
68
+ Данный датасет распространяется под [лицензией](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en) `CC BY-NC-SA 4.0`.
69
+
70
+ ## Цитирование
71
+
72
+ ```
73
+ @misc{Silero VAD Dataset,
74
+ author = {Silero Team},
75
+ title = {Silero-VAD Dataset: a large public Internet-scale dataset for voice activity detection for 6000+ languages},
76
+ year = {2024},
77
+ publisher = {GitHub},
78
+ journal = {GitHub repository},
79
+ howpublished = {\url{https://github.com/snakers4/silero-vad/datasets/README.md}},
80
+ email = {hello@silero.ai}
81
+ }
82
+ ```
83
+
84
+ [^1]: ``Количество уникальных ISO кодов данного датасета не совпадает с фактическим количеством представленных языков, т.к некоторые близкие языки могут кодироваться одним и тем же ISO кодом.``
snakers4_silero-vad_master/examples/colab_record_example.ipynb ADDED
@@ -0,0 +1,241 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bccAucKjnPHm"
      },
      "source": [
        "### Dependencies and inputs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cSih95WFmwgi"
      },
      "outputs": [],
      "source": [
        "!pip -q install pydub\n",
        "from google.colab import output\n",
        "from base64 import b64decode, b64encode\n",
        "from io import BytesIO\n",
        "import numpy as np\n",
        "from pydub import AudioSegment\n",
        "from IPython.display import HTML, display\n",
        "import torch\n",
        "import matplotlib.pyplot as plt\n",
        "import moviepy.editor as mpe\n",
        "from matplotlib.animation import FuncAnimation, FFMpegWriter\n",
        "import matplotlib\n",
        "matplotlib.use('Agg')\n",
        "\n",
        "torch.set_num_threads(1)\n",
        "\n",
        "model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
        "                          model='silero_vad',\n",
        "                          force_reload=True)\n",
        "\n",
        "def int2float(sound):\n",
        "    abs_max = np.abs(sound).max()\n",
        "    sound = sound.astype('float32')\n",
        "    if abs_max > 0:\n",
        "        sound *= 1/32768\n",
        "    sound = sound.squeeze()\n",
        "    return sound\n",
        "\n",
        "AUDIO_HTML = \"\"\"\n",
        "<script>\n",
        "var my_div = document.createElement(\"DIV\");\n",
        "var my_p = document.createElement(\"P\");\n",
        "var my_btn = document.createElement(\"BUTTON\");\n",
        "var t = document.createTextNode(\"Press to start recording\");\n",
        "\n",
        "my_btn.appendChild(t);\n",
        "//my_p.appendChild(my_btn);\n",
        "my_div.appendChild(my_btn);\n",
        "document.body.appendChild(my_div);\n",
        "\n",
        "var base64data = 0;\n",
        "var reader;\n",
        "var recorder, gumStream;\n",
        "var recordButton = my_btn;\n",
        "\n",
        "var handleSuccess = function(stream) {\n",
        "  gumStream = stream;\n",
        "  var options = {\n",
        "    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n",
        "    mimeType : 'audio/webm;codecs=opus'\n",
        "    //mimeType : 'audio/webm;codecs=pcm'\n",
        "  };\n",
        "  //recorder = new MediaRecorder(stream, options);\n",
        "  recorder = new MediaRecorder(stream);\n",
        "  recorder.ondataavailable = function(e) {\n",
        "    var url = URL.createObjectURL(e.data);\n",
        "    // var preview = document.createElement('audio');\n",
        "    // preview.controls = true;\n",
        "    // preview.src = url;\n",
        "    // document.body.appendChild(preview);\n",
        "\n",
        "    reader = new FileReader();\n",
        "    reader.readAsDataURL(e.data);\n",
        "    reader.onloadend = function() {\n",
        "      base64data = reader.result;\n",
        "      //console.log(\"Inside FileReader:\" + base64data);\n",
        "    }\n",
        "  };\n",
        "  recorder.start();\n",
        "};\n",
        "\n",
        "recordButton.innerText = \"Recording... press to stop\";\n",
        "\n",
        "navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);\n",
        "\n",
        "\n",
        "function toggleRecording() {\n",
        "  if (recorder && recorder.state == \"recording\") {\n",
        "    recorder.stop();\n",
        "    gumStream.getAudioTracks()[0].stop();\n",
        "    recordButton.innerText = \"Saving recording...\"\n",
        "  }\n",
        "}\n",
        "\n",
        "// https://stackoverflow.com/a/951057\n",
        "function sleep(ms) {\n",
        "  return new Promise(resolve => setTimeout(resolve, ms));\n",
        "}\n",
        "\n",
        "var data = new Promise(resolve=>{\n",
        "//recordButton.addEventListener(\"click\", toggleRecording);\n",
        "recordButton.onclick = ()=>{\n",
        "toggleRecording()\n",
        "\n",
        "sleep(2000).then(() => {\n",
        "  // wait 2000ms for the data to be available...\n",
        "  // ideally this should use something like await...\n",
        "  //console.log(\"Inside data:\" + base64data)\n",
        "  resolve(base64data.toString())\n",
        "\n",
        "});\n",
        "\n",
        "}\n",
        "});\n",
        "\n",
        "</script>\n",
        "\"\"\"\n",
        "\n",
        "def record(sec=10):\n",
        "    display(HTML(AUDIO_HTML))\n",
        "    s = output.eval_js(\"data\")\n",
        "    b = b64decode(s.split(',')[1])\n",
        "    audio = AudioSegment.from_file(BytesIO(b))\n",
        "    audio.export('test.mp3', format='mp3')\n",
        "    audio = audio.set_channels(1)\n",
        "    audio = audio.set_frame_rate(16000)\n",
        "    audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
        "    audio_tens = torch.tensor(audio_float)\n",
        "    return audio_tens\n",
        "\n",
        "def make_animation(probs, audio_duration, interval=40):\n",
        "    fig = plt.figure(figsize=(16, 9))\n",
        "    ax = plt.axes(xlim=(0, audio_duration), ylim=(0, 1.02))\n",
        "    line, = ax.plot([], [], lw=2)\n",
        "    x = [i / 16000 * 512 for i in range(len(probs))]\n",
        "    plt.xlabel('Time, seconds', fontsize=16)\n",
        "    plt.ylabel('Speech Probability', fontsize=16)\n",
        "\n",
        "    def init():\n",
        "        plt.fill_between(x, probs, color='#064273')\n",
        "        line.set_data([], [])\n",
        "        line.set_color('#990000')\n",
        "        return line,\n",
        "\n",
        "    def animate(i):\n",
        "        x = i * interval / 1000 - 0.04\n",
        "        y = np.linspace(0, 1.02, 2)\n",
        "\n",
        "        line.set_data(x, y)\n",
        "        line.set_color('#990000')\n",
        "        return line,\n",
        "\n",
        "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n",
        "\n",
        "    f = r\"animation.mp4\"\n",
        "    writervideo = FFMpegWriter(fps=1000/interval)\n",
        "    anim.save(f, writer=writervideo)\n",
        "    plt.close('all')\n",
        "\n",
        "def combine_audio(vidname, audname, outname, fps=25):\n",
        "    my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
        "    audio_background = mpe.AudioFileClip(audname)\n",
        "    final_clip = my_clip.set_audio(audio_background)\n",
        "    final_clip.write_videofile(outname, fps=fps, verbose=False)\n",
        "\n",
        "def record_make_animation():\n",
        "    tensor = record()\n",
        "\n",
        "    print('Calculating probabilities...')\n",
        "    speech_probs = []\n",
        "    window_size_samples = 512\n",
        "    for i in range(0, len(tensor), window_size_samples):\n",
        "        if len(tensor[i: i + window_size_samples]) < window_size_samples:\n",
        "            break\n",
        "        speech_prob = model(tensor[i: i + window_size_samples], 16000).item()\n",
        "        speech_probs.append(speech_prob)\n",
        "    model.reset_states()\n",
        "    print('Making animation...')\n",
        "    make_animation(speech_probs, len(tensor) / 16000)\n",
        "\n",
        "    print('Merging your voice with animation...')\n",
        "    combine_audio('animation.mp4', 'test.mp3', 'merged.mp4')\n",
        "    print('Done!')\n",
        "    mp4 = open('merged.mp4', 'rb').read()\n",
        "    data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
        "    display(HTML(\"\"\"\n",
        "    <video width=800 controls>\n",
        "      <source src=\"%s\" type=\"video/mp4\">\n",
        "    </video>\n",
        "    \"\"\" % data_url))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IFVs3GvTnpB1"
      },
      "source": [
        "## Record example"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5EBjrTwiqAaQ"
      },
      "outputs": [],
      "source": [
        "record_make_animation()"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [
        "bccAucKjnPHm"
      ],
      "name": "Untitled2.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
snakers4_silero-vad_master/examples/cpp/README.md ADDED
@@ -0,0 +1,43 @@
# Stream example in C++

Here's a simple example of running the VAD model in C++ with ONNX Runtime.

## Requirements

The code was tested in the environment below; feel free to try others.

- WSL2 + Debian-bullseye (docker)
- gcc 12.2.0
- onnxruntime-linux-x64-1.12.1

## Usage

1. Install gcc 12.2.0, or just pull the docker image with `docker pull gcc:12.2.0-bullseye`

2. Install onnxruntime-linux-x64-1.12.1

   - Download the onnxruntime library:

     `wget https://github.com/microsoft/onnxruntime/releases/download/v1.12.1/onnxruntime-linux-x64-1.12.1.tgz`

   - Unpack it. Assume the path is `/root/onnxruntime-linux-x64-1.12.1`

3. Modify the wav path and test configs in the main function:

   `wav::WavReader wav_reader("${path_to_your_wav_file}");`

   Test the sample rate, frame length in ms, threshold, etc.

4. Build with gcc and run

   ```bash
   # Build
   g++ silero-vad-onnx.cpp -I /root/onnxruntime-linux-x64-1.12.1/include/ -L /root/onnxruntime-linux-x64-1.12.1/lib/ -lonnxruntime -Wl,-rpath,/root/onnxruntime-linux-x64-1.12.1/lib/ -o test

   # Run
   ./test
   ```
snakers4_silero-vad_master/examples/cpp/silero-vad-onnx.cpp ADDED
@@ -0,0 +1,486 @@
#include <iostream>
#include <vector>
#include <sstream>
#include <cstring>
#include <limits>
#include <chrono>
#include <memory>
#include <string>
#include <stdexcept>
#include "onnxruntime_cxx_api.h"
#include "wav.h"
#include <cstdio>
#include <cstdarg>

//#define __DEBUG_SPEECH_PROB___

class timestamp_t
{
public:
    int start;
    int end;

    // default + parameterized constructor
    timestamp_t(int start = -1, int end = -1)
        : start(start), end(end)
    {
    };

    // assignment operator modifies object, therefore non-const
    timestamp_t& operator=(const timestamp_t& a)
    {
        start = a.start;
        end = a.end;
        return *this;
    };

    // equality comparison. doesn't modify object. therefore const.
    bool operator==(const timestamp_t& a) const
    {
        return (start == a.start && end == a.end);
    };

    std::string c_str()
    {
        //return std::format("timestamp {:08d}, {:08d}", start, end);
        return format("{start:%08d,end:%08d}", start, end);
    };

private:
    std::string format(const char* fmt, ...)
    {
        char buf[256];

        va_list args;
        va_start(args, fmt);
        const auto r = std::vsnprintf(buf, sizeof buf, fmt, args);
        va_end(args);

        if (r < 0)
            // conversion failed
            return {};

        const size_t len = r;
        if (len < sizeof buf)
            // we fit in the buffer
            return { buf, len };

#if __cplusplus >= 201703L
        // C++17: Create a string and write to its underlying array
        std::string s(len, '\0');
        va_start(args, fmt);
        std::vsnprintf(s.data(), len + 1, fmt, args);
        va_end(args);

        return s;
#else
        // C++11 or C++14: We need to allocate scratch memory
        auto vbuf = std::unique_ptr<char[]>(new char[len + 1]);
        va_start(args, fmt);
        std::vsnprintf(vbuf.get(), len + 1, fmt, args);
        va_end(args);

        return { vbuf.get(), len };
#endif
    };
};


class VadIterator
{
private:
    // OnnxRuntime resources
    Ort::Env env;
    Ort::SessionOptions session_options;
    std::shared_ptr<Ort::Session> session = nullptr;
    Ort::AllocatorWithDefaultOptions allocator;
    Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU);

private:
    void init_engine_threads(int inter_threads, int intra_threads)
    {
        // The method should be called in each thread/proc in multi-thread/proc work
        session_options.SetIntraOpNumThreads(intra_threads);
        session_options.SetInterOpNumThreads(inter_threads);
        session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    };

    void init_onnx_model(const std::wstring& model_path)
    {
        // Init threads = 1 for single-threaded inference
        init_engine_threads(1, 1);
        // Load model
        session = std::make_shared<Ort::Session>(env, model_path.c_str(), session_options);
    };

    void reset_states()
    {
        // Call reset before each audio start
        std::memset(_h.data(), 0, _h.size() * sizeof(float));
        std::memset(_c.data(), 0, _c.size() * sizeof(float));
        triggered = false;
        temp_end = 0;
        current_sample = 0;

        prev_end = next_start = 0;

        speeches.clear();
        current_speech = timestamp_t();
    };

    void predict(const std::vector<float>& data)
    {
        // Create ort tensors
        input.assign(data.begin(), data.end());
        Ort::Value input_ort = Ort::Value::CreateTensor<float>(
            memory_info, input.data(), input.size(), input_node_dims, 2);
        Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
            memory_info, sr.data(), sr.size(), sr_node_dims, 1);
        Ort::Value h_ort = Ort::Value::CreateTensor<float>(
            memory_info, _h.data(), _h.size(), hc_node_dims, 3);
        Ort::Value c_ort = Ort::Value::CreateTensor<float>(
            memory_info, _c.data(), _c.size(), hc_node_dims, 3);

        // Clear and add inputs
        ort_inputs.clear();
        ort_inputs.emplace_back(std::move(input_ort));
        ort_inputs.emplace_back(std::move(sr_ort));
        ort_inputs.emplace_back(std::move(h_ort));
        ort_inputs.emplace_back(std::move(c_ort));

        // Infer
        ort_outputs = session->Run(
            Ort::RunOptions{nullptr},
            input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
            output_node_names.data(), output_node_names.size());

        // Output probability & update h,c recursively
        float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
        float* hn = ort_outputs[1].GetTensorMutableData<float>();
        std::memcpy(_h.data(), hn, size_hc * sizeof(float));
        float* cn = ort_outputs[2].GetTensorMutableData<float>();
        std::memcpy(_c.data(), cn, size_hc * sizeof(float));

        // Push forward sample index
        current_sample += window_size_samples;

        // 1) Speech: reset temp_end when probability >= threshold
        if (speech_prob >= threshold)
        {
#ifdef __DEBUG_SPEECH_PROB___
            float speech = current_sample - window_size_samples; // minus window_size_samples to get the precise start time point.
            printf("{ start: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
            if (temp_end != 0)
            {
                temp_end = 0;
                if (next_start < prev_end)
                    next_start = current_sample - window_size_samples;
            }
            if (triggered == false)
            {
                triggered = true;

                current_speech.start = current_sample - window_size_samples;
            }
            return;
        }

        // 2) Current speech segment exceeded the maximum duration
        if (
            (triggered == true)
            && ((current_sample - current_speech.start) > max_speech_samples)
            ) {
            if (prev_end > 0) {
                current_speech.end = prev_end;
                speeches.push_back(current_speech);
                current_speech = timestamp_t();

                // previously reached silence (< neg_thres) and is still not speech (< thres)
                if (next_start < prev_end)
                    triggered = false;
                else {
                    current_speech.start = next_start;
                }
                prev_end = 0;
                next_start = 0;
                temp_end = 0;
            }
            else {
                current_speech.end = current_sample;
                speeches.push_back(current_speech);
                current_speech = timestamp_t();
                prev_end = 0;
                next_start = 0;
                temp_end = 0;
                triggered = false;
            }
            return;
        }

        // 3) Hysteresis band between (threshold - 0.15) and threshold: keep current state
        if ((speech_prob >= (threshold - 0.15)) && (speech_prob < threshold))
        {
            if (triggered) {
#ifdef __DEBUG_SPEECH_PROB___
                float speech = current_sample - window_size_samples; // minus window_size_samples to get the precise start time point.
                printf("{ speaking: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
            }
            else {
#ifdef __DEBUG_SPEECH_PROB___
                float speech = current_sample - window_size_samples; // minus window_size_samples to get the precise start time point.
                printf("{ silence: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
            }
            return;
        }

        // 4) End of speech
        if (speech_prob < (threshold - 0.15))
        {
#ifdef __DEBUG_SPEECH_PROB___
            float speech = current_sample - window_size_samples - speech_pad_samples; // minus window_size_samples to get the precise start time point.
            printf("{ end: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
            if (triggered == true)
            {
                if (temp_end == 0)
                {
                    temp_end = current_sample;
                }
                if (current_sample - temp_end > min_silence_samples_at_max_speech)
                    prev_end = temp_end;
                // a. silence < min_silence_samples: still considered speaking
                if ((current_sample - temp_end) < min_silence_samples)
                {
                    // nothing to do, wait for more silence
                }
                // b. silence >= min_silence_samples: end speaking
                else
                {
                    current_speech.end = temp_end;
                    if (current_speech.end - current_speech.start > min_speech_samples)
                    {
                        speeches.push_back(current_speech);
                        current_speech = timestamp_t();
                        prev_end = 0;
                        next_start = 0;
                        temp_end = 0;
                        triggered = false;
                    }
                }
            }
            else {
                // the very first windows may already see the end state.
            }
            return;
        }
    };

public:
    void process(const std::vector<float>& input_wav)
    {
        reset_states();

        audio_length_samples = input_wav.size();

        for (int j = 0; j < audio_length_samples; j += window_size_samples)
        {
            if (j + window_size_samples > audio_length_samples)
                break;
            std::vector<float> r{ &input_wav[0] + j, &input_wav[0] + j + window_size_samples };
            predict(r);
        }

        if (current_speech.start >= 0) {
            current_speech.end = audio_length_samples;
            speeches.push_back(current_speech);
            current_speech = timestamp_t();
            prev_end = 0;
            next_start = 0;
            temp_end = 0;
            triggered = false;
        }
    };

    void process(const std::vector<float>& input_wav, std::vector<float>& output_wav)
    {
        process(input_wav);
        collect_chunks(input_wav, output_wav);
    }

    void collect_chunks(const std::vector<float>& input_wav, std::vector<float>& output_wav)
    {
        output_wav.clear();
        for (int i = 0; i < speeches.size(); i++) {
#ifdef __DEBUG_SPEECH_PROB___
            std::cout << speeches[i].c_str() << std::endl;
#endif //#ifdef __DEBUG_SPEECH_PROB___
            std::vector<float> slice(&input_wav[speeches[i].start], &input_wav[speeches[i].end]);
            output_wav.insert(output_wav.end(), slice.begin(), slice.end());
        }
    };

    const std::vector<timestamp_t> get_speech_timestamps() const
    {
        return speeches;
    }

    void drop_chunks(const std::vector<float>& input_wav, std::vector<float>& output_wav)
    {
        output_wav.clear();
        int current_start = 0;
        for (int i = 0; i < speeches.size(); i++) {

            std::vector<float> slice(&input_wav[current_start], &input_wav[speeches[i].start]);
            output_wav.insert(output_wav.end(), slice.begin(), slice.end());
            current_start = speeches[i].end;
        }

        std::vector<float> slice(&input_wav[current_start], &input_wav[input_wav.size()]);
        output_wav.insert(output_wav.end(), slice.begin(), slice.end());
    };

private:
    // model config
    int64_t window_size_samples;  // Assigned at init; supports 256/512/768 for 8k, 512/1024/1536 for 16k.
    int sample_rate;  // Assigned at init; supports 16000 or 8000
    int sr_per_ms;    // Assigned at init; 8 or 16
    float threshold;
    int min_silence_samples;  // sr_per_ms * #ms
    int min_silence_samples_at_max_speech;  // sr_per_ms * 98
    int min_speech_samples;  // sr_per_ms * #ms
    float max_speech_samples;
    int speech_pad_samples;  // usually a
    int audio_length_samples;

    // model states
    bool triggered = false;
    unsigned int temp_end = 0;
    unsigned int current_sample = 0;
    // MAX 4294967295 samples / 8 samples per ms / 1000 / 60 = 8947 minutes
    int prev_end;
    int next_start = 0;

    // Output timestamps
    std::vector<timestamp_t> speeches;
    timestamp_t current_speech;


    // Onnx model
    // Inputs
    std::vector<Ort::Value> ort_inputs;

    std::vector<const char*> input_node_names = { "input", "sr", "h", "c" };
    std::vector<float> input;
    std::vector<int64_t> sr;
    unsigned int size_hc = 2 * 1 * 64;  // It's FIXED.
    std::vector<float> _h;
    std::vector<float> _c;

    int64_t input_node_dims[2] = {};
    const int64_t sr_node_dims[1] = { 1 };
    const int64_t hc_node_dims[3] = { 2, 1, 64 };

    // Outputs
    std::vector<Ort::Value> ort_outputs;
    std::vector<const char*> output_node_names = { "output", "hn", "cn" };

public:
    // Construction
    VadIterator(const std::wstring ModelPath,
        int Sample_rate = 16000, int windows_frame_size = 64,
        float Threshold = 0.5, int min_silence_duration_ms = 0,
        int speech_pad_ms = 64, int min_speech_duration_ms = 64,
        float max_speech_duration_s = std::numeric_limits<float>::infinity())
    {
        init_onnx_model(ModelPath);
        threshold = Threshold;
        sample_rate = Sample_rate;
        sr_per_ms = sample_rate / 1000;

        window_size_samples = windows_frame_size * sr_per_ms;

        min_speech_samples = sr_per_ms * min_speech_duration_ms;
        speech_pad_samples = sr_per_ms * speech_pad_ms;

        max_speech_samples = (
            sample_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
            );

        min_silence_samples = sr_per_ms * min_silence_duration_ms;
        min_silence_samples_at_max_speech = sr_per_ms * 98;

        input.resize(window_size_samples);
        input_node_dims[0] = 1;
        input_node_dims[1] = window_size_samples;

        _h.resize(size_hc);
        _c.resize(size_hc);
        sr.resize(1);
        sr[0] = sample_rate;
    };
};

int main()
{
    std::vector<timestamp_t> stamps;

    // Read wav (16000 Hz, mono, 32-bit float expected)
    wav::WavReader wav_reader("recorder.wav");
    std::vector<float> input_wav(wav_reader.num_samples());
    std::vector<float> output_wav;

    for (int i = 0; i < wav_reader.num_samples(); i++)
    {
        input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
    }

    // ===== Test configs =====
    std::wstring path = L"silero_vad.onnx";
    VadIterator vad(path);

    // ==============================================
    // ===== Example 1 of full function =====
    // ==============================================
    vad.process(input_wav);

    // 1.a get_speech_timestamps
    stamps = vad.get_speech_timestamps();
    for (int i = 0; i < stamps.size(); i++) {
        std::cout << stamps[i].c_str() << std::endl;
    }

    // 1.b collect_chunks output wav
    vad.collect_chunks(input_wav, output_wav);

    // 1.c drop_chunks output wav
    vad.drop_chunks(input_wav, output_wav);

    // ==============================================
    // ===== Example 2 of simple full function =====
    // ==============================================
    vad.process(input_wav, output_wav);

    stamps = vad.get_speech_timestamps();
    for (int i = 0; i < stamps.size(); i++) {
        std::cout << stamps[i].c_str() << std::endl;
    }

    // ==============================================
    // ===== Example 3 of full function =====
    // ==============================================
    for (int i = 0; i < 2; i++)
        vad.process(input_wav, output_wav);
}
snakers4_silero-vad_master/examples/cpp/wav.h ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) 2016 Personal (Binbin Zhang)
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+
16
+ #ifndef FRONTEND_WAV_H_
17
+ #define FRONTEND_WAV_H_
18
+
19
+ #include <assert.h>
20
+ #include <stdint.h>
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include <string>
26
+
27
+ // #include "utils/log.h"
28
+
29
+ namespace wav {
30
+
31
+ struct WavHeader {
32
+ char riff[4]; // "riff"
33
+ unsigned int size;
34
+ char wav[4]; // "WAVE"
35
+ char fmt[4]; // "fmt "
36
+ unsigned int fmt_size;
37
+ uint16_t format;
38
+ uint16_t channels;
39
+ unsigned int sample_rate;
40
+ unsigned int bytes_per_second;
41
+ uint16_t block_size;
42
+ uint16_t bit;
43
+ char data[4]; // "data"
44
+ unsigned int data_size;
45
+ };
46
+
47
+ class WavReader {
48
+ public:
49
+ WavReader() : data_(nullptr) {}
50
+ explicit WavReader(const std::string& filename) { Open(filename); }
51
+
52
+ bool Open(const std::string& filename) {
53
+ FILE* fp = fopen(filename.c_str(), "rb"); //文件读取
54
+ if (NULL == fp) {
55
+ std::cout << "Error in read " << filename;
56
+ return false;
57
+ }
58
+
59
+ WavHeader header;
60
+ fread(&header, 1, sizeof(header), fp);
61
+ if (header.fmt_size < 16) {
62
+ printf("WaveData: expect PCM format data "
63
+ "to have fmt chunk of at least size 16.\n");
64
+ return false;
65
+ } else if (header.fmt_size > 16) {
66
+ int offset = 44 - 8 + header.fmt_size - 16;
67
+ fseek(fp, offset, SEEK_SET);
68
+ fread(header.data, 8, sizeof(char), fp);
69
+ }
70
+ // check "riff" "WAVE" "fmt " "data"
71
+
72
+ // Skip any sub-chunks between "fmt" and "data". Usually there will
73
+ // be a single "fact" sub chunk, but on Windows there can also be a
74
+ // "list" sub chunk.
75
+ while (0 != strncmp(header.data, "data", 4)) {
76
+ // We will just ignore the data in these chunks.
77
+ fseek(fp, header.data_size, SEEK_CUR);
78
+ // read next sub chunk
79
+ fread(header.data, 8, sizeof(char), fp);
80
+ }
81
+
82
+ if (header.data_size == 0) {
83
+ int offset = ftell(fp);
84
+ fseek(fp, 0, SEEK_END);
85
+ header.data_size = ftell(fp) - offset;
86
+ fseek(fp, offset, SEEK_SET);
87
+ }
88
+
89
+ num_channel_ = header.channels;
90
+ sample_rate_ = header.sample_rate;
91
+ bits_per_sample_ = header.bit;
92
+ int num_data = header.data_size / (bits_per_sample_ / 8);
93
+ data_ = new float[num_data]; // Create 1-dim array
94
+ num_samples_ = num_data / num_channel_;
95
+
96
+ std::cout << "num_channel_ :" << num_channel_ << std::endl;
97
+ std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
98
+ std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
99
+ std::cout << "num_samples :" << num_data << std::endl;
100
+ std::cout << "num_data_size :" << header.data_size << std::endl;
101
+
102
+ switch (bits_per_sample_) {
103
+ case 8: {
104
+ char sample;
105
+ for (int i = 0; i < num_data; ++i) {
106
+ fread(&sample, 1, sizeof(char), fp);
107
+ data_[i] = static_cast<float>(sample) / 32768;
108
+ }
109
+ break;
110
+ }
111
+ case 16: {
112
+ int16_t sample;
113
+ for (int i = 0; i < num_data; ++i) {
114
+ fread(&sample, 1, sizeof(int16_t), fp);
115
+ data_[i] = static_cast<float>(sample) / 32768;
116
+ }
117
+ break;
118
+ }
119
+ case 32:
120
+ {
121
+ if (header.format == 1) //S32
122
+ {
123
+ int sample;
124
+ for (int i = 0; i < num_data; ++i) {
125
+ fread(&sample, 1, sizeof(int), fp);
126
+ data_[i] = static_cast<float>(sample) / 32768;
127
+ }
128
+ }
129
+ else if (header.format == 3) // IEEE-float
130
+ {
131
+ float sample;
132
+ for (int i = 0; i < num_data; ++i) {
133
+ fread(&sample, 1, sizeof(float), fp);
134
+ data_[i] = static_cast<float>(sample);
135
+ }
136
+ }
137
+ else {
138
+ printf("unsupported quantization bits\n");
139
+ }
140
+ break;
141
+ }
142
+ default:
143
+ printf("unsupported quantization bits\n");
144
+ break;
145
+ }
146
+
147
+ fclose(fp);
148
+ return true;
149
+ }
150
+
151
+ int num_channel() const { return num_channel_; }
152
+ int sample_rate() const { return sample_rate_; }
153
+ int bits_per_sample() const { return bits_per_sample_; }
154
+ int num_samples() const { return num_samples_; }
155
+
156
+ ~WavReader() {
157
+ delete[] data_;
158
+ }
159
+
160
+ const float* data() const { return data_; }
161
+
162
+ private:
163
+ int num_channel_;
164
+ int sample_rate_;
165
+ int bits_per_sample_;
166
+ int num_samples_; // sample points per channel
167
+ float* data_;
168
+ };
169
+
170
+ class WavWriter {
171
+ public:
172
+ WavWriter(const float* data, int num_samples, int num_channel,
173
+ int sample_rate, int bits_per_sample)
174
+ : data_(data),
175
+ num_samples_(num_samples),
176
+ num_channel_(num_channel),
177
+ sample_rate_(sample_rate),
178
+ bits_per_sample_(bits_per_sample) {}
179
+
180
+ void Write(const std::string& filename) {
181
+ FILE* fp = fopen(filename.c_str(), "w");
182
+ // init char 'riff' 'WAVE' 'fmt ' 'data'
183
+ WavHeader header;
184
+ char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
185
+ 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
186
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
187
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
188
+ 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
189
+ memcpy(&header, wav_header, sizeof(header));
190
+ header.channels = num_channel_;
191
+ header.bit = bits_per_sample_;
192
+ header.sample_rate = sample_rate_;
193
+ header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
194
+ header.size = sizeof(header) - 8 + header.data_size;
195
+ header.bytes_per_second =
196
+ sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
197
+ header.block_size = num_channel_ * (bits_per_sample_ / 8);
198
+
199
+ fwrite(&header, 1, sizeof(header), fp);
200
+
201
+ for (int i = 0; i < num_samples_; ++i) {
202
+ for (int j = 0; j < num_channel_; ++j) {
203
+ switch (bits_per_sample_) {
204
+ case 8: {
205
+ char sample = static_cast<char>(data_[i * num_channel_ + j]);
206
+ fwrite(&sample, 1, sizeof(sample), fp);
207
+ break;
208
+ }
209
+ case 16: {
210
+ int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
211
+ fwrite(&sample, 1, sizeof(sample), fp);
212
+ break;
213
+ }
214
+ case 32: {
215
+ int sample = static_cast<int>(data_[i * num_channel_ + j]);
216
+ fwrite(&sample, 1, sizeof(sample), fp);
217
+ break;
218
+ }
219
+ }
220
+ }
221
+ }
222
+ fclose(fp);
223
+ }
224
+
225
+ private:
226
+ const float* data_;
227
+ int num_samples_; // total float points in data_
228
+ int num_channel_;
229
+ int sample_rate_;
230
+ int bits_per_sample_;
231
+ };
232
+
233
+ } // namespace wenet
234
+
235
+ #endif // FRONTEND_WAV_H_
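
The header fields filled in by `WavWriter` follow the canonical 44-byte PCM layout (RIFF size = 36 + data size, `bytes_per_second = sample_rate * channels * bits / 8`, `block_size = channels * bits / 8`). As a cross-check of that field math, here is a minimal Python sketch; the helper name is my own, not part of this repo:

```python
import struct

def pcm_wav_header(num_samples, num_channels=1, sample_rate=16000, bits_per_sample=16):
    block_size = num_channels * bits_per_sample // 8        # bytes per frame
    data_size = num_samples * block_size                    # raw PCM byte count
    return struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF', 36 + data_size, b'WAVE',                   # RIFF size = total file size - 8
        b'fmt ', 16, 1, num_channels,                       # fmt chunk, format tag 1 = PCM
        sample_rate, sample_rate * block_size,              # bytes per second
        block_size, bits_per_sample,
        b'data', data_size)

assert len(pcm_wav_header(16000)) == 44                     # canonical header length
```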
snakers4_silero-vad_master/examples/go/README.md ADDED
@@ -0,0 +1,19 @@
1
+ ## Golang Example
2
+
3
+ This is a sample program showing how to run speech detection with `silero-vad` from Golang (CGO + ONNX Runtime).
4
+
5
+ ### Requirements
6
+
7
+ - Golang >= v1.21
8
+ - ONNX Runtime
9
+
10
+ ### Usage
11
+
12
+ ```sh
13
+ go run ./cmd/main.go test.wav
14
+ ```
15
+
16
+ > **_Note_**
17
+ >
18
+ > Make sure you have the ONNX Runtime library and C headers installed in your path.
19
+
snakers4_silero-vad_master/examples/go/cmd/main.go ADDED
@@ -0,0 +1,60 @@
1
+ package main
2
+
3
+ import (
4
+ "log"
5
+ "os"
6
+
7
+ "github.com/streamer45/silero-vad-go/speech"
8
+
9
+ "github.com/go-audio/wav"
10
+ )
11
+
12
+ func main() {
13
+ sd, err := speech.NewDetector(speech.DetectorConfig{
14
+ ModelPath: "../../files/silero_vad.onnx",
15
+ SampleRate: 16000,
16
+ WindowSize: 1536,
17
+ Threshold: 0.5,
18
+ MinSilenceDurationMs: 0,
19
+ SpeechPadMs: 0,
20
+ })
21
+ if err != nil {
22
+ log.Fatalf("failed to create speech detector: %s", err)
23
+ }
24
+
25
+ f, err := os.Open(os.Args[1])
26
+ if err != nil {
27
+ log.Fatalf("failed to open sample audio file: %s", err)
28
+ }
29
+ defer f.Close()
30
+
31
+ dec := wav.NewDecoder(f)
32
+
33
+ if ok := dec.IsValidFile(); !ok {
34
+ log.Fatalf("invalid WAV file")
35
+ }
36
+
37
+ buf, err := dec.FullPCMBuffer()
38
+ if err != nil {
39
+ log.Fatalf("failed to get PCM buffer")
40
+ }
41
+
42
+ pcmBuf := buf.AsFloat32Buffer()
43
+
44
+ segments, err := sd.Detect(pcmBuf.Data)
45
+ if err != nil {
46
+ log.Fatalf("Detect failed: %s", err)
47
+ }
48
+
49
+ for _, s := range segments {
50
+ log.Printf("speech starts at %0.2fs", s.SpeechStartAt)
51
+ if s.SpeechEndAt > 0 {
52
+ log.Printf("speech ends at %0.2fs", s.SpeechEndAt)
53
+ }
54
+ }
55
+
56
+ err = sd.Destroy()
57
+ if err != nil {
58
+ log.Fatalf("failed to destroy detector: %s", err)
59
+ }
60
+ }
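
For comparison, a rough Python sketch of the same detect-and-print flow, assuming the torch.hub entry point and utils tuple shown in the notebooks further down (`threshold` plays the role of the Go `Threshold`):

```python
import torch

model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', onnx=True)
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

wav = read_audio('test.wav', sampling_rate=16000)           # mono float32 tensor
segments = get_speech_timestamps(wav, model, threshold=0.5,
                                 sampling_rate=16000, return_seconds=True)
for s in segments:
    print(f"speech from {s['start']}s to {s['end']}s")
```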
snakers4_silero-vad_master/examples/go/go.mod ADDED
@@ -0,0 +1,13 @@
1
+ module silero
2
+
3
+ go 1.21.4
4
+
5
+ require (
6
+ github.com/go-audio/wav v1.1.0
7
+ github.com/streamer45/silero-vad-go v0.1.0
8
+ )
9
+
10
+ require (
11
+ github.com/go-audio/audio v1.0.0 // indirect
12
+ github.com/go-audio/riff v1.0.0 // indirect
13
+ )
snakers4_silero-vad_master/examples/go/go.sum ADDED
@@ -0,0 +1,16 @@
1
+ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
2
+ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
3
+ github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
4
+ github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
5
+ github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
6
+ github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
7
+ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
8
+ github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
9
+ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
10
+ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
11
+ github.com/streamer45/silero-vad-go v0.1.0 h1:0nGZ6VT3LKOkBG/w+4kljIB6brxtgQn6YuNjTVYjOQ4=
12
+ github.com/streamer45/silero-vad-go v0.1.0/go.mod h1:B+2FXs/5fZ6pzl6unUZYhZqkYdOB+3saBVzjOzdZnUs=
13
+ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
14
+ github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
15
+ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
16
+ gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
snakers4_silero-vad_master/examples/java-example/pom.xml ADDED
@@ -0,0 +1,30 @@
1
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3
+ <modelVersion>4.0.0</modelVersion>
4
+
5
+ <groupId>org.example</groupId>
6
+ <artifactId>java-example</artifactId>
7
+ <version>1.0-SNAPSHOT</version>
8
+ <packaging>jar</packaging>
9
+
10
+ <name>sliero-vad-example</name>
11
+ <url>http://maven.apache.org</url>
12
+
13
+ <properties>
14
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15
+ </properties>
16
+
17
+ <dependencies>
18
+ <dependency>
19
+ <groupId>junit</groupId>
20
+ <artifactId>junit</artifactId>
21
+ <version>3.8.1</version>
22
+ <scope>test</scope>
23
+ </dependency>
24
+ <dependency>
25
+ <groupId>com.microsoft.onnxruntime</groupId>
26
+ <artifactId>onnxruntime</artifactId>
27
+ <version>1.16.0-rc1</version>
28
+ </dependency>
29
+ </dependencies>
30
+ </project>
snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/App.java ADDED
@@ -0,0 +1,69 @@
1
+ package org.example;
2
+
3
+ import ai.onnxruntime.OrtException;
4
+ import javax.sound.sampled.*;
5
+ import java.util.Map;
6
+
7
+ public class App {
8
+
9
+ private static final String MODEL_PATH = "src/main/resources/silero_vad.onnx";
10
+ private static final int SAMPLE_RATE = 16000;
11
+ private static final float START_THRESHOLD = 0.6f;
12
+ private static final float END_THRESHOLD = 0.45f;
13
+ private static final int MIN_SILENCE_DURATION_MS = 600;
14
+ private static final int SPEECH_PAD_MS = 500;
15
+ private static final int WINDOW_SIZE_SAMPLES = 2048;
16
+
17
+ public static void main(String[] args) {
18
+ // Initialize the Voice Activity Detector
19
+ SlieroVadDetector vadDetector;
20
+ try {
21
+ vadDetector = new SlieroVadDetector(MODEL_PATH, START_THRESHOLD, END_THRESHOLD, SAMPLE_RATE, MIN_SILENCE_DURATION_MS, SPEECH_PAD_MS);
22
+ } catch (OrtException e) {
23
+ System.err.println("Error initializing the VAD detector: " + e.getMessage());
24
+ return;
25
+ }
26
+
27
+ // Set audio format
28
+ AudioFormat format = new AudioFormat(SAMPLE_RATE, 16, 1, true, false);
29
+ DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
30
+
31
+ // Get the target data line and open it with the specified format
32
+ TargetDataLine targetDataLine;
33
+ try {
34
+ targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
35
+ targetDataLine.open(format);
36
+ targetDataLine.start();
37
+ } catch (LineUnavailableException e) {
38
+ System.err.println("Error opening target data line: " + e.getMessage());
39
+ return;
40
+ }
41
+
42
+ // Main loop to continuously read data and apply Voice Activity Detection
43
+ while (targetDataLine.isOpen()) {
44
+ byte[] data = new byte[WINDOW_SIZE_SAMPLES];
45
+
46
+ int numBytesRead = targetDataLine.read(data, 0, data.length);
47
+ if (numBytesRead <= 0) {
48
+ System.err.println("Error reading data from target data line.");
49
+ continue;
50
+ }
51
+
52
+ // Apply the Voice Activity Detector to the data and get the result
53
+ Map<String, Double> detectResult;
54
+ try {
55
+ detectResult = vadDetector.apply(data, true);
56
+ } catch (Exception e) {
57
+ System.err.println("Error applying VAD detector: " + e.getMessage());
58
+ continue;
59
+ }
60
+
61
+ if (!detectResult.isEmpty()) {
62
+ System.out.println(detectResult);
63
+ }
64
+ }
65
+
66
+ // Close the target data line to release audio resources
67
+ targetDataLine.close();
68
+ }
69
+ }
snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/SlieroVadDetector.java ADDED
@@ -0,0 +1,145 @@
1
+ package org.example;
2
+
3
+ import ai.onnxruntime.OrtException;
4
+
5
+ import java.math.BigDecimal;
6
+ import java.math.RoundingMode;
7
+ import java.util.Collections;
8
+ import java.util.HashMap;
9
+ import java.util.Map;
10
+
11
+
12
+ public class SlieroVadDetector {
13
+ // OnnxModel model used for speech processing
14
+ private final SlieroVadOnnxModel model;
15
+ // Threshold for speech start
16
+ private final float startThreshold;
17
+ // Threshold for speech end
18
+ private final float endThreshold;
19
+ // Sampling rate
20
+ private final int samplingRate;
21
+ // Minimum number of silence samples to determine the end threshold of speech
22
+ private final float minSilenceSamples;
23
+ // Additional number of samples for speech start or end to calculate speech start or end time
24
+ private final float speechPadSamples;
25
+ // Whether in the triggered state (i.e. whether speech is being detected)
26
+ private boolean triggered;
27
+ // Temporarily stored number of speech end samples
28
+ private int tempEnd;
29
+ // Number of samples currently being processed
30
+ private int currentSample;
31
+
32
+
33
+ public SlieroVadDetector(String modelPath,
34
+ float startThreshold,
35
+ float endThreshold,
36
+ int samplingRate,
37
+ int minSilenceDurationMs,
38
+ int speechPadMs) throws OrtException {
39
+ // Check if the sampling rate is 8000 or 16000, if not, throw an exception
40
+ if (samplingRate != 8000 && samplingRate != 16000) {
41
+ throw new IllegalArgumentException("does not support sampling rates other than [8000, 16000]");
42
+ }
43
+
44
+ // Initialize the parameters
45
+ this.model = new SlieroVadOnnxModel(modelPath);
46
+ this.startThreshold = startThreshold;
47
+ this.endThreshold = endThreshold;
48
+ this.samplingRate = samplingRate;
49
+ this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000f;
50
+ this.speechPadSamples = samplingRate * speechPadMs / 1000f;
51
+ // Reset the state
52
+ reset();
53
+ }
54
+
55
+ // Method to reset the state, including the model state, trigger state, temporary end time, and current sample count
56
+ public void reset() {
57
+ model.resetStates();
58
+ triggered = false;
59
+ tempEnd = 0;
60
+ currentSample = 0;
61
+ }
62
+
63
+ // apply method for processing the audio array, returning possible speech start or end times
64
+ public Map<String, Double> apply(byte[] data, boolean returnSeconds) {
65
+
66
+ // Convert the byte array to a float array
67
+ float[] audioData = new float[data.length / 2];
68
+ for (int i = 0; i < audioData.length; i++) {
69
+ audioData[i] = ((data[i * 2] & 0xff) | (data[i * 2 + 1] << 8)) / 32767.0f;
70
+ }
71
+
72
+ // Get the length of the audio array as the window size
73
+ int windowSizeSamples = audioData.length;
74
+ // Update the current sample count
75
+ currentSample += windowSizeSamples;
76
+
77
+ // Call the model to get the prediction probability of speech
78
+ float speechProb = 0;
79
+ try {
80
+ speechProb = model.call(new float[][]{audioData}, samplingRate)[0];
81
+ } catch (OrtException e) {
82
+ throw new RuntimeException(e);
83
+ }
84
+
85
+ // If the speech probability is greater than the threshold and the temporary end time is not 0, reset the temporary end time
86
+ // This indicates that the speech duration has exceeded expectations and needs to recalculate the end time
87
+ if (speechProb >= startThreshold && tempEnd != 0) {
88
+ tempEnd = 0;
89
+ }
90
+
91
+ // If the speech probability is greater than the threshold and not in the triggered state, set to triggered state and calculate the speech start time
92
+ if (speechProb >= startThreshold && !triggered) {
93
+ triggered = true;
94
+ int speechStart = (int) (currentSample - speechPadSamples);
95
+ speechStart = Math.max(speechStart, 0);
96
+ Map<String, Double> result = new HashMap<>();
97
+ // Decide whether to return the result in seconds or sample count based on the returnSeconds parameter
98
+ if (returnSeconds) {
99
+ double speechStartSeconds = speechStart / (double) samplingRate;
100
+ double roundedSpeechStart = BigDecimal.valueOf(speechStartSeconds).setScale(1, RoundingMode.HALF_UP).doubleValue();
101
+ result.put("start", roundedSpeechStart);
102
+ } else {
103
+ result.put("start", (double) speechStart);
104
+ }
105
+
106
+ return result;
107
+ }
108
+
109
+ // If the speech probability is less than a certain threshold and in the triggered state, calculate the speech end time
110
+ if (speechProb < endThreshold && triggered) {
111
+ // Initialize or update the temporary end time
112
+ if (tempEnd == 0) {
113
+ tempEnd = currentSample;
114
+ }
115
+ // If the number of silence samples between the current sample and the temporary end time is less than the minimum silence samples, return null
116
+ // This indicates that it is not yet possible to determine whether the speech has ended
117
+ if (currentSample - tempEnd < minSilenceSamples) {
118
+ return Collections.emptyMap();
119
+ } else {
120
+ // Calculate the speech end time, reset the trigger state and temporary end time
121
+ int speechEnd = (int) (tempEnd + speechPadSamples);
122
+ tempEnd = 0;
123
+ triggered = false;
124
+ Map<String, Double> result = new HashMap<>();
125
+
126
+ if (returnSeconds) {
127
+ double speechEndSeconds = speechEnd / (double) samplingRate;
128
+ double roundedSpeechEnd = BigDecimal.valueOf(speechEndSeconds).setScale(1, RoundingMode.HALF_UP).doubleValue();
129
+ result.put("end", roundedSpeechEnd);
130
+ } else {
131
+ result.put("end", (double) speechEnd);
132
+ }
133
+ return result;
134
+ }
135
+ }
136
+
137
+ // If the above conditions are not met, return null by default
138
+ return Collections.emptyMap();
139
+ }
140
+
141
+ public void close() throws OrtException {
142
+ reset();
143
+ model.close();
144
+ }
145
+ }
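
Note that `App.java` sizes its read buffer in bytes, so `WINDOW_SIZE_SAMPLES = 2048` bytes yield 1024 16-bit samples after the conversion at the top of `apply`. A numpy sketch of that same little-endian PCM16-to-float step (the helper name is hypothetical):

```python
import numpy as np

def pcm16_bytes_to_float(data: bytes) -> np.ndarray:
    samples = np.frombuffer(data, dtype='<i2')    # little-endian int16, as in apply()
    return samples.astype(np.float32) / 32767.0   # same divisor as the Java loop

print(pcm16_bytes_to_float(b'\x00\x00\xff\x7f\x01\x80'))  # -> [ 0.  1. -1.]
```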
snakers4_silero-vad_master/examples/java-example/src/main/java/org/example/SlieroVadOnnxModel.java ADDED
@@ -0,0 +1,180 @@
1
+ package org.example;
2
+
3
+ import ai.onnxruntime.OnnxTensor;
4
+ import ai.onnxruntime.OrtEnvironment;
5
+ import ai.onnxruntime.OrtException;
6
+ import ai.onnxruntime.OrtSession;
7
+ import java.util.Arrays;
8
+ import java.util.HashMap;
9
+ import java.util.List;
10
+ import java.util.Map;
11
+
12
+ public class SlieroVadOnnxModel {
13
+ // Define private variable OrtSession
14
+ private final OrtSession session;
15
+ private float[][][] h;
16
+ private float[][][] c;
17
+ // Define the last sample rate
18
+ private int lastSr = 0;
19
+ // Define the last batch size
20
+ private int lastBatchSize = 0;
21
+ // Define a list of supported sample rates
22
+ private static final List<Integer> SAMPLE_RATES = Arrays.asList(8000, 16000);
23
+
24
+ // Constructor
25
+ public SlieroVadOnnxModel(String modelPath) throws OrtException {
26
+ // Get the ONNX runtime environment
27
+ OrtEnvironment env = OrtEnvironment.getEnvironment();
28
+ // Create an ONNX session options object
29
+ OrtSession.SessionOptions opts = new OrtSession.SessionOptions();
30
+ // Set the InterOp thread count to 1, InterOp threads are used for parallel processing of different computation graph operations
31
+ opts.setInterOpNumThreads(1);
32
+ // Set the IntraOp thread count to 1, IntraOp threads are used for parallel processing within a single operation
33
+ opts.setIntraOpNumThreads(1);
34
+ // Add a CPU device, setting to false disables CPU execution optimization
35
+ opts.addCPU(true);
36
+ // Create an ONNX session using the environment, model path, and options
37
+ session = env.createSession(modelPath, opts);
38
+ // Reset states
39
+ resetStates();
40
+ }
41
+
42
+ /**
43
+ * Reset states
44
+ */
45
+ void resetStates() {
46
+ h = new float[2][1][64];
47
+ c = new float[2][1][64];
48
+ lastSr = 0;
49
+ lastBatchSize = 0;
50
+ }
51
+
52
+ public void close() throws OrtException {
53
+ session.close();
54
+ }
55
+
56
+ /**
57
+ * Define inner class ValidationResult
58
+ */
59
+ public static class ValidationResult {
60
+ public final float[][] x;
61
+ public final int sr;
62
+
63
+ // Constructor
64
+ public ValidationResult(float[][] x, int sr) {
65
+ this.x = x;
66
+ this.sr = sr;
67
+ }
68
+ }
69
+
70
+ /**
71
+ * Function to validate input data
72
+ */
73
+ private ValidationResult validateInput(float[][] x, int sr) {
74
+ // Process the input data with dimension 1
75
+ if (x.length == 1) {
76
+ x = new float[][]{x[0]};
77
+ }
78
+ // Throw an exception when the input data dimension is greater than 2
79
+ if (x.length > 2) {
80
+ throw new IllegalArgumentException("Incorrect audio data dimension: " + x[0].length);
81
+ }
82
+
83
+ // Process the input data when the sample rate is not equal to 16000 and is a multiple of 16000
84
+ if (sr != 16000 && (sr % 16000 == 0)) {
85
+ int step = sr / 16000;
86
+ float[][] reducedX = new float[x.length][];
87
+
88
+ for (int i = 0; i < x.length; i++) {
89
+ float[] current = x[i];
90
+ float[] newArr = new float[(current.length + step - 1) / step];
91
+
92
+ for (int j = 0, index = 0; j < current.length; j += step, index++) {
93
+ newArr[index] = current[j];
94
+ }
95
+
96
+ reducedX[i] = newArr;
97
+ }
98
+
99
+ x = reducedX;
100
+ sr = 16000;
101
+ }
102
+
103
+ // If the sample rate is not in the list of supported sample rates, throw an exception
104
+ if (!SAMPLE_RATES.contains(sr)) {
105
+ throw new IllegalArgumentException("Only supports sample rates " + SAMPLE_RATES + " (or multiples of 16000)");
106
+ }
107
+
108
+ // If the input audio block is too short, throw an exception
109
+ if (((float) sr) / x[0].length > 31.25) {
110
+ throw new IllegalArgumentException("Input audio is too short");
111
+ }
112
+
113
+ // Return the validated result
114
+ return new ValidationResult(x, sr);
115
+ }
116
+
117
+ /**
118
+ * Method to call the ONNX model
119
+ */
120
+ public float[] call(float[][] x, int sr) throws OrtException {
121
+ ValidationResult result = validateInput(x, sr);
122
+ x = result.x;
123
+ sr = result.sr;
124
+
125
+ int batchSize = x.length;
126
+
127
+ if (lastBatchSize == 0 || lastSr != sr || lastBatchSize != batchSize) {
128
+ resetStates();
129
+ }
130
+
131
+ OrtEnvironment env = OrtEnvironment.getEnvironment();
132
+
133
+ OnnxTensor inputTensor = null;
134
+ OnnxTensor hTensor = null;
135
+ OnnxTensor cTensor = null;
136
+ OnnxTensor srTensor = null;
137
+ OrtSession.Result ortOutputs = null;
138
+
139
+ try {
140
+ // Create input tensors
141
+ inputTensor = OnnxTensor.createTensor(env, x);
142
+ hTensor = OnnxTensor.createTensor(env, h);
143
+ cTensor = OnnxTensor.createTensor(env, c);
144
+ srTensor = OnnxTensor.createTensor(env, new long[]{sr});
145
+
146
+ Map<String, OnnxTensor> inputs = new HashMap<>();
147
+ inputs.put("input", inputTensor);
148
+ inputs.put("sr", srTensor);
149
+ inputs.put("h", hTensor);
150
+ inputs.put("c", cTensor);
151
+
152
+ // Call the ONNX model for calculation
153
+ ortOutputs = session.run(inputs);
154
+ // Get the output results
155
+ float[][] output = (float[][]) ortOutputs.get(0).getValue();
156
+ h = (float[][][]) ortOutputs.get(1).getValue();
157
+ c = (float[][][]) ortOutputs.get(2).getValue();
158
+
159
+ lastSr = sr;
160
+ lastBatchSize = batchSize;
161
+ return output[0];
162
+ } finally {
163
+ if (inputTensor != null) {
164
+ inputTensor.close();
165
+ }
166
+ if (hTensor != null) {
167
+ hTensor.close();
168
+ }
169
+ if (cTensor != null) {
170
+ cTensor.close();
171
+ }
172
+ if (srTensor != null) {
173
+ srTensor.close();
174
+ }
175
+ if (ortOutputs != null) {
176
+ ortOutputs.close();
177
+ }
178
+ }
179
+ }
180
+ }
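
The wrapper above feeds the ONNX graph four named inputs ("input", "sr", "h", "c") and carries the 2x1x64 recurrent states between calls. A hedged Python sketch of the same contract with onnxruntime; the model path and window size are assumptions based on this upload:

```python
import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession('files/silero_vad.onnx')
h = np.zeros((2, 1, 64), dtype=np.float32)     # recurrent states, reset per stream
c = np.zeros((2, 1, 64), dtype=np.float32)
chunk = np.zeros((1, 1536), dtype=np.float32)  # one window of 16 kHz audio

prob, h, c = sess.run(None, {'input': chunk,
                             'sr': np.array([16000], dtype=np.int64),
                             'h': h, 'c': c})
print(float(prob[0][0]))                       # speech probability for this window
```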
snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/README.md ADDED
@@ -0,0 +1,28 @@
1
+
2
+ In this example, the microphone input has been integrated with the webRTC VAD. I used [this](https://github.com/mozilla/DeepSpeech-examples/tree/r0.8/mic_vad_streaming) as a starting point.
3
+ Here is a short video presenting the results:
4
+
5
+ https://user-images.githubusercontent.com/28188499/116685087-182ff100-a9b2-11eb-927d-ed9f621226ee.mp4
6
+
7
+ # Requirements:
8
+ The libraries used for the following example are:
9
+ ```
10
+ Python == 3.6.9
11
+ webrtcvad >= 2.0.10
12
+ torchaudio >= 0.8.1
13
+ torch >= 1.8.1
14
+ halo >= 0.0.31
15
+ Soundfile >= 0.13.3
16
+ ```
17
+ Using pip3:
18
+ ```
19
+ pip3 install webrtcvad
20
+ pip3 install torchaudio
21
+ pip3 install torch
22
+ pip3 install halo
23
+ pip3 install soundfile
24
+ ```
25
+ Moreover, to keep the code simple, the default sample_rate is 16 kHz and no resampling is performed.
26
+
27
+ This example has been tested on ```ubuntu 18.04.3 LTS```.
28
+
snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/microphone_and_webRTC_integration.py ADDED
@@ -0,0 +1,201 @@
1
+ import collections, queue
2
+ import numpy as np
3
+ import pyaudio
4
+ import webrtcvad
5
+ from halo import Halo
6
+ import torch
7
+ import torchaudio
8
+
9
+ class Audio(object):
10
+ """Streams raw audio from microphone. Data is received in a separate thread, and stored in a buffer, to be read from."""
11
+
12
+ FORMAT = pyaudio.paInt16
13
+ # Network/VAD rate-space
14
+ RATE_PROCESS = 16000
15
+ CHANNELS = 1
16
+ BLOCKS_PER_SECOND = 50
17
+
18
+ def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS):
19
+ def proxy_callback(in_data, frame_count, time_info, status):
20
+ #pylint: disable=unused-argument
21
+ callback(in_data)
22
+ return (None, pyaudio.paContinue)
23
+ if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data)
24
+ self.buffer_queue = queue.Queue()
25
+ self.device = device
26
+ self.input_rate = input_rate
27
+ self.sample_rate = self.RATE_PROCESS
28
+ self.block_size = int(self.RATE_PROCESS / float(self.BLOCKS_PER_SECOND))
29
+ self.block_size_input = int(self.input_rate / float(self.BLOCKS_PER_SECOND))
30
+ self.pa = pyaudio.PyAudio()
31
+
32
+ kwargs = {
33
+ 'format': self.FORMAT,
34
+ 'channels': self.CHANNELS,
35
+ 'rate': self.input_rate,
36
+ 'input': True,
37
+ 'frames_per_buffer': self.block_size_input,
38
+ 'stream_callback': proxy_callback,
39
+ }
40
+
41
+ self.chunk = None
42
+ # if not default device
43
+ if self.device:
44
+ kwargs['input_device_index'] = self.device
45
+
46
+ self.stream = self.pa.open(**kwargs)
47
+ self.stream.start_stream()
48
+
49
+ def read(self):
50
+ """Return a block of audio data, blocking if necessary."""
51
+ return self.buffer_queue.get()
52
+
53
+ def destroy(self):
54
+ self.stream.stop_stream()
55
+ self.stream.close()
56
+ self.pa.terminate()
57
+
58
+ frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate)
59
+
60
+
61
+ class VADAudio(Audio):
62
+ """Filter & segment audio with voice activity detection."""
63
+
64
+ def __init__(self, aggressiveness=3, device=None, input_rate=None):
65
+ super().__init__(device=device, input_rate=input_rate)
66
+ self.vad = webrtcvad.Vad(aggressiveness)
67
+
68
+ def frame_generator(self):
69
+ """Generator that yields all audio frames from microphone."""
70
+ if self.input_rate == self.RATE_PROCESS:
71
+ while True:
72
+ yield self.read()
73
+ else:
74
+ raise Exception("Resampling required")
75
+
76
+ def vad_collector(self, padding_ms=300, ratio=0.75, frames=None):
77
+ """Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None.
78
+ Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
79
+ Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
80
+ |---utterance---| |---utterance---|
81
+ """
82
+ if frames is None: frames = self.frame_generator()
83
+ num_padding_frames = padding_ms // self.frame_duration_ms
84
+ ring_buffer = collections.deque(maxlen=num_padding_frames)
85
+ triggered = False
86
+
87
+ for frame in frames:
88
+ if len(frame) < 640:
89
+ return
90
+
91
+ is_speech = self.vad.is_speech(frame, self.sample_rate)
92
+
93
+ if not triggered:
94
+ ring_buffer.append((frame, is_speech))
95
+ num_voiced = len([f for f, speech in ring_buffer if speech])
96
+ if num_voiced > ratio * ring_buffer.maxlen:
97
+ triggered = True
98
+ for f, s in ring_buffer:
99
+ yield f
100
+ ring_buffer.clear()
101
+
102
+ else:
103
+ yield frame
104
+ ring_buffer.append((frame, is_speech))
105
+ num_unvoiced = len([f for f, speech in ring_buffer if not speech])
106
+ if num_unvoiced > ratio * ring_buffer.maxlen:
107
+ triggered = False
108
+ yield None
109
+ ring_buffer.clear()
110
+
111
+ def main(ARGS):
112
+ # Start audio with VAD
113
+ vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness,
114
+ device=ARGS.device,
115
+ input_rate=ARGS.rate)
116
+
117
+ print("Listening (ctrl-C to exit)...")
118
+ frames = vad_audio.vad_collector()
119
+
120
+ # load silero VAD
121
+ torchaudio.set_audio_backend("soundfile")
122
+ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
123
+ model=ARGS.silaro_model_name,
124
+ force_reload= ARGS.reload)
125
+ (get_speech_ts,_,_, _,_, _, _) = utils
126
+
127
+
128
+ # Stream from microphone to DeepSpeech using VAD
129
+ spinner = None
130
+ if not ARGS.nospinner:
131
+ spinner = Halo(spinner='line')
132
+ wav_data = bytearray()
133
+ for frame in frames:
134
+ if frame is not None:
135
+ if spinner: spinner.start()
136
+
137
+ wav_data.extend(frame)
138
+ else:
139
+ if spinner: spinner.stop()
140
+ print("webRTC has detected a possible speech")
141
+
142
+ newsound= np.frombuffer(wav_data,np.int16)
143
+ audio_float32=Int2Float(newsound)
144
+ time_stamps =get_speech_ts(audio_float32, model,num_steps=ARGS.num_steps,trig_sum=ARGS.trig_sum,neg_trig_sum=ARGS.neg_trig_sum,
145
+ num_samples_per_window=ARGS.num_samples_per_window,min_speech_samples=ARGS.min_speech_samples,
146
+ min_silence_samples=ARGS.min_silence_samples)
147
+
148
+ if(len(time_stamps)>0):
149
+ print("silero VAD has detected a possible speech")
150
+ else:
151
+ print("silero VAD has detected a noise")
152
+ print()
153
+ wav_data = bytearray()
154
+
155
+
156
+ def Int2Float(sound):
157
+ _sound = np.copy(sound)  # avoid mutating the caller's buffer
158
+ abs_max = np.abs(_sound).max()
159
+ _sound = _sound.astype('float32')
160
+ if abs_max > 0:
161
+ _sound *= 1/abs_max
162
+ audio_float32 = torch.from_numpy(_sound.squeeze())
163
+ return audio_float32
164
+
165
+ if __name__ == '__main__':
166
+ DEFAULT_SAMPLE_RATE = 16000
167
+
168
+ import argparse
169
+ parser = argparse.ArgumentParser(description="Stream from microphone to webRTC and silero VAD")
170
+
171
+ parser.add_argument('-v', '--webRTC_aggressiveness', type=int, default=3,
172
+ help="Set aggressiveness of webRTC: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3")
173
+ parser.add_argument('--nospinner', action='store_true',
174
+ help="Disable spinner")
175
+ parser.add_argument('-d', '--device', type=int, default=None,
176
+ help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). If not provided, falls back to PyAudio.get_default_device().")
177
+
178
+ parser.add_argument('-name', '--silaro_model_name', type=str, default="silero_vad",
179
+ help="select the name of the model. You can select between 'silero_vad',''silero_vad_micro','silero_vad_micro_8k','silero_vad_mini','silero_vad_mini_8k'")
180
+ parser.add_argument('--reload', action='store_true',help="download the last version of the silero vad")
181
+
182
+ parser.add_argument('-ts', '--trig_sum', type=float, default=0.25,
183
+ help="overlapping windows are used for each audio chunk, trig sum defines average probability among those windows for switching into triggered state (speech state)")
184
+
185
+ parser.add_argument('-nts', '--neg_trig_sum', type=float, default=0.07,
186
+ help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)")
187
+
188
+ parser.add_argument('-N', '--num_steps', type=int, default=8,
189
+ help="nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8)")
190
+
191
+ parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000,
192
+ help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)")
193
+
194
+ parser.add_argument('-msps', '--min_speech_samples', type=int, default=10000,
195
+ help="minimum speech chunk duration in samples")
196
+
197
+ parser.add_argument('-msis', '--min_silence_samples', type=int, default=500,
198
+ help=" minimum silence duration in samples between to separate speech chunks")
199
+ ARGS = parser.parse_args()
200
+ ARGS.rate=DEFAULT_SAMPLE_RATE
201
+ main(ARGS)
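
The heart of `vad_collector` above is the ratio vote over a fixed-length ring buffer. A toy illustration of that mechanism, with booleans standing in for the per-frame webrtcvad decisions (not part of the script itself):

```python
from collections import deque

def collect(flags, maxlen=5, ratio=0.75):
    ring, triggered, events = deque(maxlen=maxlen), False, []
    for i, is_speech in enumerate(flags):
        ring.append(is_speech)
        voiced = sum(ring)
        if not triggered and voiced > ratio * ring.maxlen:
            triggered = True                 # enough voiced frames: utterance starts
            events.append(('start', i))
        elif triggered and (len(ring) - voiced) > ratio * ring.maxlen:
            triggered = False                # enough unvoiced frames: utterance ends
            events.append(('end', i))
    return events

print(collect([0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]))  # -> [('start', 5), ('end', 10)]
```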
snakers4_silero-vad_master/examples/parallel_example.ipynb ADDED
@@ -0,0 +1,149 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Install Dependencies"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "# !pip install -q torchaudio\n",
18
+ "SAMPLING_RATE = 16000\n",
19
+ "import torch\n",
20
+ "from pprint import pprint\n",
21
+ "\n",
22
+ "torch.set_num_threads(1)\n",
23
+ "NUM_PROCESS=4 # set to the number of CPU cores in the machine\n",
24
+ "NUM_COPIES=8\n",
25
+ "# download wav files, make multiple copies\n",
26
+ "for idx in range(NUM_COPIES):\n",
27
+ " torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example{idx}.wav\")\n"
28
+ ]
29
+ },
30
+ {
31
+ "attachments": {},
32
+ "cell_type": "markdown",
33
+ "metadata": {},
34
+ "source": [
35
+ "## Load VAD model from torch hub"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
45
+ " model='silero_vad',\n",
46
+ " force_reload=True,\n",
47
+ " onnx=False)\n",
48
+ "\n",
49
+ "(get_speech_timestamps,\n",
50
+ "save_audio,\n",
51
+ "read_audio,\n",
52
+ "VADIterator,\n",
53
+ "collect_chunks) = utils"
54
+ ]
55
+ },
56
+ {
57
+ "attachments": {},
58
+ "cell_type": "markdown",
59
+ "metadata": {},
60
+ "source": [
61
+ "## Define a vad process function"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "import multiprocessing\n",
71
+ "\n",
72
+ "vad_models = dict()\n",
73
+ "\n",
74
+ "def init_model(model):\n",
75
+ " pid = multiprocessing.current_process().pid\n",
76
+ " model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
77
+ " model='silero_vad',\n",
78
+ " force_reload=False,\n",
79
+ " onnx=False)\n",
80
+ " vad_models[pid] = model\n",
81
+ "\n",
82
+ "def vad_process(audio_file: str):\n",
83
+ " \n",
84
+ " pid = multiprocessing.current_process().pid\n",
85
+ " \n",
86
+ " with torch.no_grad():\n",
87
+ " wav = read_audio(audio_file, sampling_rate=SAMPLING_RATE)\n",
88
+ " return get_speech_timestamps(\n",
89
+ " wav,\n",
90
+ " vad_models[pid],\n",
91
+ " 0.46, # speech prob threshold\n",
92
+ " 16000, # sample rate\n",
93
+ " 300, # min speech duration in ms\n",
94
+ " 20, # max speech duration in seconds\n",
95
+ " 600, # min silence duration\n",
96
+ " 512, # window size\n",
97
+ " 200, # spech pad ms\n",
98
+ " )"
99
+ ]
100
+ },
101
+ {
102
+ "attachments": {},
103
+ "cell_type": "markdown",
104
+ "metadata": {},
105
+ "source": [
106
+ "## Parallelization"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
116
+ "\n",
117
+ "futures = []\n",
118
+ "\n",
119
+ "with ProcessPoolExecutor(max_workers=NUM_PROCESS, initializer=init_model, initargs=(model,)) as ex:\n",
120
+ " for i in range(NUM_COPIES):\n",
121
+ " futures.append(ex.submit(vad_process, f\"en_example{idx}.wav\"))\n",
122
+ "\n",
123
+ "for finished in as_completed(futures):\n",
124
+ " pprint(finished.result())"
125
+ ]
126
+ }
127
+ ],
128
+ "metadata": {
129
+ "kernelspec": {
130
+ "display_name": "diarization",
131
+ "language": "python",
132
+ "name": "python3"
133
+ },
134
+ "language_info": {
135
+ "codemirror_mode": {
136
+ "name": "ipython",
137
+ "version": 3
138
+ },
139
+ "file_extension": ".py",
140
+ "mimetype": "text/x-python",
141
+ "name": "python",
142
+ "nbconvert_exporter": "python",
143
+ "pygments_lexer": "ipython3",
144
+ "version": "3.9.15"
145
+ }
146
+ },
147
+ "nbformat": 4,
148
+ "nbformat_minor": 2
149
+ }
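
As a readability aid (not part of the notebook), the positional arguments passed inside `vad_process` correspond to these keyword parameters of `get_speech_timestamps` in `utils_vad.py`; a sketch reusing the notebook's own names:

```python
# Equivalent keyword-argument form of the call inside vad_process
# (relies on read_audio, vad_models and SAMPLING_RATE defined in the notebook).
import multiprocessing

def vad_process_kw(audio_file: str):
    pid = multiprocessing.current_process().pid
    wav = read_audio(audio_file, sampling_rate=SAMPLING_RATE)
    return get_speech_timestamps(wav, vad_models[pid],
                                 threshold=0.46,
                                 sampling_rate=16000,
                                 min_speech_duration_ms=300,
                                 max_speech_duration_s=20,
                                 min_silence_duration_ms=600,
                                 window_size_samples=512,
                                 speech_pad_ms=200)
```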
snakers4_silero-vad_master/examples/pyaudio-streaming/README.md ADDED
@@ -0,0 +1,20 @@
1
+ # Pyaudio Streaming Example
2
+
3
+ This example notebook shows how microphone audio fetched by pyaudio can be processed with Silero-VAD.
4
+
5
+ It has been designed as a low-level example for binary real-time streaming using only the prediction of the model, processing the binary data and plotting the speech probabilities at the end to visualize it.
6
+
7
+ Currently, the notebook consists of two examples:
8
+ - One that records audio of a predefined length from the microphone, processes it with Silero-VAD, and plots the speech probabilities afterwards.
9
+ - The other one plots the speech probabilities in real-time (using jupyterplot) and records the audio until you press enter.
10
+
11
+ ## Example Video for the Real-Time Visualization
12
+
13
+
14
+ https://user-images.githubusercontent.com/8079748/117580455-4622dd00-b0f8-11eb-858d-e6368ed4eada.mp4
15
+
16
+
17
+
18
+
19
+
20
+
snakers4_silero-vad_master/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb ADDED
@@ -0,0 +1,331 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "62a0cccb",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Pyaudio Microphone Streaming Examples\n",
9
+ "\n",
10
+ "A simple notebook that uses pyaudio to get the microphone audio and feeds this audio then to Silero VAD.\n",
11
+ "\n",
12
+ "I created it as an example on how binary data from a stream could be feed into Silero VAD.\n",
13
+ "\n",
14
+ "\n",
15
+ "Has been tested on Ubuntu 21.04 (x86). After you installed the dependencies below, no additional setup is required."
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "markdown",
20
+ "id": "64cbe1eb",
21
+ "metadata": {},
22
+ "source": [
23
+ "## Dependencies\n",
24
+ "The cell below lists all used dependencies and the used versions. Uncomment to install them from within the notebook."
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": null,
30
+ "id": "57bc2aac",
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "#!pip install numpy==1.20.2\n",
35
+ "#!pip install torch==1.9.0\n",
36
+ "#!pip install matplotlib==3.4.2\n",
37
+ "#!pip install torchaudio==0.9.0\n",
38
+ "#!pip install soundfile==0.10.3.post1\n",
39
+ "#!pip install pyaudio==0.2.11"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "markdown",
44
+ "id": "110de761",
45
+ "metadata": {},
46
+ "source": [
47
+ "## Imports"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "id": "5a647d8d",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "import io\n",
58
+ "import numpy as np\n",
59
+ "import torch\n",
60
+ "torch.set_num_threads(1)\n",
61
+ "import torchaudio\n",
62
+ "import matplotlib\n",
63
+ "import matplotlib.pylab as plt\n",
64
+ "torchaudio.set_audio_backend(\"soundfile\")\n",
65
+ "import pyaudio"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": null,
71
+ "id": "725d7066",
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
76
+ " model='silero_vad',\n",
77
+ " force_reload=True)"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "id": "1c0b2ea7",
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "(get_speech_timestamps,\n",
88
+ " save_audio,\n",
89
+ " read_audio,\n",
90
+ " VADIterator,\n",
91
+ " collect_chunks) = utils"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "markdown",
96
+ "id": "f9112603",
97
+ "metadata": {},
98
+ "source": [
99
+ "### Helper Methods"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": null,
105
+ "id": "5abc6330",
106
+ "metadata": {},
107
+ "outputs": [],
108
+ "source": [
109
+ "# Taken from utils_vad.py\n",
110
+ "def validate(model,\n",
111
+ " inputs: torch.Tensor):\n",
112
+ " with torch.no_grad():\n",
113
+ " outs = model(inputs)\n",
114
+ " return outs\n",
115
+ "\n",
116
+ "# Provided by Alexander Veysov\n",
117
+ "def int2float(sound):\n",
118
+ " abs_max = np.abs(sound).max()\n",
119
+ " sound = sound.astype('float32')\n",
120
+ " if abs_max > 0:\n",
121
+ " sound *= 1/32768\n",
122
+ " sound = sound.squeeze() # depends on the use case\n",
123
+ " return sound"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "markdown",
128
+ "id": "5124095e",
129
+ "metadata": {},
130
+ "source": [
131
+ "## Pyaudio Set-up"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "id": "a845356e",
138
+ "metadata": {},
139
+ "outputs": [],
140
+ "source": [
141
+ "FORMAT = pyaudio.paInt16\n",
142
+ "CHANNELS = 1\n",
143
+ "SAMPLE_RATE = 16000\n",
144
+ "CHUNK = int(SAMPLE_RATE / 10)\n",
145
+ "\n",
146
+ "audio = pyaudio.PyAudio()"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "markdown",
151
+ "id": "0b910c99",
152
+ "metadata": {},
153
+ "source": [
154
+ "## Simple Example\n",
155
+ "The following example reads the audio as 250ms chunks from the microphone, converts them to a Pytorch Tensor, and gets the probabilities/confidences if the model thinks the frame is voiced."
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": null,
161
+ "id": "9d3d2c10",
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": [
165
+ "num_samples = 1536"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "3cb44a4a",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "stream = audio.open(format=FORMAT,\n",
176
+ " channels=CHANNELS,\n",
177
+ " rate=SAMPLE_RATE,\n",
178
+ " input=True,\n",
179
+ " frames_per_buffer=CHUNK)\n",
180
+ "data = []\n",
181
+ "voiced_confidences = []\n",
182
+ "\n",
183
+ "print(\"Started Recording\")\n",
184
+ "for i in range(0, frames_to_record):\n",
185
+ " \n",
186
+ " audio_chunk = stream.read(num_samples)\n",
187
+ " \n",
188
+ " # in case you want to save the audio later\n",
189
+ " data.append(audio_chunk)\n",
190
+ " \n",
191
+ " audio_int16 = np.frombuffer(audio_chunk, np.int16);\n",
192
+ "\n",
193
+ " audio_float32 = int2float(audio_int16)\n",
194
+ " \n",
195
+ " # get the confidences and add them to the list to plot them later\n",
196
+ " new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n",
197
+ " voiced_confidences.append(new_confidence)\n",
198
+ " \n",
199
+ "print(\"Stopped the recording\")\n",
200
+ "\n",
201
+ "# plot the confidences for the speech\n",
202
+ "plt.figure(figsize=(20,6))\n",
203
+ "plt.plot(voiced_confidences)\n",
204
+ "plt.show()"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "markdown",
209
+ "id": "a3dda982",
210
+ "metadata": {},
211
+ "source": [
212
+ "## Real Time Visualization\n",
213
+ "\n",
214
+ "As an enhancement to plot the speech probabilities in real time I added the implementation below.\n",
215
+ "In contrast to the simeple one, it records the audio until to stop the recording by pressing enter.\n",
216
+ "While looking into good ways to update matplotlib plots in real-time, I found a simple libarary that does the job. https://github.com/lvwerra/jupyterplot It has some limitations, but works for this use case really well.\n"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "id": "05ef4100",
223
+ "metadata": {},
224
+ "outputs": [],
225
+ "source": [
226
+ "#!pip install jupyterplot==0.0.3"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": null,
232
+ "id": "d1d4cdd6",
233
+ "metadata": {},
234
+ "outputs": [],
235
+ "source": [
236
+ "from jupyterplot import ProgressPlot\n",
237
+ "import threading\n",
238
+ "\n",
239
+ "continue_recording = True\n",
240
+ "\n",
241
+ "def stop():\n",
242
+ " input(\"Press Enter to stop the recording:\")\n",
243
+ " global continue_recording\n",
244
+ " continue_recording = False\n",
245
+ "\n",
246
+ "def start_recording():\n",
247
+ " \n",
248
+ " stream = audio.open(format=FORMAT,\n",
249
+ " channels=CHANNELS,\n",
250
+ " rate=SAMPLE_RATE,\n",
251
+ " input=True,\n",
252
+ " frames_per_buffer=CHUNK)\n",
253
+ "\n",
254
+ " data = []\n",
255
+ " voiced_confidences = []\n",
256
+ " \n",
257
+ " global continue_recording\n",
258
+ " continue_recording = True\n",
259
+ " \n",
260
+ " pp = ProgressPlot(plot_names=[\"Silero VAD\"],line_names=[\"speech probabilities\"], x_label=\"audio chunks\")\n",
261
+ " \n",
262
+ " stop_listener = threading.Thread(target=stop)\n",
263
+ " stop_listener.start()\n",
264
+ "\n",
265
+ " while continue_recording:\n",
266
+ " \n",
267
+ " audio_chunk = stream.read(num_samples)\n",
268
+ " \n",
269
+ " # in case you want to save the audio later\n",
270
+ " data.append(audio_chunk)\n",
271
+ " \n",
272
+ " audio_int16 = np.frombuffer(audio_chunk, np.int16);\n",
273
+ "\n",
274
+ " audio_float32 = int2float(audio_int16)\n",
275
+ " \n",
276
+ " # get the confidences and add them to the list to plot them later\n",
277
+ " new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n",
278
+ " voiced_confidences.append(new_confidence)\n",
279
+ " \n",
280
+ " pp.update(new_confidence)\n",
281
+ "\n",
282
+ "\n",
283
+ " pp.finalize()"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": null,
289
+ "id": "1e398009",
290
+ "metadata": {},
291
+ "outputs": [],
292
+ "source": [
293
+ "start_recording()"
294
+ ]
295
+ }
296
+ ],
297
+ "metadata": {
298
+ "kernelspec": {
299
+ "display_name": "Python 3",
300
+ "language": "python",
301
+ "name": "python3"
302
+ },
303
+ "language_info": {
304
+ "codemirror_mode": {
305
+ "name": "ipython",
306
+ "version": 3
307
+ },
308
+ "file_extension": ".py",
309
+ "mimetype": "text/x-python",
310
+ "name": "python",
311
+ "nbconvert_exporter": "python",
312
+ "pygments_lexer": "ipython3",
313
+ "version": "3.7.10"
314
+ },
315
+ "toc": {
316
+ "base_numbering": 1,
317
+ "nav_menu": {},
318
+ "number_sections": true,
319
+ "sideBar": true,
320
+ "skip_h1_title": false,
321
+ "title_cell": "Table of Contents",
322
+ "title_sidebar": "Contents",
323
+ "toc_cell": false,
324
+ "toc_position": {},
325
+ "toc_section_display": true,
326
+ "toc_window_display": false
327
+ }
328
+ },
329
+ "nbformat": 4,
330
+ "nbformat_minor": 5
331
+ }
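
The utils tuple unpacked in this notebook also includes `VADIterator`, which wraps the same chunk-by-chunk loop and emits start/end events instead of raw probabilities. A hedged sketch reusing the notebook's `model` and `num_samples`; `wav_f32` stands for any float32 mono 16 kHz array and is an assumption:

```python
vad_iterator = VADIterator(model)  # model and VADIterator as loaded above
for i in range(0, len(wav_f32) - num_samples, num_samples):
    event = vad_iterator(wav_f32[i : i + num_samples], return_seconds=True)
    if event:
        print(event)               # e.g. {'start': 1.3} or {'end': 2.1}
vad_iterator.reset_states()        # reset recurrent states between streams
```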
snakers4_silero-vad_master/examples/rust-example/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ target/
2
+ recorder.wav
snakers4_silero-vad_master/examples/rust-example/Cargo.lock ADDED
@@ -0,0 +1,781 @@
1
+ # This file is automatically @generated by Cargo.
2
+ # It is not intended for manual editing.
3
+ version = 3
4
+
5
+ [[package]]
6
+ name = "adler"
7
+ version = "1.0.2"
8
+ source = "registry+https://github.com/rust-lang/crates.io-index"
9
+ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
10
+
11
+ [[package]]
12
+ name = "autocfg"
13
+ version = "1.3.0"
14
+ source = "registry+https://github.com/rust-lang/crates.io-index"
15
+ checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
16
+
17
+ [[package]]
18
+ name = "base64"
19
+ version = "0.22.1"
20
+ source = "registry+https://github.com/rust-lang/crates.io-index"
21
+ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
22
+
23
+ [[package]]
24
+ name = "bitflags"
25
+ version = "1.3.2"
26
+ source = "registry+https://github.com/rust-lang/crates.io-index"
27
+ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
28
+
29
+ [[package]]
30
+ name = "bitflags"
31
+ version = "2.5.0"
32
+ source = "registry+https://github.com/rust-lang/crates.io-index"
33
+ checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1"
34
+
35
+ [[package]]
36
+ name = "block-buffer"
37
+ version = "0.10.4"
38
+ source = "registry+https://github.com/rust-lang/crates.io-index"
39
+ checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
40
+ dependencies = [
41
+ "generic-array",
42
+ ]
43
+
44
+ [[package]]
45
+ name = "bumpalo"
46
+ version = "3.16.0"
47
+ source = "registry+https://github.com/rust-lang/crates.io-index"
48
+ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
49
+
50
+ [[package]]
51
+ name = "cc"
52
+ version = "1.0.98"
53
+ source = "registry+https://github.com/rust-lang/crates.io-index"
54
+ checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f"
55
+
56
+ [[package]]
57
+ name = "cfg-if"
58
+ version = "1.0.0"
59
+ source = "registry+https://github.com/rust-lang/crates.io-index"
60
+ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
61
+
62
+ [[package]]
63
+ name = "cpufeatures"
64
+ version = "0.2.12"
65
+ source = "registry+https://github.com/rust-lang/crates.io-index"
66
+ checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504"
67
+ dependencies = [
68
+ "libc",
69
+ ]
70
+
71
+ [[package]]
72
+ name = "crc32fast"
73
+ version = "1.4.2"
74
+ source = "registry+https://github.com/rust-lang/crates.io-index"
75
+ checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
76
+ dependencies = [
77
+ "cfg-if",
78
+ ]
79
+
80
+ [[package]]
81
+ name = "crunchy"
82
+ version = "0.2.2"
83
+ source = "registry+https://github.com/rust-lang/crates.io-index"
84
+ checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
85
+
86
+ [[package]]
87
+ name = "crypto-common"
88
+ version = "0.1.6"
89
+ source = "registry+https://github.com/rust-lang/crates.io-index"
90
+ checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
91
+ dependencies = [
92
+ "generic-array",
93
+ "typenum",
94
+ ]
95
+
96
+ [[package]]
97
+ name = "digest"
98
+ version = "0.10.7"
99
+ source = "registry+https://github.com/rust-lang/crates.io-index"
100
+ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
101
+ dependencies = [
102
+ "block-buffer",
103
+ "crypto-common",
104
+ ]
105
+
106
+ [[package]]
107
+ name = "errno"
108
+ version = "0.3.9"
109
+ source = "registry+https://github.com/rust-lang/crates.io-index"
110
+ checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
111
+ dependencies = [
112
+ "libc",
113
+ "windows-sys",
114
+ ]
115
+
116
+ [[package]]
117
+ name = "filetime"
118
+ version = "0.2.23"
119
+ source = "registry+https://github.com/rust-lang/crates.io-index"
120
+ checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd"
121
+ dependencies = [
122
+ "cfg-if",
123
+ "libc",
124
+ "redox_syscall",
125
+ "windows-sys",
126
+ ]
127
+
128
+ [[package]]
129
+ name = "flate2"
130
+ version = "1.0.30"
131
+ source = "registry+https://github.com/rust-lang/crates.io-index"
132
+ checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
133
+ dependencies = [
134
+ "crc32fast",
135
+ "miniz_oxide",
136
+ ]
137
+
138
+ [[package]]
139
+ name = "form_urlencoded"
140
+ version = "1.2.1"
141
+ source = "registry+https://github.com/rust-lang/crates.io-index"
142
+ checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
143
+ dependencies = [
144
+ "percent-encoding",
145
+ ]
146
+
147
+ [[package]]
148
+ name = "generic-array"
149
+ version = "0.14.7"
150
+ source = "registry+https://github.com/rust-lang/crates.io-index"
151
+ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
152
+ dependencies = [
153
+ "typenum",
154
+ "version_check",
155
+ ]
156
+
157
+ [[package]]
158
+ name = "getrandom"
159
+ version = "0.2.15"
160
+ source = "registry+https://github.com/rust-lang/crates.io-index"
161
+ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
162
+ dependencies = [
163
+ "cfg-if",
164
+ "libc",
165
+ "wasi",
166
+ ]
167
+
168
+ [[package]]
169
+ name = "half"
170
+ version = "2.4.1"
171
+ source = "registry+https://github.com/rust-lang/crates.io-index"
172
+ checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
173
+ dependencies = [
174
+ "cfg-if",
175
+ "crunchy",
176
+ ]
177
+
178
+ [[package]]
179
+ name = "hound"
180
+ version = "3.5.1"
181
+ source = "registry+https://github.com/rust-lang/crates.io-index"
182
+ checksum = "62adaabb884c94955b19907d60019f4e145d091c75345379e70d1ee696f7854f"
183
+
184
+ [[package]]
185
+ name = "idna"
186
+ version = "0.5.0"
187
+ source = "registry+https://github.com/rust-lang/crates.io-index"
188
+ checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
189
+ dependencies = [
190
+ "unicode-bidi",
191
+ "unicode-normalization",
192
+ ]
193
+
194
+ [[package]]
195
+ name = "js-sys"
196
+ version = "0.3.69"
197
+ source = "registry+https://github.com/rust-lang/crates.io-index"
198
+ checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
199
+ dependencies = [
200
+ "wasm-bindgen",
201
+ ]
202
+
203
+ [[package]]
204
+ name = "libc"
205
+ version = "0.2.155"
206
+ source = "registry+https://github.com/rust-lang/crates.io-index"
207
+ checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
208
+
209
+ [[package]]
210
+ name = "libloading"
211
+ version = "0.8.3"
212
+ source = "registry+https://github.com/rust-lang/crates.io-index"
213
+ checksum = "0c2a198fb6b0eada2a8df47933734e6d35d350665a33a3593d7164fa52c75c19"
214
+ dependencies = [
215
+ "cfg-if",
216
+ "windows-targets",
217
+ ]
218
+
219
+ [[package]]
220
+ name = "linux-raw-sys"
221
+ version = "0.4.14"
222
+ source = "registry+https://github.com/rust-lang/crates.io-index"
223
+ checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
224
+
225
+ [[package]]
226
+ name = "log"
227
+ version = "0.4.21"
228
+ source = "registry+https://github.com/rust-lang/crates.io-index"
229
+ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
230
+
231
+ [[package]]
232
+ name = "matrixmultiply"
233
+ version = "0.3.8"
234
+ source = "registry+https://github.com/rust-lang/crates.io-index"
235
+ checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2"
236
+ dependencies = [
237
+ "autocfg",
238
+ "rawpointer",
239
+ ]
240
+
241
+ [[package]]
242
+ name = "miniz_oxide"
243
+ version = "0.7.3"
244
+ source = "registry+https://github.com/rust-lang/crates.io-index"
245
+ checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae"
246
+ dependencies = [
247
+ "adler",
248
+ ]
249
+
250
+ [[package]]
251
+ name = "ndarray"
252
+ version = "0.15.6"
253
+ source = "registry+https://github.com/rust-lang/crates.io-index"
254
+ checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
255
+ dependencies = [
256
+ "matrixmultiply",
257
+ "num-complex",
258
+ "num-integer",
259
+ "num-traits",
260
+ "rawpointer",
261
+ ]
262
+
263
+ [[package]]
264
+ name = "num-complex"
265
+ version = "0.4.6"
266
+ source = "registry+https://github.com/rust-lang/crates.io-index"
267
+ checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
268
+ dependencies = [
269
+ "num-traits",
270
+ ]
271
+
272
+ [[package]]
273
+ name = "num-integer"
274
+ version = "0.1.46"
275
+ source = "registry+https://github.com/rust-lang/crates.io-index"
276
+ checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
277
+ dependencies = [
278
+ "num-traits",
279
+ ]
280
+
281
+ [[package]]
282
+ name = "num-traits"
283
+ version = "0.2.19"
284
+ source = "registry+https://github.com/rust-lang/crates.io-index"
285
+ checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
286
+ dependencies = [
287
+ "autocfg",
288
+ ]
289
+
290
+ [[package]]
291
+ name = "once_cell"
292
+ version = "1.19.0"
293
+ source = "registry+https://github.com/rust-lang/crates.io-index"
294
+ checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
295
+
296
+ [[package]]
297
+ name = "ort"
298
+ version = "2.0.0-rc.2"
299
+ source = "registry+https://github.com/rust-lang/crates.io-index"
300
+ checksum = "0bc80894094c6a875bfac64415ed456fa661081a278a035e22be661305c87e14"
301
+ dependencies = [
302
+ "half",
303
+ "js-sys",
304
+ "libloading",
305
+ "ndarray",
306
+ "ort-sys",
307
+ "thiserror",
308
+ "tracing",
309
+ "web-sys",
310
+ ]
311
+
312
+ [[package]]
313
+ name = "ort-sys"
314
+ version = "2.0.0-rc.2"
315
+ source = "registry+https://github.com/rust-lang/crates.io-index"
316
+ checksum = "b3d9c1373fc813d3f024d394f621f4c6dde0734c79b1c17113c3bb5bf0084bbe"
317
+ dependencies = [
318
+ "flate2",
319
+ "sha2",
320
+ "tar",
321
+ "ureq",
322
+ ]
323
+
324
+ [[package]]
325
+ name = "percent-encoding"
326
+ version = "2.3.1"
327
+ source = "registry+https://github.com/rust-lang/crates.io-index"
328
+ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
329
+
330
+ [[package]]
331
+ name = "pin-project-lite"
332
+ version = "0.2.14"
333
+ source = "registry+https://github.com/rust-lang/crates.io-index"
334
+ checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
335
+
336
+ [[package]]
337
+ name = "proc-macro2"
338
+ version = "1.0.84"
339
+ source = "registry+https://github.com/rust-lang/crates.io-index"
340
+ checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6"
341
+ dependencies = [
342
+ "unicode-ident",
343
+ ]
344
+
345
+ [[package]]
346
+ name = "quote"
347
+ version = "1.0.36"
348
+ source = "registry+https://github.com/rust-lang/crates.io-index"
349
+ checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
350
+ dependencies = [
351
+ "proc-macro2",
352
+ ]
353
+
354
+ [[package]]
355
+ name = "rawpointer"
356
+ version = "0.2.1"
357
+ source = "registry+https://github.com/rust-lang/crates.io-index"
358
+ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
359
+
360
+ [[package]]
361
+ name = "redox_syscall"
362
+ version = "0.4.1"
363
+ source = "registry+https://github.com/rust-lang/crates.io-index"
364
+ checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
365
+ dependencies = [
366
+ "bitflags 1.3.2",
367
+ ]
368
+
369
+ [[package]]
370
+ name = "ring"
371
+ version = "0.17.8"
372
+ source = "registry+https://github.com/rust-lang/crates.io-index"
373
+ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d"
374
+ dependencies = [
375
+ "cc",
376
+ "cfg-if",
377
+ "getrandom",
378
+ "libc",
379
+ "spin",
380
+ "untrusted",
381
+ "windows-sys",
382
+ ]
383
+
384
+ [[package]]
385
+ name = "rust-example"
386
+ version = "0.1.0"
387
+ dependencies = [
388
+ "hound",
389
+ "ndarray",
390
+ "ort",
391
+ ]
392
+
393
+ [[package]]
394
+ name = "rustix"
395
+ version = "0.38.34"
396
+ source = "registry+https://github.com/rust-lang/crates.io-index"
397
+ checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
398
+ dependencies = [
399
+ "bitflags 2.5.0",
400
+ "errno",
401
+ "libc",
402
+ "linux-raw-sys",
403
+ "windows-sys",
404
+ ]
405
+
406
+ [[package]]
407
+ name = "rustls"
408
+ version = "0.22.4"
409
+ source = "registry+https://github.com/rust-lang/crates.io-index"
410
+ checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
411
+ dependencies = [
412
+ "log",
413
+ "ring",
414
+ "rustls-pki-types",
415
+ "rustls-webpki",
416
+ "subtle",
417
+ "zeroize",
418
+ ]
419
+
420
+ [[package]]
421
+ name = "rustls-pki-types"
422
+ version = "1.7.0"
423
+ source = "registry+https://github.com/rust-lang/crates.io-index"
424
+ checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d"
425
+
426
+ [[package]]
427
+ name = "rustls-webpki"
428
+ version = "0.102.4"
429
+ source = "registry+https://github.com/rust-lang/crates.io-index"
430
+ checksum = "ff448f7e92e913c4b7d4c6d8e4540a1724b319b4152b8aef6d4cf8339712b33e"
431
+ dependencies = [
432
+ "ring",
433
+ "rustls-pki-types",
434
+ "untrusted",
435
+ ]
436
+
437
+ [[package]]
438
+ name = "sha2"
439
+ version = "0.10.8"
440
+ source = "registry+https://github.com/rust-lang/crates.io-index"
441
+ checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
442
+ dependencies = [
443
+ "cfg-if",
444
+ "cpufeatures",
445
+ "digest",
446
+ ]
447
+
448
+ [[package]]
449
+ name = "spin"
450
+ version = "0.9.8"
451
+ source = "registry+https://github.com/rust-lang/crates.io-index"
452
+ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
453
+
454
+ [[package]]
455
+ name = "subtle"
456
+ version = "2.5.0"
457
+ source = "registry+https://github.com/rust-lang/crates.io-index"
458
+ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
459
+
460
+ [[package]]
461
+ name = "syn"
462
+ version = "2.0.66"
463
+ source = "registry+https://github.com/rust-lang/crates.io-index"
464
+ checksum = "c42f3f41a2de00b01c0aaad383c5a45241efc8b2d1eda5661812fda5f3cdcff5"
465
+ dependencies = [
466
+ "proc-macro2",
467
+ "quote",
468
+ "unicode-ident",
469
+ ]
470
+
471
+ [[package]]
472
+ name = "tar"
473
+ version = "0.4.40"
474
+ source = "registry+https://github.com/rust-lang/crates.io-index"
475
+ checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
476
+ dependencies = [
477
+ "filetime",
478
+ "libc",
479
+ "xattr",
480
+ ]
481
+
482
+ [[package]]
483
+ name = "thiserror"
484
+ version = "1.0.61"
485
+ source = "registry+https://github.com/rust-lang/crates.io-index"
486
+ checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
487
+ dependencies = [
488
+ "thiserror-impl",
489
+ ]
490
+
491
+ [[package]]
492
+ name = "thiserror-impl"
493
+ version = "1.0.61"
494
+ source = "registry+https://github.com/rust-lang/crates.io-index"
495
+ checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
496
+ dependencies = [
497
+ "proc-macro2",
498
+ "quote",
499
+ "syn",
500
+ ]
501
+
502
+ [[package]]
503
+ name = "tinyvec"
504
+ version = "1.6.0"
505
+ source = "registry+https://github.com/rust-lang/crates.io-index"
506
+ checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50"
507
+ dependencies = [
508
+ "tinyvec_macros",
509
+ ]
510
+
511
+ [[package]]
512
+ name = "tinyvec_macros"
513
+ version = "0.1.1"
514
+ source = "registry+https://github.com/rust-lang/crates.io-index"
515
+ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
516
+
517
+ [[package]]
518
+ name = "tracing"
519
+ version = "0.1.40"
520
+ source = "registry+https://github.com/rust-lang/crates.io-index"
521
+ checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
522
+ dependencies = [
523
+ "pin-project-lite",
524
+ "tracing-attributes",
525
+ "tracing-core",
526
+ ]
527
+
528
+ [[package]]
529
+ name = "tracing-attributes"
530
+ version = "0.1.27"
531
+ source = "registry+https://github.com/rust-lang/crates.io-index"
532
+ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
533
+ dependencies = [
534
+ "proc-macro2",
535
+ "quote",
536
+ "syn",
537
+ ]
538
+
539
+ [[package]]
540
+ name = "tracing-core"
541
+ version = "0.1.32"
542
+ source = "registry+https://github.com/rust-lang/crates.io-index"
543
+ checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
544
+ dependencies = [
545
+ "once_cell",
546
+ ]
547
+
548
+ [[package]]
549
+ name = "typenum"
550
+ version = "1.17.0"
551
+ source = "registry+https://github.com/rust-lang/crates.io-index"
552
+ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
553
+
554
+ [[package]]
555
+ name = "unicode-bidi"
556
+ version = "0.3.15"
557
+ source = "registry+https://github.com/rust-lang/crates.io-index"
558
+ checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"
559
+
560
+ [[package]]
561
+ name = "unicode-ident"
562
+ version = "1.0.12"
563
+ source = "registry+https://github.com/rust-lang/crates.io-index"
564
+ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
565
+
566
+ [[package]]
567
+ name = "unicode-normalization"
568
+ version = "0.1.23"
569
+ source = "registry+https://github.com/rust-lang/crates.io-index"
570
+ checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
571
+ dependencies = [
572
+ "tinyvec",
573
+ ]
574
+
575
+ [[package]]
576
+ name = "untrusted"
577
+ version = "0.9.0"
578
+ source = "registry+https://github.com/rust-lang/crates.io-index"
579
+ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
580
+
581
+ [[package]]
582
+ name = "ureq"
583
+ version = "2.9.7"
584
+ source = "registry+https://github.com/rust-lang/crates.io-index"
585
+ checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
586
+ dependencies = [
587
+ "base64",
588
+ "log",
589
+ "once_cell",
590
+ "rustls",
591
+ "rustls-pki-types",
592
+ "rustls-webpki",
593
+ "url",
594
+ "webpki-roots",
595
+ ]
596
+
597
+ [[package]]
598
+ name = "url"
599
+ version = "2.5.0"
600
+ source = "registry+https://github.com/rust-lang/crates.io-index"
601
+ checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"
602
+ dependencies = [
603
+ "form_urlencoded",
604
+ "idna",
605
+ "percent-encoding",
606
+ ]
607
+
608
+ [[package]]
609
+ name = "version_check"
610
+ version = "0.9.4"
611
+ source = "registry+https://github.com/rust-lang/crates.io-index"
612
+ checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
613
+
614
+ [[package]]
615
+ name = "wasi"
616
+ version = "0.11.0+wasi-snapshot-preview1"
617
+ source = "registry+https://github.com/rust-lang/crates.io-index"
618
+ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
619
+
620
+ [[package]]
621
+ name = "wasm-bindgen"
622
+ version = "0.2.92"
623
+ source = "registry+https://github.com/rust-lang/crates.io-index"
624
+ checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
625
+ dependencies = [
626
+ "cfg-if",
627
+ "wasm-bindgen-macro",
628
+ ]
629
+
630
+ [[package]]
631
+ name = "wasm-bindgen-backend"
632
+ version = "0.2.92"
633
+ source = "registry+https://github.com/rust-lang/crates.io-index"
634
+ checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
635
+ dependencies = [
636
+ "bumpalo",
637
+ "log",
638
+ "once_cell",
639
+ "proc-macro2",
640
+ "quote",
641
+ "syn",
642
+ "wasm-bindgen-shared",
643
+ ]
644
+
645
+ [[package]]
646
+ name = "wasm-bindgen-macro"
647
+ version = "0.2.92"
648
+ source = "registry+https://github.com/rust-lang/crates.io-index"
649
+ checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
650
+ dependencies = [
651
+ "quote",
652
+ "wasm-bindgen-macro-support",
653
+ ]
654
+
655
+ [[package]]
656
+ name = "wasm-bindgen-macro-support"
657
+ version = "0.2.92"
658
+ source = "registry+https://github.com/rust-lang/crates.io-index"
659
+ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
660
+ dependencies = [
661
+ "proc-macro2",
662
+ "quote",
663
+ "syn",
664
+ "wasm-bindgen-backend",
665
+ "wasm-bindgen-shared",
666
+ ]
667
+
668
+ [[package]]
669
+ name = "wasm-bindgen-shared"
670
+ version = "0.2.92"
671
+ source = "registry+https://github.com/rust-lang/crates.io-index"
672
+ checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
673
+
674
+ [[package]]
675
+ name = "web-sys"
676
+ version = "0.3.69"
677
+ source = "registry+https://github.com/rust-lang/crates.io-index"
678
+ checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
679
+ dependencies = [
680
+ "js-sys",
681
+ "wasm-bindgen",
682
+ ]
683
+
684
+ [[package]]
685
+ name = "webpki-roots"
686
+ version = "0.26.1"
687
+ source = "registry+https://github.com/rust-lang/crates.io-index"
688
+ checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009"
689
+ dependencies = [
690
+ "rustls-pki-types",
691
+ ]
692
+
693
+ [[package]]
694
+ name = "windows-sys"
695
+ version = "0.52.0"
696
+ source = "registry+https://github.com/rust-lang/crates.io-index"
697
+ checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
698
+ dependencies = [
699
+ "windows-targets",
700
+ ]
701
+
702
+ [[package]]
703
+ name = "windows-targets"
704
+ version = "0.52.5"
705
+ source = "registry+https://github.com/rust-lang/crates.io-index"
706
+ checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
707
+ dependencies = [
708
+ "windows_aarch64_gnullvm",
709
+ "windows_aarch64_msvc",
710
+ "windows_i686_gnu",
711
+ "windows_i686_gnullvm",
712
+ "windows_i686_msvc",
713
+ "windows_x86_64_gnu",
714
+ "windows_x86_64_gnullvm",
715
+ "windows_x86_64_msvc",
716
+ ]
717
+
718
+ [[package]]
719
+ name = "windows_aarch64_gnullvm"
720
+ version = "0.52.5"
721
+ source = "registry+https://github.com/rust-lang/crates.io-index"
722
+ checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
723
+
724
+ [[package]]
725
+ name = "windows_aarch64_msvc"
726
+ version = "0.52.5"
727
+ source = "registry+https://github.com/rust-lang/crates.io-index"
728
+ checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
729
+
730
+ [[package]]
731
+ name = "windows_i686_gnu"
732
+ version = "0.52.5"
733
+ source = "registry+https://github.com/rust-lang/crates.io-index"
734
+ checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
735
+
736
+ [[package]]
737
+ name = "windows_i686_gnullvm"
738
+ version = "0.52.5"
739
+ source = "registry+https://github.com/rust-lang/crates.io-index"
740
+ checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
741
+
742
+ [[package]]
743
+ name = "windows_i686_msvc"
744
+ version = "0.52.5"
745
+ source = "registry+https://github.com/rust-lang/crates.io-index"
746
+ checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
747
+
748
+ [[package]]
749
+ name = "windows_x86_64_gnu"
750
+ version = "0.52.5"
751
+ source = "registry+https://github.com/rust-lang/crates.io-index"
752
+ checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
753
+
754
+ [[package]]
755
+ name = "windows_x86_64_gnullvm"
756
+ version = "0.52.5"
757
+ source = "registry+https://github.com/rust-lang/crates.io-index"
758
+ checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
759
+
760
+ [[package]]
761
+ name = "windows_x86_64_msvc"
762
+ version = "0.52.5"
763
+ source = "registry+https://github.com/rust-lang/crates.io-index"
764
+ checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
765
+
766
+ [[package]]
767
+ name = "xattr"
768
+ version = "1.3.1"
769
+ source = "registry+https://github.com/rust-lang/crates.io-index"
770
+ checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f"
771
+ dependencies = [
772
+ "libc",
773
+ "linux-raw-sys",
774
+ "rustix",
775
+ ]
776
+
777
+ [[package]]
778
+ name = "zeroize"
779
+ version = "1.8.1"
780
+ source = "registry+https://github.com/rust-lang/crates.io-index"
781
+ checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
snakers4_silero-vad_master/examples/rust-example/Cargo.toml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [package]
2
+ name = "rust-example"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+
6
+ [dependencies]
7
+ ort = { version = "2.0.0-rc.2", features = ["load-dynamic", "ndarray"] }
8
+ ndarray = "0.15"
9
+ hound = "3"
snakers4_silero-vad_master/examples/rust-example/README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Stream example in Rust
2
+ Based on the [C++ stream example](https://github.com/snakers4/silero-vad/tree/master/examples/cpp)
3
+
4
+ ## Dependencies
5
+ - To build Rust crate `ort` you need `cc` installed.
6
+
7
+ ## Usage
8
+ Just
9
+ ```
10
+ cargo run
11
+ ```
12
+ If you run the example outside of this repo, point the `SILERO_MODEL_PATH` environment variable at the model:
13
+ ```
14
+ SILERO_MODEL_PATH=/path/to/silero_vad.onnx cargo run
15
+ ```
16
+ To test against a wav file other than `recorder.wav`, pass it as the first argument:
17
+ ```
18
+ cargo run -- /path/to/audio/file.wav
19
+ ```
snakers4_silero-vad_master/examples/rust-example/src/main.rs ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mod silero;
2
+ mod utils;
3
+ mod vad_iter;
4
+
5
+ fn main() {
6
+ let model_path = std::env::var("SILERO_MODEL_PATH")
7
+ .unwrap_or_else(|_| String::from("../../files/silero_vad.onnx"));
8
+ let audio_path = std::env::args()
9
+ .nth(1)
10
+ .unwrap_or_else(|| String::from("recorder.wav"));
11
+ let mut wav_reader = hound::WavReader::open(audio_path).unwrap();
12
+ let sample_rate = match wav_reader.spec().sample_rate {
13
+ 8000 => utils::SampleRate::EightkHz,
14
+ 16000 => utils::SampleRate::SixteenkHz,
15
+ _ => panic!("Unsupported sample rate. Expect 8 kHz or 16 kHz."),
16
+ };
17
+ if wav_reader.spec().sample_format != hound::SampleFormat::Int {
18
+ panic!("Unsupported sample format. Expect Int.");
19
+ }
20
+ let content = wav_reader
21
+ .samples()
22
+ .filter_map(|x| x.ok())
23
+ .collect::<Vec<i16>>();
24
+ assert!(!content.is_empty());
25
+ let silero = silero::Silero::new(sample_rate, model_path).unwrap();
26
+ let vad_params = utils::VadParams {
27
+ sample_rate: sample_rate.into(),
28
+ ..Default::default()
29
+ };
30
+ let mut vad_iterator = vad_iter::VadIter::new(silero, vad_params);
31
+ vad_iterator.process(&content).unwrap();
32
+ for timestamp in vad_iterator.speeches() {
33
+ println!("{}", timestamp);
34
+ }
35
+ println!("Finished.");
36
+ }
snakers4_silero-vad_master/examples/rust-example/src/silero.rs ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use crate::utils;
2
+ use ndarray::{Array, Array2, ArrayBase, ArrayD, Dim, IxDynImpl, OwnedRepr};
3
+ use std::path::Path;
4
+
5
+ #[derive(Debug)]
6
+ pub struct Silero {
7
+ session: ort::Session,
8
+ sample_rate: ArrayBase<OwnedRepr<i64>, Dim<[usize; 1]>>,
9
+ h: ArrayBase<OwnedRepr<f32>, Dim<IxDynImpl>>,
10
+ c: ArrayBase<OwnedRepr<f32>, Dim<IxDynImpl>>,
11
+ }
12
+
13
+ impl Silero {
14
+ pub fn new(
15
+ sample_rate: utils::SampleRate,
16
+ model_path: impl AsRef<Path>,
17
+ ) -> Result<Self, ort::Error> {
18
+ let session = ort::Session::builder()?.commit_from_file(model_path)?;
19
+ let h = ArrayD::<f32>::zeros([2, 1, 64].as_slice());
20
+ let c = ArrayD::<f32>::zeros([2, 1, 64].as_slice());
21
+ let sample_rate = Array::from_shape_vec([1], vec![sample_rate.into()]).unwrap();
22
+ Ok(Self {
23
+ session,
24
+ sample_rate,
25
+ h,
26
+ c,
27
+ })
28
+ }
29
+
30
+ pub fn reset(&mut self) {
31
+ self.h = ArrayD::<f32>::zeros([2, 1, 64].as_slice());
32
+ self.c = ArrayD::<f32>::zeros([2, 1, 64].as_slice());
33
+ }
34
+
35
+ pub fn calc_level(&mut self, audio_frame: &[i16]) -> Result<f32, ort::Error> {
36
+ let data = audio_frame
37
+ .iter()
38
+ .map(|x| (*x as f32) / (i16::MAX as f32))
39
+ .collect::<Vec<_>>();
40
+ let frame = Array2::<f32>::from_shape_vec([1, data.len()], data).unwrap();
41
+ let inps = ort::inputs![
42
+ frame,
43
+ self.sample_rate.clone(),
44
+ std::mem::take(&mut self.h),
45
+ std::mem::take(&mut self.c)
46
+ ]?;
47
+ let res = self
48
+ .session
49
+ .run(ort::SessionInputs::ValueSlice::<4>(&inps))?;
50
+ self.h = res["hn"].try_extract_tensor().unwrap().to_owned();
51
+ self.c = res["cn"].try_extract_tensor().unwrap().to_owned();
52
+ Ok(*res["output"]
53
+ .try_extract_raw_tensor::<f32>()
54
+ .unwrap()
55
+ .1
56
+ .first()
57
+ .unwrap())
58
+ }
59
+ }
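`calc_level` above scales each `i16` sample into `[-1, 1]` and threads the recurrent `h`/`c` states (shape `[2, 1, 64]`) through every call, matching the ONNX model's inputs. The same preprocessing, sketched in Python for cross-reference with `OnnxWrapper.reset_states` in `utils_vad.py` further below (an illustrative sketch, not part of the crate):

```
import numpy as np

# Recurrent states, zeroed exactly as in Silero::new / Silero::reset
h = np.zeros((2, 1, 64), dtype=np.float32)
c = np.zeros((2, 1, 64), dtype=np.float32)

# One silent 64 ms frame at 16 kHz, normalized the way calc_level does
frame_i16 = np.zeros(1024, dtype=np.int16)
frame = frame_i16.astype(np.float32) / np.iinfo(np.int16).max  # i16 -> [-1.0, 1.0]
print(frame.shape, h.shape)  # (1024,) (2, 1, 64)
```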
snakers4_silero-vad_master/examples/rust-example/src/utils.rs ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #[derive(Debug, Clone, Copy)]
2
+ pub enum SampleRate {
3
+ EightkHz,
4
+ SixteenkHz,
5
+ }
6
+
7
+ impl From<SampleRate> for i64 {
8
+ fn from(value: SampleRate) -> Self {
9
+ match value {
10
+ SampleRate::EightkHz => 8000,
11
+ SampleRate::SixteenkHz => 16000,
12
+ }
13
+ }
14
+ }
15
+
16
+ impl From<SampleRate> for usize {
17
+ fn from(value: SampleRate) -> Self {
18
+ match value {
19
+ SampleRate::EightkHz => 8000,
20
+ SampleRate::SixteenkHz => 16000,
21
+ }
22
+ }
23
+ }
24
+
25
+ #[derive(Debug)]
26
+ pub struct VadParams {
27
+ pub frame_size: usize,
28
+ pub threshold: f32,
29
+ pub min_silence_duration_ms: usize,
30
+ pub speech_pad_ms: usize,
31
+ pub min_speech_duration_ms: usize,
32
+ pub max_speech_duration_s: f32,
33
+ pub sample_rate: usize,
34
+ }
35
+
36
+ impl Default for VadParams {
37
+ fn default() -> Self {
38
+ Self {
39
+ frame_size: 64,
40
+ threshold: 0.5,
41
+ min_silence_duration_ms: 0,
42
+ speech_pad_ms: 64,
43
+ min_speech_duration_ms: 64,
44
+ max_speech_duration_s: f32::INFINITY,
45
+ sample_rate: 16000,
46
+ }
47
+ }
48
+ }
49
+
50
+ #[derive(Debug, Default)]
51
+ pub struct TimeStamp {
52
+ pub start: i64,
53
+ pub end: i64,
54
+ }
55
+
56
+ impl std::fmt::Display for TimeStamp {
57
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
58
+ write!(f, "[start:{:08}, end:{:08}]", self.start, self.end)
59
+ }
60
+ }
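`VadParams` above is expressed in milliseconds; `Params::from` in `vad_iter.rs` (next file) converts everything to sample counts. A quick back-of-the-envelope check of those conversions at the default 16 kHz (a sketch, assuming the default values above):

```
sr = 16000
sr_per_ms = sr // 1000                      # 16 samples per millisecond
frame_size_samples = 64 * sr_per_ms         # 64 ms frame   -> 1024 samples
speech_pad_samples = 64 * sr_per_ms         # 64 ms padding -> 1024 samples
min_speech_samples = 64 * sr_per_ms         # speeches shorter than 1024 samples are dropped
min_silence_at_max_speech = 98 * sr_per_ms  # hard-coded 98 ms -> 1568 samples
print(frame_size_samples, min_silence_at_max_speech)  # 1024 1568
```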
snakers4_silero-vad_master/examples/rust-example/src/vad_iter.rs ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use crate::{silero, utils};
2
+
3
+ const DEBUG_SPEECH_PROB: bool = true;
4
+ #[derive(Debug)]
5
+ pub struct VadIter {
6
+ silero: silero::Silero,
7
+ params: Params,
8
+ state: State,
9
+ }
10
+
11
+ impl VadIter {
12
+ pub fn new(silero: silero::Silero, params: utils::VadParams) -> Self {
13
+ Self {
14
+ silero,
15
+ params: Params::from(params),
16
+ state: State::new(),
17
+ }
18
+ }
19
+
20
+ pub fn process(&mut self, samples: &[i16]) -> Result<(), ort::Error> {
21
+ self.reset_states();
22
+ for audio_frame in samples.chunks_exact(self.params.frame_size_samples) {
23
+ let speech_prob = self.silero.calc_level(audio_frame)?;
24
+ self.state.update(&self.params, speech_prob);
25
+ }
26
+ self.state.check_for_last_speech(samples.len());
27
+ Ok(())
28
+ }
29
+
30
+ pub fn speeches(&self) -> &[utils::TimeStamp] {
31
+ &self.state.speeches
32
+ }
33
+ }
34
+
35
+ impl VadIter {
36
+ fn reset_states(&mut self) {
37
+ self.silero.reset();
38
+ self.state = State::new()
39
+ }
40
+ }
41
+
42
+ #[allow(unused)]
43
+ #[derive(Debug)]
44
+ struct Params {
45
+ frame_size: usize,
46
+ threshold: f32,
47
+ min_silence_duration_ms: usize,
48
+ speech_pad_ms: usize,
49
+ min_speech_duration_ms: usize,
50
+ max_speech_duration_s: f32,
51
+ sample_rate: usize,
52
+ sr_per_ms: usize,
53
+ frame_size_samples: usize,
54
+ min_speech_samples: usize,
55
+ speech_pad_samples: usize,
56
+ max_speech_samples: f32,
57
+ min_silence_samples: usize,
58
+ min_silence_samples_at_max_speech: usize,
59
+ }
60
+
61
+ impl From<utils::VadParams> for Params {
62
+ fn from(value: utils::VadParams) -> Self {
63
+ let frame_size = value.frame_size;
64
+ let threshold = value.threshold;
65
+ let min_silence_duration_ms = value.min_silence_duration_ms;
66
+ let speech_pad_ms = value.speech_pad_ms;
67
+ let min_speech_duration_ms = value.min_speech_duration_ms;
68
+ let max_speech_duration_s = value.max_speech_duration_s;
69
+ let sample_rate = value.sample_rate;
70
+ let sr_per_ms = sample_rate / 1000;
71
+ let frame_size_samples = frame_size * sr_per_ms;
72
+ let min_speech_samples = sr_per_ms * min_speech_duration_ms;
73
+ let speech_pad_samples = sr_per_ms * speech_pad_ms;
74
+ let max_speech_samples = sample_rate as f32 * max_speech_duration_s
75
+ - frame_size_samples as f32
76
+ - 2.0 * speech_pad_samples as f32;
77
+ let min_silence_samples = sr_per_ms * min_silence_duration_ms;
78
+ let min_silence_samples_at_max_speech = sr_per_ms * 98;
79
+ Self {
80
+ frame_size,
81
+ threshold,
82
+ min_silence_duration_ms,
83
+ speech_pad_ms,
84
+ min_speech_duration_ms,
85
+ max_speech_duration_s,
86
+ sample_rate,
87
+ sr_per_ms,
88
+ frame_size_samples,
89
+ min_speech_samples,
90
+ speech_pad_samples,
91
+ max_speech_samples,
92
+ min_silence_samples,
93
+ min_silence_samples_at_max_speech,
94
+ }
95
+ }
96
+ }
97
+
98
+ #[derive(Debug, Default)]
99
+ struct State {
100
+ current_sample: usize,
101
+ temp_end: usize,
102
+ next_start: usize,
103
+ prev_end: usize,
104
+ triggered: bool,
105
+ current_speech: utils::TimeStamp,
106
+ speeches: Vec<utils::TimeStamp>,
107
+ }
108
+
109
+ impl State {
110
+ fn new() -> Self {
111
+ Default::default()
112
+ }
113
+
114
+ fn update(&mut self, params: &Params, speech_prob: f32) {
115
+ self.current_sample += params.frame_size_samples;
116
+ if speech_prob > params.threshold {
117
+ if self.temp_end != 0 {
118
+ self.temp_end = 0;
119
+ if self.next_start < self.prev_end {
120
+ self.next_start = self
121
+ .current_sample
122
+ .saturating_sub(params.frame_size_samples)
123
+ }
124
+ }
125
+ if !self.triggered {
126
+ self.debug(speech_prob, params, "start");
127
+ self.triggered = true;
128
+ self.current_speech.start =
129
+ self.current_sample as i64 - params.frame_size_samples as i64;
130
+ }
131
+ return;
132
+ }
133
+ if self.triggered
134
+ && (self.current_sample as i64 - self.current_speech.start) as f32
135
+ > params.max_speech_samples
136
+ {
137
+ if self.prev_end > 0 {
138
+ self.current_speech.end = self.prev_end as _;
139
+ self.take_speech();
140
+ if self.next_start < self.prev_end {
141
+ self.triggered = false
142
+ } else {
143
+ self.current_speech.start = self.next_start as _;
144
+ }
145
+ self.prev_end = 0;
146
+ self.next_start = 0;
147
+ self.temp_end = 0;
148
+ } else {
149
+ self.current_speech.end = self.current_sample as _;
150
+ self.take_speech();
151
+ self.prev_end = 0;
152
+ self.next_start = 0;
153
+ self.temp_end = 0;
154
+ self.triggered = false;
155
+ }
156
+ return;
157
+ }
158
+ if speech_prob >= (params.threshold - 0.15) && (speech_prob < params.threshold) {
159
+ if self.triggered {
160
+ self.debug(speech_prob, params, "speaking")
161
+ } else {
162
+ self.debug(speech_prob, params, "silence")
163
+ }
164
+ }
165
+ if self.triggered && speech_prob < (params.threshold - 0.15) {
166
+ self.debug(speech_prob, params, "end");
167
+ if self.temp_end == 0 {
168
+ self.temp_end = self.current_sample;
169
+ }
170
+ if self.current_sample.saturating_sub(self.temp_end)
171
+ > params.min_silence_samples_at_max_speech
172
+ {
173
+ self.prev_end = self.temp_end;
174
+ }
175
+ if self.current_sample.saturating_sub(self.temp_end) >= params.min_silence_samples {
176
+ self.current_speech.end = self.temp_end as _;
177
+ if self.current_speech.end - self.current_speech.start
178
+ > params.min_speech_samples as _
179
+ {
180
+ self.take_speech();
181
+ self.prev_end = 0;
182
+ self.next_start = 0;
183
+ self.temp_end = 0;
184
+ self.triggered = false;
185
+ }
186
+ }
187
+ }
188
+ }
189
+
190
+ fn take_speech(&mut self) {
191
+ self.speeches.push(std::mem::take(&mut self.current_speech)); // current speech becomes TimeStamp::default() due to take()
192
+ }
193
+
194
+ fn check_for_last_speech(&mut self, last_sample: usize) {
195
+ if self.current_speech.start > 0 {
196
+ self.current_speech.end = last_sample as _;
197
+ self.take_speech();
198
+ self.prev_end = 0;
199
+ self.next_start = 0;
200
+ self.temp_end = 0;
201
+ self.triggered = false;
202
+ }
203
+ }
204
+
205
+ fn debug(&self, speech_prob: f32, params: &Params, title: &str) {
206
+ if DEBUG_SPEECH_PROB {
207
+ let speech = self.current_sample as f32
208
+ - params.frame_size_samples as f32
209
+ - if title == "end" {
210
+ params.speech_pad_samples
211
+ } else {
212
+ 0
213
+ } as f32; // minus window_size_samples to get precise start time point.
214
+ println!(
215
+ "[{:10}: {:.3} s ({:.3}) {:8}]",
216
+ title,
217
+ speech / params.sample_rate as f32,
218
+ speech_prob,
219
+ self.current_sample - params.frame_size_samples,
220
+ );
221
+ }
222
+ }
223
+ }
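The heart of `State::update` above is a hysteresis rule: a segment starts once the probability rises above `threshold`, is only considered for ending once it falls below `threshold - 0.15`, and is closed only after `min_silence_samples` of sustained silence. A deliberately simplified Python sketch of just the trigger/release part (names are illustrative; the real method also handles `max_speech_duration_s` splitting and the silence counters):

```
def trigger_release(prob, threshold, triggered):
    """Simplified start/release rule mirroring State::update."""
    if not triggered and prob > threshold:
        return True, "start"                # rose above the upper threshold
    if triggered and prob < threshold - 0.15:
        return True, "end candidate"        # fell below the release threshold
    return triggered, None                  # the band in between keeps the state

print(trigger_release(0.6, 0.5, False))  # (True, 'start')
print(trigger_release(0.4, 0.5, True))   # (True, None) -- inside the hysteresis band
print(trigger_release(0.3, 0.5, True))   # (True, 'end candidate')
```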
snakers4_silero-vad_master/files/lang_dict_95.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"59": "mg, Malagasy", "76": "tk, Turkmen", "20": "lb, Luxembourgish, Letzeburgesch", "62": "or, Oriya", "30": "en, English", "26": "oc, Occitan", "69": "no, Norwegian", "77": "sr, Serbian", "90": "bs, Bosnian", "71": "el, Greek, Modern (1453\u2013)", "15": "az, Azerbaijani", "12": "lo, Lao", "85": "zh-HK, Chinese", "79": "cs, Czech", "43": "sv, Swedish", "37": "mn, Mongolian", "32": "fi, Finnish", "51": "tg, Tajik", "46": "am, Amharic", "17": "nn, Norwegian Nynorsk", "40": "ja, Japanese", "8": "it, Italian", "21": "ha, Hausa", "11": "as, Assamese", "29": "fa, Persian", "82": "bn, Bengali", "54": "mk, Macedonian", "31": "sw, Swahili", "45": "vi, Vietnamese", "41": "ur, Urdu", "74": "bo, Tibetan", "4": "hi, Hindi", "86": "mr, Marathi", "3": "fy-NL, Western Frisian", "65": "sk, Slovak", "2": "ln, Lingala", "92": "gl, Galician", "53": "sn, Shona", "87": "su, Sundanese", "35": "tt, Tatar", "93": "kn, Kannada", "6": "yo, Yoruba", "27": "ps, Pashto, Pushto", "34": "hy, Armenian", "25": "pa-IN, Punjabi, Panjabi", "23": "nl, Dutch, Flemish", "48": "th, Thai", "73": "mt, Maltese", "55": "ar, Arabic", "89": "ba, Bashkir", "78": "bg, Bulgarian", "42": "yi, Yiddish", "5": "ru, Russian", "84": "sv-SE, Swedish", "80": "tr, Turkish", "33": "sq, Albanian", "38": "kk, Kazakh", "50": "pl, Polish", "9": "hr, Croatian", "66": "ky, Kirghiz, Kyrgyz", "49": "hu, Hungarian", "10": "si, Sinhala, Sinhalese", "56": "la, Latin", "75": "de, German", "14": "ko, Korean", "22": "id, Indonesian", "47": "sl, Slovenian", "57": "be, Belarusian", "36": "ta, Tamil", "7": "da, Danish", "91": "sd, Sindhi", "28": "et, Estonian", "63": "pt, Portuguese", "60": "ne, Nepali", "94": "zh-TW, Chinese", "18": "zh-CN, Chinese", "88": "rw, Kinyarwanda", "19": "es, Spanish, Castilian", "39": "ht, Haitian, Haitian Creole", "64": "tl, Tagalog", "83": "ms, Malay", "70": "ro, Romanian, Moldavian, Moldovan", "68": "pa, Punjabi, Panjabi", "52": "uz, Uzbek", "58": "km, Central Khmer", "67": "my, Burmese", "0": "fr, French", "24": "af, Afrikaans", "16": "gu, Gujarati", "81": "so, Somali", "13": "uk, Ukrainian", "44": "ca, Catalan, Valencian", "72": "ml, Malayalam", "61": "te, Telugu", "1": "zh, Chinese"}
snakers4_silero-vad_master/files/lang_group_dict_95.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0": ["Afrikaans", "Dutch, Flemish", "Western Frisian"], "1": ["Turkish", "Azerbaijani"], "2": ["Russian", "Slovak", "Ukrainian", "Czech", "Polish", "Belarusian"], "3": ["Bulgarian", "Macedonian", "Serbian", "Croatian", "Bosnian", "Slovenian"], "4": ["Norwegian Nynorsk", "Swedish", "Danish", "Norwegian"], "5": ["English"], "6": ["Finnish", "Estonian"], "7": ["Yiddish", "Luxembourgish, Letzeburgesch", "German"], "8": ["Spanish", "Occitan", "Portuguese", "Catalan, Valencian", "Galician", "Spanish, Castilian", "Italian"], "9": ["Maltese", "Arabic"], "10": ["Marathi"], "11": ["Hindi", "Urdu"], "12": ["Lao", "Thai"], "13": ["Malay", "Indonesian"], "14": ["Romanian, Moldavian, Moldovan"], "15": ["Tagalog"], "16": ["Tajik", "Persian"], "17": ["Kazakh", "Uzbek", "Kirghiz, Kyrgyz"], "18": ["Kinyarwanda"], "19": ["Tatar", "Bashkir"], "20": ["French"], "21": ["Chinese"], "22": ["Lingala"], "23": ["Yoruba"], "24": ["Sinhala, Sinhalese"], "25": ["Assamese"], "26": ["Korean"], "27": ["Gujarati"], "28": ["Hausa"], "29": ["Punjabi, Panjabi"], "30": ["Pashto, Pushto"], "31": ["Swahili"], "32": ["Albanian"], "33": ["Armenian"], "34": ["Mongolian"], "35": ["Tamil"], "36": ["Haitian, Haitian Creole"], "37": ["Japanese"], "38": ["Vietnamese"], "39": ["Amharic"], "40": ["Hungarian"], "41": ["Shona"], "42": ["Latin"], "43": ["Central Khmer"], "44": ["Malagasy"], "45": ["Nepali"], "46": ["Telugu"], "47": ["Oriya"], "48": ["Burmese"], "49": ["Greek, Modern (1453\u2013)"], "50": ["Malayalam"], "51": ["Tibetan"], "52": ["Turkmen"], "53": ["Somali"], "54": ["Bengali"], "55": ["Sundanese"], "56": ["Sindhi"], "57": ["Kannada"]}
snakers4_silero-vad_master/files/silero_logo.jpg ADDED
snakers4_silero-vad_master/files/silero_vad.jit ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99033608562094bbb44e2363198cd47647a668f846c4c9a9edde68b4800b5fd4
3
+ size 1439299
snakers4_silero-vad_master/files/silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a35ebf52fd3ce5f1469b2a36158dba761bc47b973ea3382b3186ca15b1f5af28
3
+ size 1807522
snakers4_silero-vad_master/hubconf.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dependencies = ['torch', 'torchaudio']
2
+ import torch
3
+ import json
4
+ import os
5
+ from utils_vad import (init_jit_model,
6
+ get_speech_timestamps,
7
+ get_number_ts,
8
+ get_language,
9
+ get_language_and_group,
10
+ save_audio,
11
+ read_audio,
12
+ VADIterator,
13
+ collect_chunks,
14
+ drop_chunks,
15
+ Validator,
16
+ OnnxWrapper)
17
+
18
+
19
+ def versiontuple(v):
20
+ splitted = v.split('+')[0].split(".")
21
+ version_list = []
22
+ for i in splitted:
23
+ try:
24
+ version_list.append(int(i))
25
+ except ValueError:  # non-numeric version component
26
+ version_list.append(0)
27
+ return tuple(version_list)
28
+
29
+
30
+ def silero_vad(onnx=False, force_onnx_cpu=False):
31
+ """Silero Voice Activity Detector
32
+ Returns a model with a set of utils
33
+ Please see https://github.com/snakers4/silero-vad for usage examples
34
+ """
35
+
36
+ if not onnx:
37
+ installed_version = torch.__version__
38
+ supported_version = '1.12.0'
39
+ if versiontuple(installed_version) < versiontuple(supported_version):
40
+ raise Exception(f'Please install torch {supported_version} or greater ({installed_version} installed)')
41
+
42
+ model_dir = os.path.join(os.path.dirname(__file__), 'files')
43
+ if onnx:
44
+ model = OnnxWrapper(os.path.join(model_dir, 'silero_vad.onnx'), force_onnx_cpu)
45
+ else:
46
+ model = init_jit_model(os.path.join(model_dir, 'silero_vad.jit'))
47
+ utils = (get_speech_timestamps,
48
+ save_audio,
49
+ read_audio,
50
+ VADIterator,
51
+ collect_chunks)
52
+
53
+ return model, utils
54
+
55
+
56
+ def silero_number_detector(onnx=False, force_onnx_cpu=False):
57
+ """Silero Number Detector
58
+ Returns a model with a set of utils
59
+ Please see https://github.com/snakers4/silero-vad for usage examples
60
+ """
61
+ raise NotImplementedError('This model has been deprecated and is not supported anymore.')
62
+ if onnx:
63
+ url = 'https://models.silero.ai/vad_models/number_detector.onnx'
64
+ else:
65
+ url = 'https://models.silero.ai/vad_models/number_detector.jit'
66
+ model = Validator(url, force_onnx_cpu)
67
+ utils = (get_number_ts,
68
+ save_audio,
69
+ read_audio,
70
+ collect_chunks,
71
+ drop_chunks)
72
+
73
+ return model, utils
74
+
75
+
76
+ def silero_lang_detector(onnx=False, force_onnx_cpu=False):
77
+ """Silero Language Classifier
78
+ Returns a model with a set of utils
79
+ Please see https://github.com/snakers4/silero-vad for usage examples
80
+ """
81
+ raise NotImplementedError('This model has been deprecated and is not supported anymore.')
82
+ if onnx:
83
+ url = 'https://models.silero.ai/vad_models/number_detector.onnx'
84
+ else:
85
+ url = 'https://models.silero.ai/vad_models/number_detector.jit'
86
+ model = Validator(url, force_onnx_cpu)
87
+ utils = (get_language,
88
+ read_audio)
89
+
90
+ return model, utils
91
+
92
+
93
+ def silero_lang_detector_95(onnx=False, force_onnx_cpu=False):
94
+ """Silero Language Classifier (95 languages)
95
+ Returns a model with a set of utils
96
+ Please see https://github.com/snakers4/silero-vad for usage examples
97
+ """
98
+ raise NotImplementedError('This model has been deprecated and is not supported anymore.')
99
+ if onnx:
100
+ url = 'https://models.silero.ai/vad_models/lang_classifier_95.onnx'
101
+ else:
102
+ url = 'https://models.silero.ai/vad_models/lang_classifier_95.jit'
103
+ model = Validator(url, force_onnx_cpu)
104
+
105
+ model_dir = os.path.join(os.path.dirname(__file__), 'files')
106
+ with open(os.path.join(model_dir, 'lang_dict_95.json'), 'r') as f:
107
+ lang_dict = json.load(f)
108
+
109
+ with open(os.path.join(model_dir, 'lang_group_dict_95.json'), 'r') as f:
110
+ lang_group_dict = json.load(f)
111
+
112
+ utils = (get_language_and_group, read_audio)
113
+
114
+ return model, lang_dict, lang_group_dict, utils
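Because `silero_vad` above resolves its model files relative to `hubconf.py`, the entry point can be loaded straight from a local checkout instead of from GitHub. A minimal sketch (the checkout path is a placeholder):

```
import torch

# source='local' makes torch.hub use this checkout's hubconf.py directly
model, utils = torch.hub.load(repo_or_dir='path/to/snakers4_silero-vad_master',
                              model='silero_vad',
                              source='local',
                              onnx=False)
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils
```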
snakers4_silero-vad_master/silero-vad.ipynb ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "heading_collapsed": true,
7
+ "id": "62A6F_072Fwq"
8
+ },
9
+ "source": [
10
+ "## Install Dependencies"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {
17
+ "hidden": true,
18
+ "id": "5w5AkskZ2Fwr"
19
+ },
20
+ "outputs": [],
21
+ "source": [
22
+ "#@title Install and Import Dependencies\n",
23
+ "\n",
24
+ "# this assumes that you have a relevant version of PyTorch installed\n",
25
+ "!pip install -q torchaudio\n",
26
+ "\n",
27
+ "SAMPLING_RATE = 16000\n",
28
+ "\n",
29
+ "import torch\n",
30
+ "torch.set_num_threads(1)\n",
31
+ "\n",
32
+ "from IPython.display import Audio\n",
33
+ "from pprint import pprint\n",
34
+ "# download example\n",
35
+ "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "metadata": {
42
+ "id": "pSifus5IilRp"
43
+ },
44
+ "outputs": [],
45
+ "source": [
46
+ "USE_ONNX = False # change this to True if you want to test onnx model\n",
47
+ "if USE_ONNX:\n",
48
+ " !pip install -q onnxruntime\n",
49
+ " \n",
50
+ "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
51
+ " model='silero_vad',\n",
52
+ " force_reload=True,\n",
53
+ " onnx=USE_ONNX)\n",
54
+ "\n",
55
+ "(get_speech_timestamps,\n",
56
+ " save_audio,\n",
57
+ " read_audio,\n",
58
+ " VADIterator,\n",
59
+ " collect_chunks) = utils"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "markdown",
64
+ "metadata": {
65
+ "id": "fXbbaUO3jsrw"
66
+ },
67
+ "source": [
68
+ "## Full Audio"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "markdown",
73
+ "metadata": {
74
+ "id": "RAfJPb_a-Auj"
75
+ },
76
+ "source": [
77
+ "**Speech timestapms from full audio**"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "metadata": {
84
+ "id": "aI_eydBPjsrx"
85
+ },
86
+ "outputs": [],
87
+ "source": [
88
+ "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
89
+ "# get speech timestamps from full audio file\n",
90
+ "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)\n",
91
+ "pprint(speech_timestamps)"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "metadata": {
98
+ "id": "OuEobLchjsry"
99
+ },
100
+ "outputs": [],
101
+ "source": [
102
+ "# merge all speech chunks to one audio\n",
103
+ "save_audio('only_speech.wav',\n",
104
+ " collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE) \n",
105
+ "Audio('only_speech.wav')"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "markdown",
110
+ "metadata": {
111
+ "id": "iDKQbVr8jsry"
112
+ },
113
+ "source": [
114
+ "## Stream imitation example"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {
121
+ "id": "q-lql_2Wjsry"
122
+ },
123
+ "outputs": [],
124
+ "source": [
125
+ "## using VADIterator class\n",
126
+ "\n",
127
+ "vad_iterator = VADIterator(model)\n",
128
+ "wav = read_audio(f'en_example.wav', sampling_rate=SAMPLING_RATE)\n",
129
+ "\n",
130
+ "window_size_samples = 1536 # number of samples in a single audio chunk\n",
131
+ "for i in range(0, len(wav), window_size_samples):\n",
132
+ " chunk = wav[i: i+ window_size_samples]\n",
133
+ " if len(chunk) < window_size_samples:\n",
134
+ " break\n",
135
+ " speech_dict = vad_iterator(chunk, return_seconds=True)\n",
136
+ " if speech_dict:\n",
137
+ " print(speech_dict, end=' ')\n",
138
+ "vad_iterator.reset_states() # reset model states after each audio"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "metadata": {
145
+ "id": "BX3UgwwB2Fwv"
146
+ },
147
+ "outputs": [],
148
+ "source": [
149
+ "## just probabilities\n",
150
+ "\n",
151
+ "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
152
+ "speech_probs = []\n",
153
+ "window_size_samples = 1536\n",
154
+ "for i in range(0, len(wav), window_size_samples):\n",
155
+ " chunk = wav[i: i+ window_size_samples]\n",
156
+ " if len(chunk) < window_size_samples:\n",
157
+ " break\n",
158
+ " speech_prob = model(chunk, SAMPLING_RATE).item()\n",
159
+ " speech_probs.append(speech_prob)\n",
160
+ "vad_iterator.reset_states() # reset model states after each audio\n",
161
+ "\n",
162
+ "print(speech_probs[:10]) # first 10 chunks predicts"
163
+ ]
164
+ }
165
+ ],
166
+ "metadata": {
167
+ "colab": {
168
+ "name": "silero-vad.ipynb",
169
+ "provenance": []
170
+ },
171
+ "kernelspec": {
172
+ "display_name": "Python 3",
173
+ "language": "python",
174
+ "name": "python3"
175
+ },
176
+ "language_info": {
177
+ "codemirror_mode": {
178
+ "name": "ipython",
179
+ "version": 3
180
+ },
181
+ "file_extension": ".py",
182
+ "mimetype": "text/x-python",
183
+ "name": "python",
184
+ "nbconvert_exporter": "python",
185
+ "pygments_lexer": "ipython3",
186
+ "version": "3.8.8"
187
+ },
188
+ "toc": {
189
+ "base_numbering": 1,
190
+ "nav_menu": {},
191
+ "number_sections": true,
192
+ "sideBar": true,
193
+ "skip_h1_title": false,
194
+ "title_cell": "Table of Contents",
195
+ "title_sidebar": "Contents",
196
+ "toc_cell": false,
197
+ "toc_position": {},
198
+ "toc_section_display": true,
199
+ "toc_window_display": false
200
+ }
201
+ },
202
+ "nbformat": 4,
203
+ "nbformat_minor": 0
204
+ }
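For orientation, the `window_size_samples = 1536` used in the streaming cells above corresponds to 96 ms of audio per chunk at the notebook's 16 kHz sampling rate:

```
SAMPLING_RATE = 16000
window_size_samples = 1536
print(1000 * window_size_samples / SAMPLING_RATE)  # 96.0 ms per chunk
```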
snakers4_silero-vad_master/utils_vad.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ from typing import Callable, List
4
+ import torch.nn.functional as F
5
+ import warnings
6
+
7
+ languages = ['ru', 'en', 'de', 'es']
8
+
9
+
10
+ class OnnxWrapper():
11
+
12
+ def __init__(self, path, force_onnx_cpu=False):
13
+ import numpy as np
14
+ global np
15
+ import onnxruntime
16
+
17
+ opts = onnxruntime.SessionOptions()
18
+ opts.inter_op_num_threads = 1
19
+ opts.intra_op_num_threads = 1
20
+
21
+ if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
22
+ self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
23
+ else:
24
+ self.session = onnxruntime.InferenceSession(path, sess_options=opts)
25
+
26
+ self.reset_states()
27
+ self.sample_rates = [8000, 16000]
28
+
29
+ def _validate_input(self, x, sr: int):
30
+ if x.dim() == 1:
31
+ x = x.unsqueeze(0)
32
+ if x.dim() > 2:
33
+ raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}")
34
+
35
+ if sr != 16000 and (sr % 16000 == 0):
36
+ step = sr // 16000
37
+ x = x[:,::step]
38
+ sr = 16000
39
+
40
+ if sr not in self.sample_rates:
41
+ raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)")
42
+
43
+ if sr / x.shape[1] > 31.25:
44
+ raise ValueError("Input audio chunk is too short")
45
+
46
+ return x, sr
47
+
48
+ def reset_states(self, batch_size=1):
49
+ self._h = np.zeros((2, batch_size, 64)).astype('float32')
50
+ self._c = np.zeros((2, batch_size, 64)).astype('float32')
51
+ self._last_sr = 0
52
+ self._last_batch_size = 0
53
+
54
+ def __call__(self, x, sr: int):
55
+
56
+ x, sr = self._validate_input(x, sr)
57
+ batch_size = x.shape[0]
58
+
59
+ if not self._last_batch_size:
60
+ self.reset_states(batch_size)
61
+ if (self._last_sr) and (self._last_sr != sr):
62
+ self.reset_states(batch_size)
63
+ if (self._last_batch_size) and (self._last_batch_size != batch_size):
64
+ self.reset_states(batch_size)
65
+
66
+ if sr in [8000, 16000]:
67
+ ort_inputs = {'input': x.numpy(), 'h': self._h, 'c': self._c, 'sr': np.array(sr, dtype='int64')}
68
+ ort_outs = self.session.run(None, ort_inputs)
69
+ out, self._h, self._c = ort_outs
70
+ else:
71
+ raise ValueError(f'Unsupported sampling rate: {sr}')
72
+
73
+ self._last_sr = sr
74
+ self._last_batch_size = batch_size
75
+
76
+ out = torch.tensor(out)
77
+ return out
78
+
79
+ def audio_forward(self, x, sr: int, num_samples: int = 512):
80
+ outs = []
81
+ x, sr = self._validate_input(x, sr)
82
+
83
+ if x.shape[1] % num_samples:
84
+ pad_num = num_samples - (x.shape[1] % num_samples)
85
+ x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0)
86
+
87
+ self.reset_states(x.shape[0])
88
+ for i in range(0, x.shape[1], num_samples):
89
+ wavs_batch = x[:, i:i+num_samples]
90
+ out_chunk = self.__call__(wavs_batch, sr)
91
+ outs.append(out_chunk)
92
+
93
+ stacked = torch.cat(outs, dim=1)
94
+ return stacked.cpu()
95
+
96
+
97
+ class Validator():
98
+ def __init__(self, url, force_onnx_cpu):
99
+ self.onnx = True if url.endswith('.onnx') else False
100
+ torch.hub.download_url_to_file(url, 'inf.model')
101
+ if self.onnx:
102
+ import onnxruntime
103
+ if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
104
+ self.model = onnxruntime.InferenceSession('inf.model', providers=['CPUExecutionProvider'])
105
+ else:
106
+ self.model = onnxruntime.InferenceSession('inf.model')
107
+ else:
108
+ self.model = init_jit_model(model_path='inf.model')
109
+
110
+ def __call__(self, inputs: torch.Tensor):
111
+ with torch.no_grad():
112
+ if self.onnx:
113
+ ort_inputs = {'input': inputs.cpu().numpy()}
114
+ outs = self.model.run(None, ort_inputs)
115
+ outs = [torch.Tensor(x) for x in outs]
116
+ else:
117
+ outs = self.model(inputs)
118
+
119
+ return outs
120
+
121
+
122
+ def read_audio(path: str,
123
+ sampling_rate: int = 16000):
124
+
125
+ sox_backends = set(['sox', 'sox_io'])
126
+ audio_backends = torchaudio.list_audio_backends()
127
+
128
+ if len(sox_backends.intersection(audio_backends)) > 0:
129
+ effects = [
130
+ ['channels', '1'],
131
+ ['rate', str(sampling_rate)]
132
+ ]
133
+
134
+ wav, sr = torchaudio.sox_effects.apply_effects_file(path, effects=effects)
135
+ else:
136
+ wav, sr = torchaudio.load(path)
137
+
138
+ if wav.size(0) > 1:
139
+ wav = wav.mean(dim=0, keepdim=True)
140
+
141
+ if sr != sampling_rate:
142
+ transform = torchaudio.transforms.Resample(orig_freq=sr,
143
+ new_freq=sampling_rate)
144
+ wav = transform(wav)
145
+ sr = sampling_rate
146
+
147
+ assert sr == sampling_rate
148
+ return wav.squeeze(0)
149
+
150
+
151
+ def save_audio(path: str,
152
+ tensor: torch.Tensor,
153
+ sampling_rate: int = 16000):
154
+ torchaudio.save(path, tensor.unsqueeze(0), sampling_rate, bits_per_sample=16)
155
+
156
+
157
+ def init_jit_model(model_path: str,
158
+ device=torch.device('cpu')):
159
+ model = torch.jit.load(model_path, map_location=device)
160
+ model.eval()
161
+ return model
162
+
163
+
164
+ def make_visualization(probs, step):
165
+ import pandas as pd
166
+ pd.DataFrame({'probs': probs},
167
+ index=[x * step for x in range(len(probs))]).plot(figsize=(16, 8),
168
+ kind='area', ylim=[0, 1.05], xlim=[0, len(probs) * step],
169
+ xlabel='seconds',
170
+ ylabel='speech probability',
171
+ colormap='tab20')
172
+
173
+
174
+ @torch.no_grad()
175
+ def get_speech_timestamps(audio: torch.Tensor,
176
+ model,
177
+ threshold: float = 0.5,
178
+ sampling_rate: int = 16000,
179
+ min_speech_duration_ms: int = 250,
180
+ max_speech_duration_s: float = float('inf'),
181
+ min_silence_duration_ms: int = 100,
182
+ window_size_samples: int = 512,
183
+ speech_pad_ms: int = 30,
184
+ return_seconds: bool = False,
185
+ visualize_probs: bool = False,
186
+ progress_tracking_callback: Callable[[float], None] = None):
187
+
188
+ """
189
+ This method is used for splitting long audios into speech chunks using silero VAD
190
+
191
+ Parameters
192
+ ----------
193
+ audio: torch.Tensor, one dimensional
194
+ One dimensional float torch.Tensor; other types are cast to torch.Tensor if possible
195
+
196
+ model: preloaded .jit silero VAD model
197
+
198
+ threshold: float (default - 0.5)
199
+ Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
200
+ It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
201
+
202
+ sampling_rate: int (default - 16000)
203
+ Currently silero VAD models support 8000 and 16000 sample rates
204
+
205
+ min_speech_duration_ms: int (default - 250 milliseconds)
206
+ Final speech chunks shorter than min_speech_duration_ms are thrown out
207
+
208
+ max_speech_duration_s: int (default - inf)
209
+ Maximum duration of speech chunks in seconds
210
+ Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent agressive cutting.
211
+ Otherwise, they will be split aggressively just before max_speech_duration_s.
212
+
213
+ min_silence_duration_ms: int (default - 100 milliseconds)
214
+ At the end of each speech chunk, wait for min_silence_duration_ms before separating it
215
+
216
+ window_size_samples: int (default - 512 samples)
217
+ Audio chunks of window_size_samples size are fed to the silero VAD model.
218
+ WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples for 8000 sample rate.
219
+ Values other than these may affect model performance!
220
+
221
+ speech_pad_ms: int (default - 30 milliseconds)
222
+ Final speech chunks are padded by speech_pad_ms on each side
223
+
224
+ return_seconds: bool (default - False)
225
+ whether to return timestamps in seconds (default - samples)
226
+
227
+ visualize_probs: bool (default - False)
228
+ whether to draw the speech probability plot or not
229
+
230
+ progress_tracking_callback: Callable[[float], None] (default - None)
231
+ callback function taking progress in percent as an argument
232
+
233
+ Returns
234
+ ----------
235
+ speeches: list of dicts
236
+ list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds)
237
+ """
238
+
239
+ if not torch.is_tensor(audio):
240
+ try:
241
+ audio = torch.Tensor(audio)
242
+ except Exception:
243
+ raise TypeError("Audio cannot be casted to tensor. Cast it manually")
244
+
245
+ if len(audio.shape) > 1:
246
+ for i in range(len(audio.shape)): # trying to squeeze empty dimensions
247
+ audio = audio.squeeze(0)
248
+ if len(audio.shape) > 1:
249
+ raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?")
250
+
251
+ if sampling_rate > 16000 and (sampling_rate % 16000 == 0):
252
+ step = sampling_rate // 16000
253
+ sampling_rate = 16000
254
+ audio = audio[::step]
255
+ warnings.warn('Sampling rate is a multiple of 16000, downsampling to 16000 manually!')
256
+ else:
257
+ step = 1
258
+
259
+ if sampling_rate == 8000 and window_size_samples > 768:
260
+ warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 768 for 8000 sample rate!')
261
+ if window_size_samples not in [256, 512, 768, 1024, 1536]:
262
+ warnings.warn('Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate')
263
+
264
+ model.reset_states()
265
+ min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
266
+ speech_pad_samples = sampling_rate * speech_pad_ms / 1000
267
+ max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples
268
+ min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
269
+ min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
270
+
271
+ audio_length_samples = len(audio)
272
+
273
+ speech_probs = []
274
+ for current_start_sample in range(0, audio_length_samples, window_size_samples):
275
+ chunk = audio[current_start_sample: current_start_sample + window_size_samples]
276
+ if len(chunk) < window_size_samples:
277
+ chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
278
+ speech_prob = model(chunk, sampling_rate).item()
279
+ speech_probs.append(speech_prob)
280
+ # caculate progress and seng it to callback function
281
+ progress = current_start_sample + window_size_samples
282
+ if progress > audio_length_samples:
283
+ progress = audio_length_samples
284
+ progress_percent = (progress / audio_length_samples) * 100
285
+ if progress_tracking_callback:
286
+ progress_tracking_callback(progress_percent)
287
+
+     triggered = False
+     speeches = []
+     current_speech = {}
+     neg_threshold = threshold - 0.15
+     temp_end = 0  # to save potential segment end (and tolerate some silence)
+     prev_end = next_start = 0  # to save potential segment limits in case of maximum segment size reached
+
+     for i, speech_prob in enumerate(speech_probs):
+         if (speech_prob >= threshold) and temp_end:
+             temp_end = 0
+             if next_start < prev_end:
+                 next_start = window_size_samples * i
+
+         if (speech_prob >= threshold) and not triggered:
+             triggered = True
+             current_speech['start'] = window_size_samples * i
+             continue
+
+         if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples:
+             if prev_end:
+                 current_speech['end'] = prev_end
+                 speeches.append(current_speech)
+                 current_speech = {}
+                 if next_start < prev_end:  # previously reached silence (< neg_thres) and is still not speech (< thres)
+                     triggered = False
+                 else:
+                     current_speech['start'] = next_start
+                 prev_end = next_start = temp_end = 0
+             else:
+                 current_speech['end'] = window_size_samples * i
+                 speeches.append(current_speech)
+                 current_speech = {}
+                 prev_end = next_start = temp_end = 0
+                 triggered = False
+                 continue
+
+         if (speech_prob < neg_threshold) and triggered:
+             if not temp_end:
+                 temp_end = window_size_samples * i
+             if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech:  # condition to avoid cutting in very short silence
+                 prev_end = temp_end
+             if (window_size_samples * i) - temp_end < min_silence_samples:
+                 continue
+             else:
+                 current_speech['end'] = temp_end
+                 if (current_speech['end'] - current_speech['start']) > min_speech_samples:
+                     speeches.append(current_speech)
+                 current_speech = {}
+                 prev_end = next_start = temp_end = 0
+                 triggered = False
+                 continue
+
+     if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
+         current_speech['end'] = audio_length_samples
+         speeches.append(current_speech)
+
+     # pad each segment and split the inter-segment silence between neighbours
+     for i, speech in enumerate(speeches):
+         if i == 0:
+             speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
+         if i != len(speeches) - 1:
+             silence_duration = speeches[i+1]['start'] - speech['end']
+             if silence_duration < 2 * speech_pad_samples:
+                 speech['end'] += int(silence_duration // 2)
+                 speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - silence_duration // 2))
+             else:
+                 speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
+                 speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - speech_pad_samples))
+         else:
+             speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))
+
+     if return_seconds:
+         for speech_dict in speeches:
+             speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
+             speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)
+     elif step > 1:
+         for speech_dict in speeches:
+             speech_dict['start'] *= step
+             speech_dict['end'] *= step
+
+     if visualize_probs:
+         make_visualization(speech_probs, window_size_samples / sampling_rate)
+
+     return speeches
+
+
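For orientation, a minimal offline usage sketch of the function above (illustrative, not part of utils_vad.py; it assumes the silero_vad entry point from this repo's hubconf.py and a 16 kHz mono recording):

    import torch

    # load the model plus the helper tuple exposed by hubconf.py
    model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
    (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

    wav = read_audio('example.wav', sampling_rate=16000)  # hypothetical input file
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
    # result: a list of dicts like {'start': <sample>, 'end': <sample>}
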
+ def get_number_ts(wav: torch.Tensor,
+                   model,
+                   model_stride=8,
+                   hop_length=160,
+                   sample_rate=16000):
+     wav = torch.unsqueeze(wav, dim=0)
+     perframe_logits = model(wav)[0]
+     perframe_preds = torch.argmax(torch.softmax(perframe_logits, dim=1), dim=1).squeeze()  # (1, num_frames_strided)
+     extended_preds = []
+     for i in perframe_preds:
+         extended_preds.extend([i.item()] * model_stride)
+     # len(extended_preds) is *num_frames_real*; for each frame of audio we know if it has a number in it
+     triggered = False
+     timings = []
+     cur_timing = {}
+     for i, pred in enumerate(extended_preds):
+         if pred == 1:
+             if not triggered:
+                 cur_timing['start'] = int((i * hop_length) / (sample_rate / 1000))
+                 triggered = True
+         elif pred == 0:
+             if triggered:
+                 cur_timing['end'] = int((i * hop_length) / (sample_rate / 1000))
+                 timings.append(cur_timing)
+                 cur_timing = {}
+                 triggered = False
+     if cur_timing:
+         cur_timing['end'] = int(wav.shape[-1] / (sample_rate / 1000))  # use the sample count; len(wav) is 1 after unsqueeze
+         timings.append(cur_timing)
+     return timings
+
+
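A hedged usage sketch for get_number_ts; the entry point name and the order of the returned helper tuple below are assumptions based on this repo's hubconf.py, so verify them there:

    import torch

    # assumed entry point and tuple order -- check hubconf.py
    model, utils = torch.hub.load('snakers4/silero-vad', 'silero_number_detector')
    (get_number_ts, save_audio, read_audio, collect_chunks, drop_chunks) = utils

    wav = read_audio('example.wav', sampling_rate=16000)
    number_timestamps = get_number_ts(wav, model)
    # result: [{'start': <ms>, 'end': <ms>}, ...] -- timestamps are in milliseconds
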
+ def get_language(wav: torch.Tensor,
+                  model):
+     wav = torch.unsqueeze(wav, dim=0)
+     lang_logits = model(wav)[2]
+     lang_pred = torch.argmax(torch.softmax(lang_logits, dim=1), dim=1).item()  # from 0 to len(languages) - 1
+     assert lang_pred < len(languages)
+     return languages[lang_pred]
+
+
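A usage sketch for get_language (the entry point name and helper tuple are assumptions; the `languages` list it indexes into is defined earlier in this file):

    import torch

    # assumed entry point and tuple order -- check hubconf.py
    model, utils = torch.hub.load('snakers4/silero-vad', 'silero_lang_detector')
    get_language, read_audio = utils

    wav = read_audio('example.wav', sampling_rate=16000)
    print(get_language(wav, model))  # one label from the module-level `languages` list
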
+ def get_language_and_group(wav: torch.Tensor,
+                            model,
+                            lang_dict: dict,
+                            lang_group_dict: dict,
+                            top_n=1):
+     wav = torch.unsqueeze(wav, dim=0)
+     lang_logits, lang_group_logits = model(wav)
+
+     softm = torch.softmax(lang_logits, dim=1).squeeze()
+     softm_group = torch.softmax(lang_group_logits, dim=1).squeeze()
+
+     srtd = torch.argsort(softm, descending=True)
+     srtd_group = torch.argsort(softm_group, descending=True)
+
+     outs = []
+     outs_group = []
+     for i in range(top_n):
+         prob = round(softm[srtd[i]].item(), 2)
+         prob_group = round(softm_group[srtd_group[i]].item(), 2)
+         outs.append((lang_dict[str(srtd[i].item())], prob))
+         outs_group.append((lang_group_dict[str(srtd_group[i].item())], prob_group))
+
+     return outs, outs_group
+
+
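A usage sketch for get_language_and_group, assuming `model` is an already-loaded 95-language detector (see hubconf.py for how to obtain it) and `wav` is a 1-D 16 kHz tensor, e.g. from read_audio. The label dictionaries ship with this repo as files/lang_dict_95.json and files/lang_group_dict_95.json; loading them by hand as shown is an assumption about how the function is meant to be fed:

    import json

    with open('files/lang_dict_95.json') as f:
        lang_dict = json.load(f)
    with open('files/lang_group_dict_95.json') as f:
        lang_group_dict = json.load(f)

    langs, groups = get_language_and_group(wav, model, lang_dict, lang_group_dict, top_n=3)
    # two lists of (label, probability) tuples, sorted by descending probability
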
439
+ class VADIterator:
440
+ def __init__(self,
441
+ model,
442
+ threshold: float = 0.5,
443
+ sampling_rate: int = 16000,
444
+ min_silence_duration_ms: int = 100,
445
+ speech_pad_ms: int = 30
446
+ ):
447
+
448
+ """
449
+ Class for stream imitation
450
+
451
+ Parameters
452
+ ----------
453
+ model: preloaded .jit silero VAD model
454
+
455
+ threshold: float (default - 0.5)
456
+ Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
457
+ It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
458
+
459
+ sampling_rate: int (default - 16000)
460
+ Currently silero VAD models support 8000 and 16000 sample rates
461
+
462
+ min_silence_duration_ms: int (default - 100 milliseconds)
463
+ In the end of each speech chunk wait for min_silence_duration_ms before separating it
464
+
465
+ speech_pad_ms: int (default - 30 milliseconds)
466
+ Final speech chunks are padded by speech_pad_ms each side
467
+ """
468
+
469
+ self.model = model
470
+ self.threshold = threshold
471
+ self.sampling_rate = sampling_rate
472
+
473
+ if sampling_rate not in [8000, 16000]:
474
+ raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
475
+
476
+ self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
477
+ self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
478
+ self.reset_states()
479
+
480
+ def reset_states(self):
481
+
482
+ self.model.reset_states()
483
+ self.triggered = False
484
+ self.temp_end = 0
485
+ self.current_sample = 0
486
+
487
+ @torch.no_grad()
488
+ def __call__(self, x, return_seconds=False):
489
+ """
490
+ x: torch.Tensor
491
+ audio chunk (see examples in repo)
492
+
493
+ return_seconds: bool (default - False)
494
+ whether return timestamps in seconds (default - samples)
495
+ """
496
+
497
+ if not torch.is_tensor(x):
498
+ try:
499
+ x = torch.Tensor(x)
500
+ except:
501
+ raise TypeError("Audio cannot be casted to tensor. Cast it manually")
502
+
503
+ window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
504
+ self.current_sample += window_size_samples
505
+
506
+ speech_prob = self.model(x, self.sampling_rate).item()
507
+
508
+ if (speech_prob >= self.threshold) and self.temp_end:
509
+ self.temp_end = 0
510
+
511
+ if (speech_prob >= self.threshold) and not self.triggered:
512
+ self.triggered = True
513
+ speech_start = self.current_sample - self.speech_pad_samples - window_size_samples
514
+ return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
515
+
516
+ if (speech_prob < self.threshold - 0.15) and self.triggered:
517
+ if not self.temp_end:
518
+ self.temp_end = self.current_sample
519
+ if self.current_sample - self.temp_end < self.min_silence_samples:
520
+ return None
521
+ else:
522
+ speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
523
+ self.temp_end = 0
524
+ self.triggered = False
525
+ return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
526
+
527
+ return None
528
+
529
+
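A minimal streaming sketch for VADIterator (illustrative; the fixed chunk size is an assumption -- feed one of the window sizes the model expects, e.g. 512/1024/1536 samples at 16 kHz):

    import torch

    model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
    (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

    vad_iterator = VADIterator(model, sampling_rate=16000)
    wav = read_audio('example.wav', sampling_rate=16000)

    window_size_samples = 1536  # assumed chunk size for 16 kHz audio
    for i in range(0, len(wav), window_size_samples):
        chunk = wav[i: i + window_size_samples]
        if len(chunk) < window_size_samples:
            break
        speech_dict = vad_iterator(chunk, return_seconds=True)
        if speech_dict:
            print(speech_dict)  # {'start': ...} when speech opens, {'end': ...} when it closes
    vad_iterator.reset_states()  # reset internal model state between audio streams
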
+ def collect_chunks(tss: List[dict],
+                    wav: torch.Tensor):
+     # concatenate only the speech segments given by tss
+     chunks = []
+     for i in tss:
+         chunks.append(wav[i['start']: i['end']])
+     return torch.cat(chunks)
+
+
+ def drop_chunks(tss: List[dict],
+                 wav: torch.Tensor):
+     # keep everything except the speech segments given by tss
+     chunks = []
+     cur_start = 0
+     for i in tss:
+         chunks.append(wav[cur_start: i['start']])
+         cur_start = i['end']
+     chunks.append(wav[cur_start:])  # keep the tail after the last speech segment
+     return torch.cat(chunks)
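
Finally, a usage sketch combining the two helpers above with get_speech_timestamps (illustrative; note that both helpers raise on an empty timestamp list, since torch.cat of an empty list is an error):

    import torch

    model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
    (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

    wav = read_audio('example.wav', sampling_rate=16000)
    tss = get_speech_timestamps(wav, model, sampling_rate=16000)

    only_speech = collect_chunks(tss, wav)  # speech segments, concatenated
    no_speech = drop_chunks(tss, wav)       # everything outside the speech segments
    save_audio('only_speech.wav', only_speech, sampling_rate=16000)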