Commit ·
d21d362
0
Parent(s):
Duplicate from MoYoYoTech/vad_cpp
Browse filesCo-authored-by: chenxiang <xianglarry@users.noreply.huggingface.co>
- .gitattributes +35 -0
- CMakeLists.txt +74 -0
- README.md +84 -0
- bin/CMakeLists.txt +26 -0
- bin/main.cpp +71 -0
- bin/main_silero.cpp +59 -0
- bin/test_main.cpp +87 -0
- bin/test_silero.cpp +80 -0
- bin/wav.h +237 -0
- python/__inip__.py +0 -0
- python/__pycache__/processing.cpython-312.pyc +0 -0
- python/helpers/__init__.py +0 -0
- python/helpers/__pycache__/__init__.cpython-312.pyc +0 -0
- python/helpers/__pycache__/vadprocessor.cpython-312.pyc +0 -0
- python/helpers/vadprocessor.py +603 -0
- python/pipelines/__init__.py +3 -0
- python/pipelines/__pycache__/__init__.cpython-312.pyc +0 -0
- python/pipelines/__pycache__/base.cpython-312.pyc +0 -0
- python/pipelines/__pycache__/pipe_vad.cpython-312.pyc +0 -0
- python/pipelines/base.py +71 -0
- python/pipelines/pipe_vad.py +96 -0
- python/processing.py +62 -0
- reference/.DS_Store +0 -0
- reference/cpp/onnx_wrapper.cpp +185 -0
- reference/cpp/onnx_wrapper.h +70 -0
- reference/cpp/vad_iterator_onnx.cpp +104 -0
- reference/cpp/vad_iterator_onnx.h +38 -0
- reference/python/__pycache__/audio_utils.cpython-312.pyc +0 -0
- reference/python/audio_utils.py +54 -0
- reference/python/test_vad.ipynb +295 -0
- silero_vad_onnx/CMakeLists.txt +39 -0
- silero_vad_onnx/time_stamp.cpp +53 -0
- silero_vad_onnx/time_stamp.h +26 -0
- silero_vad_onnx/vad_iterator.cpp +196 -0
- silero_vad_onnx/vad_iterator.h +87 -0
- silero_vad_onnx/wav.h +237 -0
- vad_onnx/CMakeLists.txt +40 -0
- vad_onnx/vad_onnx.cpp +245 -0
- vad_onnx/vad_onnx.h +107 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
CMakeLists.txt
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cmake_minimum_required(VERSION 3.16)
project(VadOnnx)

# Build everything as strict ISO C++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Optional: report the host byte order (relevant for raw audio handling).
include(TestBigEndian)
test_big_endian(BIG_ENDIAN)
if(BIG_ENDIAN)
    message("Big endian system")
else()
    message("Little endian system")
endif()

# The ONNX Runtime install location must be supplied by the user.
if(NOT DEFINED ONNXRUNTIME_DIR OR NOT EXISTS ${ONNXRUNTIME_DIR})
    message(FATAL_ERROR "Please specify ONNXRUNTIME_DIR when configuring, e.g. cmake -DONNXRUNTIME_DIR=/path/to/onnxruntime ..")
endif()

# Resolve libsndfile through pkg-config (exposes PkgConfig::SNDFILE).
find_package(PkgConfig REQUIRED)
pkg_check_modules(SNDFILE REQUIRED IMPORTED_TARGET sndfile)

# Wrapper modeled on https://github.com/snakers4/silero-vad examples/cpp
add_subdirectory(silero_vad_onnx)

# Wrapper modeled on moyoyo/translator python/helpers/vadprocessor.py
add_subdirectory(vad_onnx)

# Test executables.
add_subdirectory(bin)
README.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
## 简介
|
| 6 |
+
这是一个对 silero_vad (https://github.com/snakers4/silero-vad) 的简易封装,便于开发。
|
| 7 |
+
|
| 8 |
+
## 目录简介
|
| 9 |
+
|
| 10 |
+
## 环境安装
|
| 11 |
+
### 系统环境
|
| 12 |
+
|
| 13 |
+
> 1. 在macOS 14.5 版本已验证执行OK。
|
| 14 |
+
> 2. mac上安装 onnxruntime(brew 安装未使能 CoreMl, 需下载源码, 手动编译使能, 并安装)
|
| 15 |
+
```bash
|
| 16 |
+
brew install onnxruntime
|
| 17 |
+
```
|
| 18 |
+
> 3. 下载onnxruntime源码, 手动编译使能, 并安装
|
| 19 |
+
```bash
|
| 20 |
+
brew install cmake protobuf python # 可选
|
| 21 |
+
git clone https://github.com/microsoft/onnxruntime
|
| 22 |
+
cd onnxruntime
|
| 23 |
+
./build.sh --config Release --enable_coreml
|
| 24 |
+
# 或者 ./build.sh --config Release --enable_coreml --build_wheel --parallel
|
| 25 |
+
sudo ./install_to_system.sh # 如果不安装,库目录build/Release,头文件目录 build/Release/include/
|
| 26 |
+
```
|
| 27 |
+
## 目录简介
|
| 28 |
+
|
| 29 |
+
```
|
| 30 |
+
.
|
| 31 |
+
├── README.md
|
| 32 |
+
├── bin/
|
| 33 |
+
│ ├──main_silero.cpp // 参照 silero_vad中cpp的example封装代码进行测试
|
| 34 |
+
│ ├── main.cpp // 参照 translator中FixedVADIterator封装代码进行测试
|
| 35 |
+
│ ├── wav.h // 定义读取 wav 文件类
|
| 36 |
+
│ └── ...
|
| 37 |
+
├── python/
|
| 38 |
+
│ ├── processing.py // translator中FixedVADIterator的python脚本
|
| 39 |
+
│ └── ...
|
| 40 |
+
├── reference/ // python、cpp参考代码
|
| 41 |
+
├── silero_vad_onnx/ // 参照 silero_vad中cpp的封装
|
| 42 |
+
│ ├── time_stamp.cpp
|
| 43 |
+
│ ├── time_stamp.h
|
| 44 |
+
│ ├── vad_iterator.cpp
|
| 45 |
+
│ ├── vad_iterator.h
|
| 46 |
+
│ └── ...
|
| 47 |
+
├── vad_onnx/ // 参照 translator中FixedVADIterator封装
|
| 48 |
+
│ ├── vad_onnx.cpp
|
| 49 |
+
│ ├── vad_onnx.h
|
| 50 |
+
│ └── ...
|
| 51 |
+
└── ...
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## 编译
|
| 55 |
+
```bash
|
| 56 |
+
git clone https://huggingface.co/MoYoYoTech/vad_cpp
|
| 57 |
+
cd vad_cpp
|
| 58 |
+
mkdir build
|
| 59 |
+
cd build
|
| 60 |
+
cmake .. -DONNXRUNTIME_DIR=/opt/homebrew/Cellar/onnxruntime/1.21.1 # 或者指定源码编译后的路径,包括 include 和 lib目录
|
| 61 |
+
make
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## 运行&使用
|
| 65 |
+
### 接口调用和使用参考 main_silero.cpp 和 main.cpp
|
| 66 |
+
```bash
|
| 67 |
+
cd bin
|
| 68 |
+
# silero_vad_onnx.dylib 测试程序
|
| 69 |
+
./main_silero "/Users/.../Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx" "/Users/xxx/zh.wav"
|
| 70 |
+
# vad_onnx.dylib 测试程序
|
| 71 |
+
./main "/Users/.../Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx" "/Users/xxx/zh.wav"
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
### translator中FixedVADIterator的python测试脚本
|
| 75 |
+
```bash
|
| 76 |
+
cd vad_cpp
|
| 77 |
+
# python/processing.py中配置 wav_path ; 在python/helpers/vadprocessor.py中配置 VAD_MODEL_PATH
|
| 78 |
+
python -m python.processing
|
| 79 |
+
# 结果显示
|
| 80 |
+
....
|
| 81 |
+
935936 ->>> {'start': 935456} -> {'start': 5664}
|
| 82 |
+
984576 ->>> {'end': 983008} -> {'start': 5664, 'end': 983008}
|
| 83 |
+
strat: 5664 end: 983008
|
| 84 |
+
```
|
bin/CMakeLists.txt
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
set(CMAKE_CXX_STANDARD 17)

# Force UTF-8 source/execution charsets on MSVC so non-ASCII comments compile.
if(WIN32)
    add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/execution-charset:utf-8>")
    add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/source-charset:utf-8>")
endif()

# Locate ONNX Runtime (provides the onnxruntime::onnxruntime imported target).
find_package(onnxruntime REQUIRED)
# Also add the ONNX Runtime headers explicitly.
include_directories(${ONNXRUNTIME_DIR}/include)

# Test program for the vad_onnx library (FixedVADIterator port).
include_directories(${PROJECT_SOURCE_DIR}/vad_onnx)
add_executable(main "main.cpp")
if(UNIX AND NOT APPLE)
    # Keep libonnxruntime in DT_NEEDED even if the linker deems it unused.
    target_link_options(main PRIVATE "-Wl,--no-as-needed")
endif()
target_link_libraries(main PUBLIC vad_onnx onnxruntime::onnxruntime PkgConfig::SNDFILE)


# Test program for the silero_vad_onnx library (silero-vad cpp example port).
include_directories(${PROJECT_SOURCE_DIR}/silero_vad_onnx)
add_executable(main_silero "main_silero.cpp")
if(UNIX AND NOT APPLE)
    # BUG FIX: this previously applied the link option to "main" again
    # instead of "main_silero" (copy-paste error).
    target_link_options(main_silero PRIVATE "-Wl,--no-as-needed")
endif()
target_link_libraries(main_silero PUBLIC silero_vad_onnx onnxruntime::onnxruntime)
bin/main.cpp
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "vad_onnx.h"
#include <iostream>
#include <sndfile.h>
#include <vector>
#include <map>
#include <string>

// Demo driver for the VadOnnx wrapper (FixedVADIterator port): loads a WAV
// file with libsndfile and exercises the three inference entry points
// (single 512-sample window, per-window probabilities, start/end timestamps).
int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_file_absolute_path>" << std::endl;
        return 1;
    }

    // Model and audio paths come from the command line.
    std::string model_path = argv[1];
    std::string wav_path = argv[2];

    // Load the audio file.
    SF_INFO sf_info;
    SNDFILE* file = sf_open(wav_path.c_str(), SFM_READ, &sf_info);
    if (!file) {
        // BUG FIX: the original never checked sf_open's result and went on
        // to read from a null handle when the file could not be opened.
        std::cerr << "Error: Unable to open audio file: " << wav_path << std::endl;
        return 1;
    }

    int samplerate = sf_info.samplerate;
    int channels = sf_info.channels;
    int frames = sf_info.frames;

    std::vector<float> audio(static_cast<size_t>(frames) * channels);
    sf_readf_float(file, audio.data(), sf_info.frames);
    sf_close(file);

    if (audio.size() < 512) {
        // BUG FIX: the original unconditionally sliced the first 512 samples,
        // which is undefined behaviour for files shorter than 512 samples.
        std::cerr << "Error: audio file has fewer than 512 samples" << std::endl;
        return 1;
    }

    // One model window (512 samples) for the single-frame inference API.
    std::vector<float> audio_512frames(audio.begin(), audio.begin() + 512);

    try {
        VadOnnx vad_model = VadOnnx(model_path);

        // Single 512-sample window -> speech probability.
        float result_512 = vad_model.forward_infer(audio_512frames);
        std::cout << "result_512 = " << result_512 << std::endl;

        // Whole-file detection: per-window probabilities.
        std::vector<float> result_1 = vad_model.vad_dectect(audio);
        if (!result_1.empty()) {
            std::cout << "result_1.size = " << result_1.size() << std::endl;
            for (size_t i = 0; i < 5 && i < result_1.size(); ++i) {
                std::cout << result_1[i] << ", ";
            }
            std::cout << "(only show 5)" << std::endl;
        }

        // Whole-file detection: start/end timestamps.
        std::map<std::string, double> result_map = vad_model.vad_dectect(audio, false);
        std::cerr << "result: " << std::endl;
        for (const auto& pair : result_map) {
            std::cout << pair.first << " : " << pair.second << std::endl;
        }

    } catch (const std::exception& ex) {
        std::cerr << "Error: " << ex.what() << std::endl;
    }

    // Header fields kept for reference; only `frames`/`channels` are used above.
    (void)samplerate;
    (void)channels;

    return 0;
}
bin/main_silero.cpp
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <iostream>
#include <vector>
#include <cmath>
#include <iomanip> // std::fixed, std::setprecision

// Project headers
#include "wav.h"          // wav::WavReader
#include "time_stamp.h"   // timestamp_t
#include "vad_iterator.h" // VadIterator

// Demo driver for the silero-vad C++ wrapper: decodes a 16 kHz mono PCM WAV,
// runs the VAD over the whole buffer and prints each detected speech segment
// in seconds (rounded to 0.1 s) and in raw sample offsets.
int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_file_absolute_path>" << std::endl;
        return 1;
    }

    // Command-line arguments: model path first, audio path second.
    const std::string model_path{argv[1]};
    const std::string wav_path{argv[2]};

    // Decode the WAV file (expects 16000 Hz, mono, PCM) into float samples.
    wav::WavReader wav_reader(wav_path);
    const size_t total = static_cast<size_t>(wav_reader.num_samples());
    std::vector<float> input_wav(total);
    const float* src = wav_reader.data();
    for (size_t idx = 0; idx < total; ++idx) {
        input_wav[idx] = static_cast<float>(src[idx]);
    }

    // Build the VAD iterator and feed it the whole signal.
    VadIterator vad(model_path);
    vad.process(input_wav);

    // Speech segments, expressed in sample offsets.
    const std::vector<timestamp_t> stamps = vad.get_speech_timestamps();

    // Report each segment (seconds are rounded to one decimal at 16 kHz).
    const float sample_rate_float = 16000.0f;
    for (const timestamp_t& st : stamps) {
        const float start_sec = std::rint((st.start / sample_rate_float) * 10.0f) / 10.0f;
        const float end_sec = std::rint((st.end / sample_rate_float) * 10.0f) / 10.0f;
        std::cout << "Speech detected from "
                  << std::fixed << std::setprecision(1) << start_sec
                  << " s to "
                  << std::fixed << std::setprecision(1) << end_sec
                  << " s"
                  << " [ " << st.start << " " << st.end <<" ]"
                  << std::endl;
    }

    // Clear internal state so the iterator could be reused.
    vad.reset();

    return 0;
}
bin/test_main.cpp
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "vad_onnx.h"
#include <iostream>
#include <sndfile.h>
#include <vector>
#include <map>
#include <fstream>
#include <string>

// Batch test driver: reads a list of WAV file paths (one per line), streams
// each file through VadOnnx in 4096-frame chunks and prints the detection
// results for every chunk.
int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <model_absolute_path> <audio_list_absolute_path>" << std::endl;
        return 1;
    }

    // Model path and audio-list path from the command line.
    std::string model_path = argv[1];
    std::string audio_list_path = argv[2];

    // Open the list file (one WAV path per line).
    std::ifstream audio_list_file(audio_list_path);
    if (!audio_list_file.is_open()) {
        std::cerr << "Error: Unable to open audio list file: " << audio_list_path << std::endl;
        return 1;
    }

    try {
        VadOnnx vad_model = VadOnnx(model_path);

        // Process each listed file in turn.
        std::string wav_path;
        while (std::getline(audio_list_file, wav_path)) {
            if (wav_path.empty()) {
                continue; // skip blank lines
            }
            vad_model.reset_states(); // fresh model state per file

            std::cout << wav_path << std::endl;

            // Open the audio file; skip entries that cannot be read.
            SF_INFO sf_info;
            SNDFILE* file = sf_open(wav_path.c_str(), SFM_READ, &sf_info);
            if (!file) {
                std::cerr << "Error: Unable to open audio file: " << wav_path << std::endl;
                continue;
            }

            int channels = sf_info.channels;

            // Full-capacity read buffer, reused for every chunk.
            // BUG FIX: the original shrank this vector after a short read but
            // kept asking sf_readf_float for 4096 frames, a latent overrun.
            std::vector<float> audio_buffer(4096 * channels);

            try {
                // Read up to 4096 frames per iteration until EOF.
                int read_frames = 0;
                while ((read_frames = sf_readf_float(file, audio_buffer.data(), 4096)) > 0) {
                    // Hand the model exactly the samples actually read.
                    std::vector<float> chunk(audio_buffer.begin(),
                                             audio_buffer.begin() + static_cast<size_t>(read_frames) * channels);

                    // Inference: start/end timestamp map for this chunk.
                    std::map<std::string, double> result_map = vad_model.vad_dectect(chunk, false);

                    for (const auto& pair : result_map) {
                        std::cout << pair.first << ", " << pair.second << std::endl;
                    }
                }

                sf_close(file);

            } catch (const std::exception& ex) {
                std::cerr << "Error processing file " << wav_path << ": " << ex.what() << std::endl;
                sf_close(file); // still release the handle on failure
            }
        }

        audio_list_file.close();

    } catch (const std::exception& ex) {
        std::cerr << "Error: " << ex.what() << std::endl;
        return 1;
    }

    return 0;
}
bin/test_silero.cpp
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <iostream>
#include <vector>
#include <fstream> // std::ifstream
#include <string>  // std::string

// Project headers
#include "wav.h"          // wav::WavReader
#include "time_stamp.h"   // timestamp_t
#include "vad_iterator.h" // VadIterator

// Batch test driver for the silero-vad wrapper: reads a list of WAV file
// paths (one per line) and prints "start, end" sample offsets for every
// detected speech segment.
int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_list_absolute_path>" << std::endl;
        return 1;
    }

    // Model path and audio-list path from the command line.
    std::string model_path = argv[1];
    std::string audio_list_path = argv[2];

    // Open the list file (one WAV path per line).
    std::ifstream audio_list_file(audio_list_path);
    if (!audio_list_file.is_open()) {
        std::cerr << "Error: Unable to open audio list file: " << audio_list_path << std::endl;
        return 1;
    }

    // One iterator reused across files; reset() clears its state per file.
    VadIterator vad(model_path);

    // Process each listed file in turn.
    std::string wav_path;
    while (std::getline(audio_list_file, wav_path)) {
        if (wav_path.empty()) {
            continue; // skip blank lines
        }

        std::cout << wav_path << std::endl;

        try {
            // Decode the WAV file (expects 16000 Hz, mono, PCM).
            wav::WavReader wav_reader(wav_path);
            int numSamples = wav_reader.num_samples();
            std::vector<float> input_wav(static_cast<size_t>(numSamples));
            for (size_t i = 0; i < static_cast<size_t>(numSamples); i++) {
                input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
            }

            // Run VAD and collect speech segments (sample offsets).
            vad.process(input_wav);
            std::vector<timestamp_t> stamps = vad.get_speech_timestamps();

            // IMPROVEMENT: dropped the seconds conversion the original
            // computed into unused locals and never printed.
            for (size_t i = 0; i < stamps.size(); i++) {
                std::cout << stamps[i].start << ", " << stamps[i].end << std::endl;
            }

            // Reset internal state before the next file.
            vad.reset();
        } catch (const std::exception& e) {
            std::cerr << "Error processing file " << wav_path << ": " << e.what() << std::endl;
        }
    }

    audio_list_file.close();
    return 0;
}
bin/wav.h
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Copyright (c) 2016 Personal (Binbin Zhang)
|
| 2 |
+
//
|
| 3 |
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
// you may not use this file except in compliance with the License.
|
| 5 |
+
// You may obtain a copy of the License at
|
| 6 |
+
//
|
| 7 |
+
// http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
//
|
| 9 |
+
// Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
// See the License for the specific language governing permissions and
|
| 13 |
+
// limitations under the License.
|
| 14 |
+
|
| 15 |
+
#ifndef FRONTEND_WAV_H_
|
| 16 |
+
#define FRONTEND_WAV_H_
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
#include <assert.h>
|
| 20 |
+
#include <stdint.h>
|
| 21 |
+
#include <stdio.h>
|
| 22 |
+
#include <stdlib.h>
|
| 23 |
+
#include <string.h>
|
| 24 |
+
|
| 25 |
+
#include <string>
|
| 26 |
+
|
| 27 |
+
#include <iostream>
|
| 28 |
+
|
| 29 |
+
// #include "utils/log.h"
|
| 30 |
+
|
| 31 |
+
namespace wav {
|
| 32 |
+
|
| 33 |
+
// On-disk layout of a canonical 44-byte RIFF/WAVE header. Open() fread()s
// this struct directly from the file, so field widths must match the format
// byte for byte (every field is naturally aligned, so there is no padding).
struct WavHeader {
  char riff[4];               // chunk id, "RIFF"
  uint32_t size;              // file size minus 8
  char wav[4];                // format tag, "WAVE"
  char fmt[4];                // sub-chunk id, "fmt "
  uint32_t fmt_size;          // size of the fmt sub-chunk (>= 16 for PCM)
  uint16_t format;            // 1 = integer PCM, 3 = IEEE float
  uint16_t channels;          // interleaved channel count
  uint32_t sample_rate;       // frames per second
  uint32_t bytes_per_second;  // sample_rate * block_size
  uint16_t block_size;        // bytes per frame (all channels)
  uint16_t bit;               // bits per sample
  char data[4];               // sub-chunk id, "data"
  uint32_t data_size;         // payload size in bytes
};
static_assert(sizeof(WavHeader) == 44, "WavHeader must match the on-disk layout");

// Minimal WAV reader: decodes 8/16/32-bit integer PCM and 32-bit IEEE float
// files into a float buffer (interleaved when multi-channel).
class WavReader {
 public:
  WavReader() = default;
  explicit WavReader(const std::string& filename) { Open(filename); }

  // Owns data_; copying would double-free, so copies are disabled.
  WavReader(const WavReader&) = delete;
  WavReader& operator=(const WavReader&) = delete;

  // Reads `filename` into the sample buffer. Returns false (after printing a
  // message) on any I/O or format error. May be called more than once; the
  // previous buffer is released first.
  bool Open(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "rb");
    if (NULL == fp) {
      std::cout << "Error in read " << filename;
      return false;
    }

    WavHeader header;
    if (fread(&header, sizeof(header), 1, fp) != 1) {
      // BUG FIX: the original ignored the fread result and went on to use
      // uninitialized header fields for truncated files.
      printf("WaveData: file too short for a WAV header.\n");
      fclose(fp);
      return false;
    }
    if (header.fmt_size < 16) {
      printf("WaveData: expect PCM format data "
             "to have fmt chunk of at least size 16.\n");
      fclose(fp);  // BUG FIX: the original leaked fp on this path
      return false;
    } else if (header.fmt_size > 16) {
      // Extended fmt chunk: seek past the extra bytes, then read the next
      // sub-chunk id + size into header.data/header.data_size (adjacent
      // fields, so a single 8-byte read fills both).
      int offset = 44 - 8 + header.fmt_size - 16;
      fseek(fp, offset, SEEK_SET);
      if (fread(header.data, 1, 8, fp) != 8) {
        printf("WaveData: no data chunk found.\n");
        fclose(fp);
        return false;
      }
    }

    // Skip any sub-chunks between "fmt" and "data". Usually there will be a
    // single "fact" sub chunk, but on Windows there can also be a "LIST" one.
    while (0 != strncmp(header.data, "data", 4)) {
      fseek(fp, header.data_size, SEEK_CUR);
      if (fread(header.data, 1, 8, fp) != 8) {
        // BUG FIX: the original ignored fread failure here and looped
        // forever at EOF when the file contains no "data" chunk.
        printf("WaveData: no data chunk found.\n");
        fclose(fp);
        return false;
      }
    }

    // Some writers leave data_size zero; fall back to "rest of the file".
    if (header.data_size == 0) {
      int offset = ftell(fp);
      fseek(fp, 0, SEEK_END);
      header.data_size = ftell(fp) - offset;
      fseek(fp, offset, SEEK_SET);
    }

    num_channel_ = header.channels;
    sample_rate_ = header.sample_rate;
    bits_per_sample_ = header.bit;

    // BUG FIX: guard the divisions below against malformed headers
    // (zero channels or an unsupported sample width).
    if (num_channel_ == 0 ||
        (bits_per_sample_ != 8 && bits_per_sample_ != 16 && bits_per_sample_ != 32)) {
      printf("unsupported quantization bits\n");
      fclose(fp);
      return false;
    }

    int num_data = header.data_size / (bits_per_sample_ / 8);
    delete[] data_;  // BUG FIX: calling Open twice leaked the old buffer
    data_ = new float[num_data];
    num_samples_ = num_data / num_channel_;

    std::cout << "num_channel_ :" << num_channel_ << std::endl;
    std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
    std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
    std::cout << "num_samples :" << num_data << std::endl;
    std::cout << "num_data_size :" << header.data_size << std::endl;

    switch (bits_per_sample_) {
      case 8: {
        uint8_t sample;
        for (int i = 0; i < num_data; ++i) {
          if (fread(&sample, sizeof(uint8_t), 1, fp) != 1) break;
          // BUG FIX: 8-bit WAV data is unsigned and biased at 128; the
          // original read a signed char and divided by 32768, which produced
          // near-silence instead of a [-1, 1) signal.
          data_[i] = (static_cast<float>(sample) - 128.0f) / 128.0f;
        }
        break;
      }
      case 16: {
        int16_t sample;
        for (int i = 0; i < num_data; ++i) {
          if (fread(&sample, sizeof(int16_t), 1, fp) != 1) break;
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 32: {
        if (header.format == 1) {  // 32-bit integer PCM
          int32_t sample;
          for (int i = 0; i < num_data; ++i) {
            if (fread(&sample, sizeof(int32_t), 1, fp) != 1) break;
            // NOTE(review): dividing int32 samples by 32768 leaves values in
            // roughly [-65536, 65536], not [-1, 1] — kept as-is because
            // downstream code may depend on this scale; confirm intent.
            data_[i] = static_cast<float>(sample) / 32768;
          }
        } else if (header.format == 3) {  // 32-bit IEEE float
          float sample;
          for (int i = 0; i < num_data; ++i) {
            if (fread(&sample, sizeof(float), 1, fp) != 1) break;
            data_[i] = sample;
          }
        } else {
          printf("unsupported quantization bits\n");
        }
        break;
      }
      default:
        printf("unsupported quantization bits\n");
        break;
    }

    fclose(fp);
    return true;
  }

  int num_channel() const { return num_channel_; }
  int sample_rate() const { return sample_rate_; }
  int bits_per_sample() const { return bits_per_sample_; }
  int num_samples() const { return num_samples_; }

  ~WavReader() {
    delete[] data_;
  }

  const float* data() const { return data_; }

 private:
  // BUG FIX: members are now default-initialized; previously the explicit
  // constructor left them (including data_) uninitialized when Open failed,
  // so the destructor deleted a garbage pointer.
  int num_channel_ = 0;
  int sample_rate_ = 0;
  int bits_per_sample_ = 0;
  int num_samples_ = 0;  // sample points per channel
  float* data_ = nullptr;
};
| 171 |
+
|
| 172 |
+
class WavWriter {
|
| 173 |
+
public:
|
| 174 |
+
WavWriter(const float* data, int num_samples, int num_channel,
|
| 175 |
+
int sample_rate, int bits_per_sample)
|
| 176 |
+
: data_(data),
|
| 177 |
+
num_samples_(num_samples),
|
| 178 |
+
num_channel_(num_channel),
|
| 179 |
+
sample_rate_(sample_rate),
|
| 180 |
+
bits_per_sample_(bits_per_sample) {}
|
| 181 |
+
|
| 182 |
+
void Write(const std::string& filename) {
|
| 183 |
+
FILE* fp = fopen(filename.c_str(), "w");
|
| 184 |
+
// init char 'riff' 'WAVE' 'fmt ' 'data'
|
| 185 |
+
WavHeader header;
|
| 186 |
+
char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
|
| 187 |
+
0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
|
| 188 |
+
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
| 189 |
+
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
| 190 |
+
0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
|
| 191 |
+
memcpy(&header, wav_header, sizeof(header));
|
| 192 |
+
header.channels = num_channel_;
|
| 193 |
+
header.bit = bits_per_sample_;
|
| 194 |
+
header.sample_rate = sample_rate_;
|
| 195 |
+
header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
|
| 196 |
+
header.size = sizeof(header) - 8 + header.data_size;
|
| 197 |
+
header.bytes_per_second =
|
| 198 |
+
sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
|
| 199 |
+
header.block_size = num_channel_ * (bits_per_sample_ / 8);
|
| 200 |
+
|
| 201 |
+
fwrite(&header, 1, sizeof(header), fp);
|
| 202 |
+
|
| 203 |
+
for (int i = 0; i < num_samples_; ++i) {
|
| 204 |
+
for (int j = 0; j < num_channel_; ++j) {
|
| 205 |
+
switch (bits_per_sample_) {
|
| 206 |
+
case 8: {
|
| 207 |
+
char sample = static_cast<char>(data_[i * num_channel_ + j]);
|
| 208 |
+
fwrite(&sample, 1, sizeof(sample), fp);
|
| 209 |
+
break;
|
| 210 |
+
}
|
| 211 |
+
case 16: {
|
| 212 |
+
int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
|
| 213 |
+
fwrite(&sample, 1, sizeof(sample), fp);
|
| 214 |
+
break;
|
| 215 |
+
}
|
| 216 |
+
case 32: {
|
| 217 |
+
int sample = static_cast<int>(data_[i * num_channel_ + j]);
|
| 218 |
+
fwrite(&sample, 1, sizeof(sample), fp);
|
| 219 |
+
break;
|
| 220 |
+
}
|
| 221 |
+
}
|
| 222 |
+
}
|
| 223 |
+
}
|
| 224 |
+
fclose(fp);
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
private:
|
| 228 |
+
const float* data_;
|
| 229 |
+
int num_samples_; // total float points in data_
|
| 230 |
+
int num_channel_;
|
| 231 |
+
int sample_rate_;
|
| 232 |
+
int bits_per_sample_;
|
| 233 |
+
};
|
| 234 |
+
|
| 235 |
+
} // namespace wav
|
| 236 |
+
|
| 237 |
+
#endif // FRONTEND_WAV_H_
|
python/__inip__.py
ADDED
|
File without changes
|
python/__pycache__/processing.cpython-312.pyc
ADDED
|
Binary file (3.25 kB). View file
|
|
|
python/helpers/__init__.py
ADDED
|
File without changes
|
python/helpers/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (164 Bytes). View file
|
|
|
python/helpers/__pycache__/vadprocessor.cpython-312.pyc
ADDED
|
Binary file (27.6 kB). View file
|
|
|
python/helpers/vadprocessor.py
ADDED
|
@@ -0,0 +1,603 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from copy import deepcopy
|
| 2 |
+
from time import time
|
| 3 |
+
# from config import VAD_MODEL_PATH
|
| 4 |
+
# from silero_vad import load_silero_vad
|
| 5 |
+
import numpy as np
|
| 6 |
+
import onnxruntime
|
| 7 |
+
import logging
|
| 8 |
+
from datetime import timedelta
|
| 9 |
+
import gc
|
| 10 |
+
# from pydub import AudioSegment
|
| 11 |
+
from collections import deque
|
| 12 |
+
|
| 13 |
+
VAD_MODEL_PATH = "/Users/xxx/Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx"
|
| 14 |
+
|
| 15 |
+
class AdaptiveSilenceController:
    """Adapts the VAD silence threshold (ms) to the speaker's recent pace.

    Short recent speech segments (fast speech) shrink the silence window,
    and the rolling average of recent silences nudges it as well.  The
    result is always clamped to [min_ms, max_ms].
    """

    def __init__(self, base_silence_ms=120, min_ms=50, max_ms=600):
        self.base = base_silence_ms
        self.min = min_ms
        self.max = max_ms
        # Rolling windows over the last 20 observed durations, in ms.
        self.recent_silences = deque(maxlen=20)
        self.recent_speeches = deque(maxlen=20)

    def update_silence(self, duration_ms):
        """Record the duration (ms) of a finished silence stretch."""
        self.recent_silences.append(duration_ms)

    def update_speech(self, duration_ms):
        """Record the duration (ms) of a finished speech segment."""
        self.recent_speeches.append(duration_ms)

    def get_adaptive_silence_ms(self):
        """Return the current silence threshold in ms, clamped to [min, max]."""
        # While a window is empty, fall back to the configured base value.
        avg_speech = np.mean(self.recent_speeches) if self.recent_speeches else self.base
        avg_silence = np.mean(self.recent_silences) if self.recent_silences else self.base

        # Fast talkers (short average segments) get a tighter silence window.
        if avg_speech < 300:
            speed_factor = 0.5
        elif avg_speech < 600:
            speed_factor = 0.8
        else:
            speed_factor = 1.0
        logging.warning(f"Avg speech :{avg_speech}, Avg silence: {avg_silence}")
        # Blend in the observed silence trend before clamping.
        adaptive = self.base * speed_factor + 0.3 * avg_silence

        return int(min(self.max, max(self.min, adaptive)))
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class OnnxWrapper():
    # Thin stateful wrapper around the Silero VAD ONNX model.  Keeps the
    # recurrent model state and a small trailing "context" window between
    # calls so consecutive 512-sample (16 kHz) chunks can be streamed.

    def __init__(self, path, force_onnx_cpu=False):
        """Load the ONNX model at `path`, optionally pinned to the CPU provider."""
        opts = onnxruntime.SessionOptions()
        # Single-threaded inference: the model is tiny and called per-chunk.
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
        else:
            self.session = onnxruntime.InferenceSession(path, sess_options=opts)

        self.reset_states()
        # Only 16 kHz is accepted directly; multiples are decimated first.
        self.sample_rates = [16000]

    def _validate_input(self, x: np.ndarray, sr: int):
        """Normalize `x` to shape (batch, samples) and `sr` to 16000.

        Raises ValueError for >2-D input, unsupported rates, or chunks
        shorter than 32 ms (sr / samples > 31.25).
        """
        if x.ndim == 1:
            x = x[None]
        if x.ndim > 2:
            raise ValueError(f"Too many dimensions for input audio chunk {x.ndim}")

        # Integer multiples of 16 kHz are decimated by simple striding.
        if sr != 16000 and (sr % 16000 == 0):
            step = sr // 16000
            x = x[:, ::step]
            sr = 16000

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)")
        if sr / x.shape[1] > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        """Zero the recurrent model state and the inter-chunk context."""
        self._state = np.zeros((2, batch_size, 128)).astype(np.float32)
        # Empty sentinel; replaced with a (batch, context) float32 buffer on
        # the first __call__.
        self._context = np.zeros(0)
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        """Run one fixed-size chunk through the model.

        x must be exactly 512 samples at 16 kHz (256 at 8 kHz) per batch
        row; returns the model's raw output (speech probabilities).
        """

        x, sr = self._validate_input(x, sr)
        num_samples = 512 if sr == 16000 else 256

        if x.shape[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")

        batch_size = x.shape[0]
        context_size = 64 if sr == 16000 else 32

        # Any change in batch size or sample rate invalidates the state.
        if not self._last_batch_size:
            self.reset_states(batch_size)
        if (self._last_sr) and (self._last_sr != sr):
            self.reset_states(batch_size)
        if (self._last_batch_size) and (self._last_batch_size != batch_size):
            self.reset_states(batch_size)

        if not len(self._context):
            self._context = np.zeros((batch_size, context_size)).astype(np.float32)

        # Prepend the trailing samples of the previous chunk as context.
        x = np.concatenate([self._context, x], axis=1)
        if sr in [8000, 16000]:
            ort_inputs = {'input': x, 'state': self._state, 'sr': np.array(sr, dtype='int64')}
            ort_outs = self.session.run(None, ort_inputs)
            out, state = ort_outs
            self._state = state
        else:
            raise ValueError()

        # Keep the tail of this (context + chunk) buffer for the next call.
        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        # out = torch.from_numpy(out)
        return out

    def audio_forward(self, audio: np.ndarray, sr: int):
        """Run a whole waveform through the model chunk by chunk.

        The audio is zero-padded to a multiple of the chunk size; returns
        the per-chunk outputs concatenated along axis 1.
        """
        outs = []
        x, sr = self._validate_input(audio, sr)
        self.reset_states()
        num_samples = 512 if sr == 16000 else 256

        if x.shape[1] % num_samples:
            pad_num = num_samples - (x.shape[1] % num_samples)
            x = np.pad(x, ((0, 0), (0, pad_num)), 'constant', constant_values=(0.0, 0.0))

        for i in range(0, x.shape[1], num_samples):
            wavs_batch = x[:, i:i + num_samples]
            out_chunk = self.__call__(wavs_batch, sr)
            outs.append(out_chunk)

        stacked = np.concatenate(outs, axis=1)
        return stacked
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class VADIteratorOnnx:
    """Streaming wrapper around the Silero VAD ONNX model.

    Feed fixed-size chunks (512 samples @ 16 kHz, 256 @ 8 kHz) and receive
    {'start': ...} / {'end': ...} events in sample indices (or seconds when
    return_seconds=True).  Hysteresis: a segment starts when the speech
    probability reaches `threshold` and ends only after it stays below
    `threshold - 0.15` for at least `min_silence_duration_ms`.
    """

    def __init__(self,
                 threshold: float = 0.5,
                 sampling_rate: int = 16000,
                 min_silence_duration_ms: int = 100,
                 max_speech_duration_s: float = float('inf'),
                 speech_pad_ms: int = 30
                 ):
        self.model = OnnxWrapper(VAD_MODEL_PATH, True)
        self.threshold = threshold
        self.sampling_rate = sampling_rate

        if sampling_rate not in [8000, 16000]:
            raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')

        # Silence length (in samples) required before an 'end' is emitted.
        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        # Padding (in samples) applied before a start / after an end.
        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        self.reset_states()

    def reset_states(self):
        """Clear model state and all segment-tracking counters."""
        self.model.reset_states()
        self.triggered = False    # currently inside a speech segment
        self.temp_end = 0         # candidate end sample while silence accumulates
        self.current_sample = 0   # absolute sample position in the stream
        self.start = 0            # absolute sample of the last detected start

    def __call__(self, x: np.ndarray, return_seconds=False):
        """
        x: np.ndarray
            audio chunk (see examples in repo)

        return_seconds: bool (default - False)
            whether return timestamps in seconds (default - samples)
        """

        window_size_samples = 512 if self.sampling_rate == 16000 else 256
        x = x[:window_size_samples]
        if len(x) < window_size_samples:
            # x is 1-D here; the previous pad spec ((0, 0), (0, n)) was for a
            # 2-D array and raised ValueError on short trailing chunks.
            x = np.pad(x, (0, window_size_samples - len(x)), 'constant', constant_values=0.0)

        self.current_sample += window_size_samples

        speech_prob = self.model(x, self.sampling_rate)[0, 0]

        # Speech resumed before the silence ran out: cancel the pending end.
        if (speech_prob >= self.threshold) and self.temp_end:
            self.temp_end = 0

        if (speech_prob >= self.threshold) and not self.triggered:
            self.triggered = True
            # Back-date the start by the pad plus the window just consumed.
            speech_start = max(0, self.current_sample - self.speech_pad_samples - window_size_samples)
            self.start = speech_start
            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}

        # Lower exit threshold (hysteresis) so brief dips don't end a segment.
        if (speech_prob < self.threshold - 0.15) and self.triggered:
            if not self.temp_end:
                self.temp_end = self.current_sample
            if self.current_sample - self.temp_end < self.min_silence_samples:
                return None
            else:
                speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
                self.temp_end = 0
                self.triggered = False
                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}

        return None
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
class FixedVADIterator(VADIteratorOnnx):
    '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
    If audio to be processed at once is long and multiple voiced segments detected,
    then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
    '''

    def reset_states(self):
        super().reset_states()
        # Holds samples that do not yet fill a whole 512-sample window.
        self.buffer = np.array([], dtype=np.float32)

    def __call__(self, x, return_seconds=False):
        """Buffer `x`, run the fixed-window iterator over every complete
        512-sample window, and merge per-window events into one result.

        Fix: removed the leftover debug `print()` calls (and the `i`
        counter that only fed them) that spammed stdout on every window.
        """
        self.buffer = np.append(self.buffer, x)
        ret = None
        while len(self.buffer) >= 512:
            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
            self.buffer = self.buffer[512:]
            if ret is None:
                ret = r
            elif r is not None:
                if 'end' in r:
                    ret['end'] = r['end']  # the latter end
                if 'start' in r and 'end' in ret:  # there is an earlier start.
                    # Remove end, merging this segment with the previous one.
                    del ret['end']
        return ret if ret != {} else None
|
| 258 |
+
|
| 259 |
+
class VadV2:
    # Chunk-streaming VAD with absolute-sample bookkeeping: feed 512-sample
    # chunks through __call__ and receive {"start": s, "end": s, "audio": arr}
    # once a complete padded speech segment is available; call with x=None to
    # flush any in-progress segment.  `offset` tracks how many samples have
    # been dropped from the front of `audio_buffer`, so (abs_sample - offset)
    # indexes into the buffer.
    def __init__(self,
                 threshold: float = 0.5,
                 sampling_rate: int = 16000,
                 min_silence_duration_ms: int = 100,
                 speech_pad_ms: int = 30,
                 max_speech_duration_s: float = float('inf')):
        # NOTE(review): VADIteratorOnnx's 4th positional parameter is
        # max_speech_duration_s and its 5th is speech_pad_ms, so the inner
        # iterator keeps its default 30 ms pad regardless of speech_pad_ms
        # passed here -- confirm this is intended.
        self.vad_iterator = VADIteratorOnnx(threshold, sampling_rate, min_silence_duration_ms, max_speech_duration_s)
        self.speech_pad_samples = int(sampling_rate * speech_pad_ms / 1000)
        self.sampling_rate = sampling_rate
        self.audio_buffer = np.array([], dtype=np.float32)
        self.start = 0
        self.end = 0
        self.offset = 0
        assert speech_pad_ms <= min_silence_duration_ms, "speech_pad_ms should be less than min_silence_duration_ms"
        self.max_speech_samples = int(sampling_rate * max_speech_duration_s)

        # Consecutive silent chunks seen; buffer is trimmed after ~60 s of
        # uninterrupted silence (60 s divided by the 512-sample chunk period).
        self.silence_chunk_size = 0
        self.silence_chunk_threshold = 60 / (512 / self.sampling_rate)

    def reset(self):
        # Drop all buffered audio and restart the inner iterator.
        self.audio_buffer = np.array([], dtype=np.float32)
        self.start = 0
        self.end = 0
        self.offset = 0
        self.vad_iterator.reset_states()

    def __call__(self, x: np.ndarray = None):
        # x=None flushes: emit whatever open segment remains, then reset.
        if x is None:
            if self.start:
                start = max(self.offset, self.start - self.speech_pad_samples)
                end = self.offset + len(self.audio_buffer)
                start_ts = round(start / self.sampling_rate, 1)
                end_ts = round(end / self.sampling_rate, 1)
                # Absolute sample indices minus offset index the buffer.
                audio_data = self.audio_buffer[start - self.offset: end - self.offset]
                result = {
                    "start": start_ts,
                    "end": end_ts,
                    "audio": audio_data,
                }
            else:
                result = None
            self.reset()
            return result

        # deepcopy guards against the caller mutating the chunk afterwards.
        self.audio_buffer = np.append(self.audio_buffer, deepcopy(x))

        result = self.vad_iterator(x)
        if result is not None:
            # A start/end event means speech activity: reset silence counter.
            self.silence_chunk_size = 0

            if 'start' in result:
                self.start = result['start']
            if 'end' in result:
                self.end = result['end']
        else:
            self.silence_chunk_size += 1

        # No speech seen yet: keep only the pad-sized tail of the buffer.
        if self.start == 0 and len(self.audio_buffer) > self.speech_pad_samples:
            self.offset += len(self.audio_buffer) - self.speech_pad_samples
            self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]

        # Long uninterrupted silence: trim the buffer the same way so it
        # cannot grow without bound between segments.
        if self.silence_chunk_size >= self.silence_chunk_threshold:
            self.offset += len(self.audio_buffer) - self.speech_pad_samples
            self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]
            self.silence_chunk_size = 0

        # A completed segment exists once an 'end' lands after the 'start'.
        if self.end > self.start:
            start = max(self.offset, self.start - self.speech_pad_samples)
            end = self.end + self.speech_pad_samples
            start_ts = round(start / self.sampling_rate, 1)
            end_ts = round(end / self.sampling_rate, 1)
            audio_data = self.audio_buffer[start - self.offset: end - self.offset]
            # Drop the emitted segment; keep audio from self.end onward.
            self.audio_buffer = self.audio_buffer[self.end - self.offset:]
            self.offset = self.end
            self.start = self.end
            self.end = 0
            result = {
                "start": start_ts,
                "end": end_ts,
                "audio": audio_data,
            }

            return result
        return None
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
class SileroVADProcessor:
    """
    A class for processing audio files using Silero VAD to detect voice activity
    and extract voice segments from audio files.
    """

    def __init__(self,
                 activate_threshold=0.5,
                 fusion_threshold=0.3,
                 min_speech_duration=0.25,
                 max_speech_duration=20,
                 min_silence_duration=250,
                 sample_rate=16000,
                 ort_providers=None):
        """
        Initialize the SileroVADProcessor.
        Args:
            activate_threshold (float): Threshold for voice activity detection
            fusion_threshold (float): Threshold for merging close speech segments (seconds)
            min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
            max_speech_duration (float): Maximum duration of speech (seconds)
            min_silence_duration (int): Minimum silence duration (ms)
            sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
            ort_providers (list): ONNX Runtime providers for acceleration
        """
        # VAD parameters
        self.activate_threshold = activate_threshold
        self.fusion_threshold = fusion_threshold
        self.min_speech_duration = min_speech_duration
        self.max_speech_duration = max_speech_duration
        self.min_silence_duration = min_silence_duration
        self.sample_rate = sample_rate
        self.ort_providers = ort_providers if ort_providers else []
        # Fix: load_audio reads this flag but nothing ever set it, raising
        # AttributeError on first use.  GPU fp16 loading is off by default.
        self.use_gpu_fp16 = False

        # Initialize logger
        self.logger = logging.getLogger(__name__)

        # Load Silero VAD model
        self._init_onnx_session()
        # NOTE(review): load_silero_vad is not imported in this module (its
        # import is commented out at the top of the file) -- restore
        # `from silero_vad import load_silero_vad` before using this class.
        self.silero_vad = load_silero_vad(onnx=True)

    def _init_onnx_session(self):
        """Initialize ONNX Runtime session with appropriate settings."""
        session_opts = onnxruntime.SessionOptions()
        session_opts.log_severity_level = 3
        # 0 lets ONNX Runtime pick the thread counts automatically.
        session_opts.inter_op_num_threads = 0
        session_opts.intra_op_num_threads = 0
        session_opts.enable_cpu_mem_arena = True
        session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
        session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

        session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
        session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
        session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")

        # NOTE(review): these options are built but never attached to a
        # session here; silero_vad creates its own session internally.

    def load_audio(self, audio_path):
        """
        Load audio file and prepare it for VAD processing.
        Args:
            audio_path (str): Path to the audio file
        Returns:
            numpy.ndarray: Audio data as numpy array
        """
        self.logger.info(f"Loading audio from {audio_path}")
        # NOTE(review): AudioSegment requires `from pydub import AudioSegment`,
        # which is commented out at the top of the file -- restore it before
        # calling this method.
        audio_segment = AudioSegment.from_file(audio_path)
        audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)

        # Convert to numpy array and normalize
        dtype = np.float16 if self.use_gpu_fp16 else np.float32
        audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578  # 1/32768
        self.audio_segment = audio_segment  # Store for later use
        return audio_array

    @property
    def model(self):
        """The underlying silero-vad model instance."""
        return self.silero_vad

    def process_timestamps(self, timestamps):
        """
        Process VAD timestamps: filter short segments and merge close segments.
        Args:
            timestamps (list): List of (start, end) tuples
        Returns:
            list: Processed list of (start, end) tuples
        """
        # Filter out short durations
        filtered_timestamps = [(start, end) for start, end in timestamps
                               if (end - start) >= self.min_speech_duration]

        # Fuse timestamps in two passes for better merging
        fused = filtered_timestamps
        for _ in range(2):
            merged = []
            for start, end in fused:
                # Merge with the previous segment when the gap is small.
                if merged and (start - merged[-1][1] <= self.fusion_threshold):
                    merged[-1] = (merged[-1][0], end)
                else:
                    merged.append((start, end))
            fused = merged

        return fused

    def format_time(self, seconds):
        """
        Convert seconds to VTT time format 'hh:mm:ss.mmm'.
        Args:
            seconds (float): Time in seconds
        Returns:
            str: Formatted time string
        """
        td = timedelta(seconds=seconds)
        td_sec = td.total_seconds()
        total_seconds = int(td_sec)
        milliseconds = int((td_sec - total_seconds) * 1000)
        hours = total_seconds // 3600
        minutes = (total_seconds % 3600) // 60
        seconds = total_seconds % 60
        return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"

    def detect_speech(self, audio: np.array):
        """
        Run VAD on the audio array to detect speech segments.
        Args:
            audio (np.array): Mono audio samples at self.sample_rate
        Returns:
            list: List of processed timestamps as (start, end) tuples
        """
        self.logger.info("Starting VAD process")
        # Fix: the module does `from time import time`, so `time` is the
        # function itself -- `time.time()` raised AttributeError.
        start_time = time()
        # NOTE(review): get_speech_timestamps is not imported in this module;
        # restore `from silero_vad import get_speech_timestamps` before use.
        raw_timestamps = get_speech_timestamps(
            audio,
            model=self.silero_vad,
            threshold=self.activate_threshold,
            max_speech_duration_s=self.max_speech_duration,
            min_speech_duration_ms=int(self.min_speech_duration * 1000),
            min_silence_duration_ms=self.min_silence_duration,
            return_seconds=True
        )

        # Convert to simple format and process
        timestamps = [(item['start'], item['end']) for item in raw_timestamps]
        processed_timestamps = self.process_timestamps(timestamps)

        # Clean up
        del audio
        gc.collect()

        self.logger.info(f"VAD completed in {time() - start_time:.3f} seconds")
        return processed_timestamps

    def save_timestamps(self, timestamps, output_prefix):
        # Fix: this body was stranded without its `def` line, which left the
        # code unreachable and the module broken at import time.
        """
        Save timestamps in both second and sample indices formats.
        Args:
            timestamps (list): List of (start, end) tuples
            output_prefix (str): Prefix for output files
        """
        # Save timestamps in seconds (VTT format)
        seconds_path = f"{output_prefix}_timestamps_second.txt"
        with open(seconds_path, "w", encoding='UTF-8') as file:
            self.logger.info("Saving timestamps in seconds format")
            for start, end in timestamps:
                s_time = self.format_time(start)
                e_time = self.format_time(end)
                line = f"{s_time} --> {e_time}\n"
                file.write(line)

        # Save timestamps in sample indices
        indices_path = f"{output_prefix}_timestamps_indices.txt"
        with open(indices_path, "w", encoding='UTF-8') as file:
            self.logger.info("Saving timestamps in indices format")
            for start, end in timestamps:
                line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
                file.write(line)

        self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")

    def extract_speech_segments(self, audio_segment, timestamps):
        """
        Extract speech segments from the audio and combine them into a single audio file.
        Args:
            audio_segment: audio samples (object exposing .numpy())
            timestamps (list): List of (start, end) tuples indicating speech segments
        Returns:
            np.ndarray: The combined speech segments
        """
        audio_segment = audio_segment.numpy()
        combined_speech = np.array([], dtype=np.float32)

        # Extract and combine each speech segment
        for i, (start, end) in enumerate(timestamps):
            # NOTE(review): these are millisecond indices applied to a raw
            # sample array -- for sample data the factor should presumably be
            # self.sample_rate, not 1000.  Confirm against callers.
            start_ms = int(start * 1000)
            end_ms = int(end * 1000)

            # Ensure the end time does not exceed the length of the audio segment
            if end_ms > len(audio_segment):
                end_ms = len(audio_segment)

            # Extract the segment
            segment = audio_segment[start_ms:end_ms]

            # Add to combined audio
            combined_speech = np.append(combined_speech, segment)

        return combined_speech

    def process_audio(self, audio_array: np.array):
        """
        Complete processing pipeline: detect speech and extract speech segments.
        Returns:
            tuple: (timestamps, combined speech samples)
        """
        # Run VAD to detect speech
        timestamps = self.detect_speech(audio_array)

        combined_speech = self.extract_speech_segments(audio_array, timestamps)

        return timestamps, combined_speech
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
class VadProcessor:
|
| 581 |
+
def __init__(
|
| 582 |
+
self,
|
| 583 |
+
prob_threshold=0.5,
|
| 584 |
+
silence_s=0.2,
|
| 585 |
+
cache_s=0.15,
|
| 586 |
+
sr=16000
|
| 587 |
+
):
|
| 588 |
+
self.prob_threshold = prob_threshold
|
| 589 |
+
self.cache_s = cache_s
|
| 590 |
+
self.sr = sr
|
| 591 |
+
self.silence_s = silence_s
|
| 592 |
+
|
| 593 |
+
self.vad = VadV2(self.prob_threshold, self.sr, self.silence_s * 1000, self.cache_s * 1000, max_speech_duration_s=15)
|
| 594 |
+
|
| 595 |
+
|
| 596 |
+
def process_audio(self, audio_buffer: np.ndarray):
|
| 597 |
+
audio = np.array([], np.float32)
|
| 598 |
+
for i in range(0, len(audio_buffer), 512):
|
| 599 |
+
chunk = audio_buffer[i:i+512]
|
| 600 |
+
ret = self.vad(chunk)
|
| 601 |
+
if ret:
|
| 602 |
+
audio = np.append(audio, ret['audio'])
|
| 603 |
+
return audio
|
python/pipelines/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from .base import MetaItem
|
| 3 |
+
from .pipe_vad import VadPipe
|
python/pipelines/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (247 Bytes). View file
|
|
|
python/pipelines/__pycache__/base.cpython-312.pyc
ADDED
|
Binary file (4.04 kB). View file
|
|
|
python/pipelines/__pycache__/pipe_vad.cpython-312.pyc
ADDED
|
Binary file (4.14 kB). View file
|
|
|
python/pipelines/base.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from dataclasses import dataclass, field
|
| 3 |
+
from multiprocessing import Process, Queue
|
| 4 |
+
from multiprocessing import Event
|
| 5 |
+
from logging import getLogger
|
| 6 |
+
|
| 7 |
+
logger = getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
|
| 11 |
+
class Segment:
|
| 12 |
+
t0: int
|
| 13 |
+
t1: int
|
| 14 |
+
text: str
|
| 15 |
+
|
| 16 |
+
@dataclass
|
| 17 |
+
class MetaItem:
|
| 18 |
+
segments: list[Segment] = field(default_factory=list)
|
| 19 |
+
source_audio: bytes = b""
|
| 20 |
+
audio: bytes = b''
|
| 21 |
+
transcribe_content: str = ''
|
| 22 |
+
translate_content: str = ''
|
| 23 |
+
source_language: str = 'zh'
|
| 24 |
+
destination_language: str = 'en'
|
| 25 |
+
speech_status: str = 'END' # "END", "START"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class BasePipe(Process):
|
| 29 |
+
def __init__(self, in_queue=None, out_queue=None) -> None:
|
| 30 |
+
super().__init__() # Initialize the Process class
|
| 31 |
+
self._in_queue = in_queue if in_queue else Queue()
|
| 32 |
+
self._out_queue = out_queue if out_queue else Queue()
|
| 33 |
+
self._ready = Event()
|
| 34 |
+
|
| 35 |
+
def set_ready(self):
|
| 36 |
+
self._ready.set()
|
| 37 |
+
|
| 38 |
+
def is_ready(self):
|
| 39 |
+
return self._ready.is_set()
|
| 40 |
+
|
| 41 |
+
def wait(self):
|
| 42 |
+
self._ready.wait()
|
| 43 |
+
|
| 44 |
+
@property
|
| 45 |
+
def output_queue(self):
|
| 46 |
+
return self._out_queue
|
| 47 |
+
|
| 48 |
+
@property
|
| 49 |
+
def input_queue(self):
|
| 50 |
+
return self._in_queue
|
| 51 |
+
|
| 52 |
+
def process(self, in_data: MetaItem) -> MetaItem:
|
| 53 |
+
raise NotImplementedError("Subclasses should implement this method.")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@classmethod
|
| 57 |
+
def init(cls):
|
| 58 |
+
raise NotImplementedError
|
| 59 |
+
|
| 60 |
+
def run(self):
|
| 61 |
+
logger.info(f"start initial {self.__class__.__name__}")
|
| 62 |
+
self.init()
|
| 63 |
+
logger.info(f"finish initial {self.__class__.__name__}")
|
| 64 |
+
self.set_ready()
|
| 65 |
+
while True:
|
| 66 |
+
item = self.input_queue.get()
|
| 67 |
+
if item is None: # Check for termination signal
|
| 68 |
+
break
|
| 69 |
+
out_item = self.process(item)
|
| 70 |
+
if out_item:
|
| 71 |
+
self.output_queue.put(out_item)
|
python/pipelines/pipe_vad.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from .base import MetaItem, BasePipe
|
| 3 |
+
from ..helpers.vadprocessor import FixedVADIterator
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import logging
|
| 7 |
+
|
| 8 |
+
# import noisereduce as nr
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class VadPipe(BasePipe):
|
| 12 |
+
vac = None
|
| 13 |
+
sample_rate = 16000
|
| 14 |
+
|
| 15 |
+
def __init__(self, in_queue=None, out_queue=None) -> None:
|
| 16 |
+
super().__init__(in_queue, out_queue)
|
| 17 |
+
self._offset = 0 # 处理的frame size offset
|
| 18 |
+
self._status = 'END'
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def reset(self):
|
| 22 |
+
self._offset = 0
|
| 23 |
+
self._status = 'END'
|
| 24 |
+
|
| 25 |
+
self.vac.reset_states()
|
| 26 |
+
|
| 27 |
+
@classmethod
|
| 28 |
+
def init(cls):
|
| 29 |
+
if cls.vac is None:
|
| 30 |
+
cls.vac = FixedVADIterator(
|
| 31 |
+
threshold=0.6,
|
| 32 |
+
sampling_rate=cls.sample_rate,
|
| 33 |
+
# speech_pad_ms=10
|
| 34 |
+
min_silence_duration_ms = 100,
|
| 35 |
+
# speech_pad_ms = 30,
|
| 36 |
+
)
|
| 37 |
+
cls.vac.reset_states()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# def reduce_noise(self, data):
|
| 41 |
+
# return nr.reduce_noise(y=data, sr=self.sample_rate)
|
| 42 |
+
|
| 43 |
+
def _process_speech_chunk(self, source_audio:np.ndarray):
|
| 44 |
+
speech_dict = self.vac(source_audio, return_seconds=False)
|
| 45 |
+
# print(f"speech_dict : {speech_dict}")
|
| 46 |
+
if speech_dict:
|
| 47 |
+
relative_start_frame = None
|
| 48 |
+
relative_end_frame = None
|
| 49 |
+
start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
|
| 50 |
+
if start_frame:
|
| 51 |
+
relative_start_frame =start_frame - self._offset
|
| 52 |
+
if end_frame:
|
| 53 |
+
relative_end_frame = end_frame - self._offset
|
| 54 |
+
return relative_start_frame, relative_end_frame
|
| 55 |
+
|
| 56 |
+
def process(self, in_data: MetaItem) -> MetaItem:
|
| 57 |
+
if self._offset == 0:
|
| 58 |
+
self.vac.reset_states()
|
| 59 |
+
|
| 60 |
+
# silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
|
| 61 |
+
source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
|
| 62 |
+
print(f"source_audio.shape = {source_audio.shape}")
|
| 63 |
+
speech_data = self._process_speech_chunk(source_audio)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
if speech_data: # 表示有音频的变化点出现
|
| 67 |
+
rel_start_frame, rel_end_frame = speech_data
|
| 68 |
+
|
| 69 |
+
if rel_start_frame is not None and rel_end_frame is None:
|
| 70 |
+
self._status = "START" # 语音开始
|
| 71 |
+
target_audio = source_audio[max(rel_start_frame-100, 0):]
|
| 72 |
+
logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
|
| 73 |
+
elif rel_start_frame is None and rel_end_frame is not None:
|
| 74 |
+
self._status = "END" # 音频结束
|
| 75 |
+
target_audio = source_audio[:rel_end_frame]
|
| 76 |
+
logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
|
| 77 |
+
else:
|
| 78 |
+
self._status = 'END'
|
| 79 |
+
target_audio = source_audio[max(rel_start_frame-100, 0):rel_end_frame]
|
| 80 |
+
logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
|
| 81 |
+
# logging.debug("❌ No valid speech segment detected, setting status to END")
|
| 82 |
+
else:
|
| 83 |
+
if self._status == 'START':
|
| 84 |
+
target_audio = source_audio
|
| 85 |
+
# logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
|
| 86 |
+
else: # end
|
| 87 |
+
target_audio = np.array([],dtype=np.float32)
|
| 88 |
+
# self._status = 'END'
|
| 89 |
+
# logging.debug("❌ No speech detected, setting status to END")
|
| 90 |
+
print(f"strat: {rel_start_frame} end: {rel_end_frame}")
|
| 91 |
+
self._offset += len(source_audio)
|
| 92 |
+
|
| 93 |
+
in_data.audio = target_audio.tobytes()
|
| 94 |
+
in_data.source_audio = b''
|
| 95 |
+
in_data.speech_status = self._status
|
| 96 |
+
return in_data
|
python/processing.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 4 |
+
parent_dir = os.path.dirname(current_dir)
|
| 5 |
+
sys.path.append(parent_dir)
|
| 6 |
+
# sys.path.append("/Users/chenxiang/translator/Translator/llama-cpp-python/llama_cpp")
|
| 7 |
+
|
| 8 |
+
from .pipelines import MetaItem, VadPipe
|
| 9 |
+
|
| 10 |
+
class ProcessingPipes:
|
| 11 |
+
def __init__(self) -> None:
|
| 12 |
+
|
| 13 |
+
self._process = []
|
| 14 |
+
# vad
|
| 15 |
+
self._vad_pipe = self._launch_process(VadPipe())
|
| 16 |
+
|
| 17 |
+
def _launch_process(self, process_obj):
|
| 18 |
+
process_obj.daemon = True
|
| 19 |
+
process_obj.start()
|
| 20 |
+
self._process.append(process_obj)
|
| 21 |
+
return process_obj
|
| 22 |
+
|
| 23 |
+
def wait_ready(self):
|
| 24 |
+
for p in self._process:
|
| 25 |
+
p.wait()
|
| 26 |
+
|
| 27 |
+
def voice_detect(self, audio_buffer: bytes) -> MetaItem:
|
| 28 |
+
item = MetaItem(source_audio=audio_buffer)
|
| 29 |
+
self._vad_pipe.input_queue.put(item)
|
| 30 |
+
return self._vad_pipe.output_queue.get()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
if __name__ == "__main__":
|
| 34 |
+
import soundfile
|
| 35 |
+
import numpy as np
|
| 36 |
+
|
| 37 |
+
wav_path1 = "/Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3"
|
| 38 |
+
wav_path2 = "/Users/chenxiang/translator/core/whisper_wrapper/bin/zh.wav"
|
| 39 |
+
|
| 40 |
+
tp = ProcessingPipes()
|
| 41 |
+
audio, sr, = soundfile.read(wav_path2)
|
| 42 |
+
|
| 43 |
+
# 确保是单声道
|
| 44 |
+
if len(audio.shape) > 1:
|
| 45 |
+
print("不是单声道")
|
| 46 |
+
audio = audio.mean(axis=1)
|
| 47 |
+
|
| 48 |
+
# 重采样到 16kHz(如果需要)
|
| 49 |
+
if sr != 16000:
|
| 50 |
+
print("采样率不是 16000, 重新采样到 16kHz(如果需要)")
|
| 51 |
+
import resampy
|
| 52 |
+
audio = resampy.resample(audio, sr, 16000)
|
| 53 |
+
|
| 54 |
+
# 转换为 float32
|
| 55 |
+
print(f"original audio data type = {audio.dtype}")
|
| 56 |
+
audio = audio.astype(np.float32)
|
| 57 |
+
|
| 58 |
+
print(f"original audio data size = {audio.shape}")
|
| 59 |
+
|
| 60 |
+
result = tp.voice_detect(audio)
|
| 61 |
+
# print(f"{result.speech_status} {result.segments} {result.segments}")
|
| 62 |
+
print("********** END *************")
|
reference/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
reference/cpp/onnx_wrapper.cpp
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <stdexcept>
|
| 2 |
+
#include <cmath>
|
| 3 |
+
#include <iostream>
|
| 4 |
+
|
| 5 |
+
#include "onnx_wrapper.h"
|
| 6 |
+
|
| 7 |
+
static void get_input_names(Ort::Session* session, std::vector<std::string> &input_names_str,
|
| 8 |
+
std::vector<const char *> &input_names_char) {
|
| 9 |
+
Ort::AllocatorWithDefaultOptions allocator;
|
| 10 |
+
size_t nodes_num = session->GetInputCount();
|
| 11 |
+
input_names_str.resize(nodes_num);
|
| 12 |
+
input_names_char.resize(nodes_num);
|
| 13 |
+
for (size_t i = 0; i != nodes_num; ++i) {
|
| 14 |
+
auto t = session->GetInputNameAllocated(i, allocator);
|
| 15 |
+
input_names_str[i] = t.get();
|
| 16 |
+
input_names_char[i] = input_names_str[i].c_str();
|
| 17 |
+
}
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
static void get_output_names(Ort::Session* session, std::vector<std::string> &output_names_,
|
| 21 |
+
std::vector<const char *> &vad_out_names_) {
|
| 22 |
+
Ort::AllocatorWithDefaultOptions allocator;
|
| 23 |
+
size_t nodes_num = session->GetOutputCount();
|
| 24 |
+
output_names_.resize(nodes_num);
|
| 25 |
+
vad_out_names_.resize(nodes_num);
|
| 26 |
+
for (size_t i = 0; i != nodes_num; ++i) {
|
| 27 |
+
auto t = session->GetOutputNameAllocated(i, allocator);
|
| 28 |
+
output_names_[i] = t.get();
|
| 29 |
+
vad_out_names_[i] = output_names_[i].c_str();
|
| 30 |
+
}
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
OnnxVadWrapper::OnnxVadWrapper(const std::string& model_path, bool force_cpu, int thread_num)
|
| 34 |
+
: sample_rates_{16000}, model_path_(model_path) {
|
| 35 |
+
Ort::SessionOptions session_options;
|
| 36 |
+
session_options.SetIntraOpNumThreads(thread_num);
|
| 37 |
+
session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
|
| 38 |
+
session_options.DisableCpuMemArena();
|
| 39 |
+
|
| 40 |
+
// if (force_cpu && supports_cpu()) {
|
| 41 |
+
// session_options.AppendExecutionProvider_CPU();
|
| 42 |
+
// }
|
| 43 |
+
|
| 44 |
+
// 初始化 ONNX Session
|
| 45 |
+
try {
|
| 46 |
+
env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "OnnxVadWrapper");
|
| 47 |
+
session_ = std::make_unique<Ort::Session>(env_, ORTCHAR(model_path.c_str()), session_options);
|
| 48 |
+
std::cout << "Successfully load model from " << model_path << std::endl;
|
| 49 |
+
} catch (std::exception const &e) {
|
| 50 |
+
std::cout << "Error when load vad onnx model: " << e.what() << std::endl;
|
| 51 |
+
exit(-1);
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
get_input_names(session_.get(), input_names_, vad_in_names_);
|
| 55 |
+
get_output_names(session_.get(), output_names_, vad_out_names_);
|
| 56 |
+
|
| 57 |
+
reset_states();
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
OnnxVadWrapper::~OnnxVadWrapper() = default;
|
| 61 |
+
|
| 62 |
+
void OnnxVadWrapper::reset_states(int batch_size) {
|
| 63 |
+
int total_size = 2 * batch_size * 128;
|
| 64 |
+
state_.resize(total_size); /////
|
| 65 |
+
state_.assign(state_.size(), 0.0f);
|
| 66 |
+
context_.clear();
|
| 67 |
+
last_sr_ = 0;
|
| 68 |
+
last_batch_size_ = 0;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
std::pair<std::vector<float>, std::vector<float>> OnnxVadWrapper::operator()(const std::vector<float>& x, int sr) {
|
| 72 |
+
validate_input(x, sr);
|
| 73 |
+
|
| 74 |
+
int num_samples = (sr == 16000) ? 512 : 256;
|
| 75 |
+
int context_size = (sr == 16000) ? 64 : 32;
|
| 76 |
+
|
| 77 |
+
int batch_size = 1; // 假设单通道输入
|
| 78 |
+
if (x.size() != num_samples) {
|
| 79 |
+
throw std::invalid_argument("Input must be exactly " + std::to_string(num_samples) + " samples.");
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
if (!last_batch_size_) reset_states(batch_size);
|
| 83 |
+
if (last_sr_ != 0 && last_sr_ != sr) reset_states(batch_size);
|
| 84 |
+
if (last_batch_size_ != 0 && last_batch_size_ != batch_size) reset_states(batch_size);
|
| 85 |
+
|
| 86 |
+
if (context_.empty()) {
|
| 87 |
+
context_.resize(batch_size * context_size, 0.0f);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
// 合并 context 和 input
|
| 91 |
+
std::vector<float> x_with_context(context_.begin(), context_.end());
|
| 92 |
+
x_with_context.insert(x_with_context.end(), x.begin(), x.end());
|
| 93 |
+
|
| 94 |
+
// Prepare inputs
|
| 95 |
+
std::vector<Ort::Value> inputs;
|
| 96 |
+
auto mem_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
|
| 97 |
+
std::array<int64_t, 3> input_shape = {1, 1, static_cast<int64_t>(x_with_context.size())};
|
| 98 |
+
Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
|
| 99 |
+
mem_info, const_cast<float*>(x_with_context.data()), x_with_context.size(),
|
| 100 |
+
input_shape.data(), input_shape.size());
|
| 101 |
+
inputs.emplace_back(std::move(input_tensor));
|
| 102 |
+
|
| 103 |
+
std::array<int64_t, 3> state_shape = {2, batch_size, 128};
|
| 104 |
+
Ort::Value state_tensor = Ort::Value::CreateTensor<float>(
|
| 105 |
+
mem_info, state_.data(), state_.size(), state_shape.data(), state_shape.size());
|
| 106 |
+
inputs.emplace_back(std::move(state_tensor));
|
| 107 |
+
|
| 108 |
+
std::array<int64_t, 1> sr_shape = {1};
|
| 109 |
+
float sr_f = static_cast<float>(sr);
|
| 110 |
+
Ort::Value sr_tensor = Ort::Value::CreateTensor<float>(
|
| 111 |
+
mem_info, &sr_f, 1, sr_shape.data(), sr_shape.size());
|
| 112 |
+
inputs.emplace_back(std::move(sr_tensor));
|
| 113 |
+
|
| 114 |
+
// const char* input_names[] = {"input", "state", "sr"};
|
| 115 |
+
// std::vector<Ort::Value> inputs = {std::move(input_tensor), std::move(state_tensor), std::move(sr_tensor)};
|
| 116 |
+
|
| 117 |
+
// Run inference
|
| 118 |
+
std::vector<Ort::Value> outputs;
|
| 119 |
+
try {
|
| 120 |
+
outputs = session_->Run(
|
| 121 |
+
Ort::RunOptions{nullptr}, vad_in_names_.data(), inputs.data(),
|
| 122 |
+
inputs.size(), vad_out_names_.data(), vad_out_names_.size());
|
| 123 |
+
} catch (std::exception const &e) {
|
| 124 |
+
std::cout << "Error when run vad onnx forword: " << e.what() << std::endl;
|
| 125 |
+
exit(-1);
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
// Get output
|
| 129 |
+
float* out_data = outputs[0].GetTensorMutableData<float>();
|
| 130 |
+
size_t out_len = outputs[0].GetTensorTypeAndShapeInfo().GetElementCount();
|
| 131 |
+
std::vector<float> out(out_data, out_data + out_len);
|
| 132 |
+
|
| 133 |
+
// Update state and context
|
| 134 |
+
float* new_state = outputs[1].GetTensorMutableData<float>();
|
| 135 |
+
std::copy(new_state, new_state + state_.size(), state_.begin());
|
| 136 |
+
|
| 137 |
+
context_.assign(x_with_context.end() - context_size, x_with_context.end());
|
| 138 |
+
|
| 139 |
+
last_sr_ = sr;
|
| 140 |
+
last_batch_size_ = batch_size;
|
| 141 |
+
|
| 142 |
+
return {out, {}};
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
std::vector<float> OnnxVadWrapper::audio_forward(const std::vector<float>& audio, int sr) {
|
| 146 |
+
std::vector<float> x = audio;
|
| 147 |
+
reset_states();
|
| 148 |
+
|
| 149 |
+
int num_samples = (sr == 16000) ? 512 : 256;
|
| 150 |
+
std::vector<float> result;
|
| 151 |
+
|
| 152 |
+
// Pad to multiple of num_samples
|
| 153 |
+
int pad_num = (num_samples - (x.size() % num_samples)) % num_samples;
|
| 154 |
+
x.resize(x.size() + pad_num, 0.0f);
|
| 155 |
+
|
| 156 |
+
for (size_t i = 0; i < x.size(); i += num_samples) {
|
| 157 |
+
std::vector<float> chunk(x.begin() + i, x.begin() + i + num_samples);
|
| 158 |
+
auto [out, _] = (*this)(chunk, sr);
|
| 159 |
+
result.insert(result.end(), out.begin(), out.end());
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
return result;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
bool OnnxVadWrapper::supports_cpu() {
|
| 166 |
+
auto providers = Ort::GetAvailableProviders();
|
| 167 |
+
|
| 168 |
+
for (const std::string& provider : providers) {
|
| 169 |
+
if (provider == "CPUExecutionProvider") {
|
| 170 |
+
return true;
|
| 171 |
+
}
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
return false;
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
void OnnxVadWrapper::validate_input(const std::vector<float>& x, int sr) {
|
| 178 |
+
if (sr != 16000 && sr % 16000 != 0) {
|
| 179 |
+
throw std::invalid_argument("Unsupported sampling rate: " + std::to_string(sr));
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
if ((sr / x.size()) > 31.25) {
|
| 183 |
+
throw std::invalid_argument("Input audio chunk is too short");
|
| 184 |
+
}
|
| 185 |
+
}
|
reference/cpp/onnx_wrapper.h
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <vector>
|
| 2 |
+
#include <string>
|
| 3 |
+
|
| 4 |
+
#if defined(__APPLE__)
|
| 5 |
+
#include <onnxruntime/onnxruntime_cxx_api.h>
|
| 6 |
+
#else
|
| 7 |
+
#include "onnxruntime_run_options_config_keys.h"
|
| 8 |
+
#include "onnxruntime_cxx_api.h"
|
| 9 |
+
#endif
|
| 10 |
+
|
| 11 |
+
#ifdef _WIN32
|
| 12 |
+
|
| 13 |
+
#define ORTSTRING(str) StrToWstr(str)
|
| 14 |
+
#define ORTCHAR(str) StrToWstr(str).c_str()
|
| 15 |
+
|
| 16 |
+
inline std::wstring String2wstring(const std::string& str, const std::string& locale)
|
| 17 |
+
{
|
| 18 |
+
typedef std::codecvt_byname<wchar_t, char, std::mbstate_t> F;
|
| 19 |
+
std::wstring_convert<F> strCnv(new F(locale));
|
| 20 |
+
return strCnv.from_bytes(str);
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
inline std::wstring StrToWstr(std::string str) {
|
| 24 |
+
if (str.length() == 0)
|
| 25 |
+
return L"";
|
| 26 |
+
return String2wstring(str, "zh-CN");
|
| 27 |
+
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
#else
|
| 31 |
+
|
| 32 |
+
#define ORTSTRING(str) str
|
| 33 |
+
#define ORTCHAR(str) str
|
| 34 |
+
|
| 35 |
+
#endif
|
| 36 |
+
|
| 37 |
+
class OnnxVadWrapper {
|
| 38 |
+
public:
|
| 39 |
+
explicit OnnxVadWrapper(const std::string& model_path, bool force_cpu = false, int thread_num = 1);
|
| 40 |
+
~OnnxVadWrapper();
|
| 41 |
+
|
| 42 |
+
// 重载 operator(),使得对象可以像函数一样调用
|
| 43 |
+
std::pair<std::vector<float>, std::vector<float>> operator()(const std::vector<float>& x, int sr);
|
| 44 |
+
|
| 45 |
+
// 批量处理整个音频
|
| 46 |
+
std::vector<float> audio_forward(const std::vector<float>& audio, int sr);
|
| 47 |
+
|
| 48 |
+
// 重置 RNN 状态
|
| 49 |
+
void reset_states(int batch_size = 1);
|
| 50 |
+
|
| 51 |
+
private:
|
| 52 |
+
Ort::Env env_;
|
| 53 |
+
|
| 54 |
+
std::unique_ptr<Ort::Session> session_;
|
| 55 |
+
std::vector<std::string> input_names_, output_names_;
|
| 56 |
+
std::vector<const char *> vad_in_names_;
|
| 57 |
+
std::vector<const char *> vad_out_names_;
|
| 58 |
+
|
| 59 |
+
std::vector<int> sample_rates_;
|
| 60 |
+
std::string model_path_;
|
| 61 |
+
|
| 62 |
+
std::vector<float> state_; // RNN State
|
| 63 |
+
std::vector<float> context_; // Context buffer
|
| 64 |
+
int last_sr_ = 0;
|
| 65 |
+
int last_batch_size_ = 0;
|
| 66 |
+
|
| 67 |
+
void read_model();
|
| 68 |
+
bool supports_cpu();
|
| 69 |
+
void validate_input(const std::vector<float>& x, int sr);
|
| 70 |
+
};
|
reference/cpp/vad_iterator_onnx.cpp
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <cmath>
|
| 2 |
+
#include <stdexcept>
|
| 3 |
+
#include <unordered_map>
|
| 4 |
+
#include <vector>
|
| 5 |
+
|
| 6 |
+
#include "vad_iterator_onnx.h"
|
| 7 |
+
|
| 8 |
+
VadIteratorOnnx::VadIteratorOnnx(float threshold,
|
| 9 |
+
int sampling_rate,
|
| 10 |
+
int min_silence_duration_ms,
|
| 11 |
+
float max_speech_duration_s,
|
| 12 |
+
int speech_pad_ms)
|
| 13 |
+
: threshold_(threshold),
|
| 14 |
+
sampling_rate_(sampling_rate),
|
| 15 |
+
min_silence_samples_(sampling_rate_ * min_silence_duration_ms / 1000.0),
|
| 16 |
+
speech_pad_samples_(sampling_rate_ * speech_pad_ms / 1000.0),
|
| 17 |
+
triggered_(false),
|
| 18 |
+
temp_end_(0),
|
| 19 |
+
current_sample_(0),
|
| 20 |
+
start_(0) {
|
| 21 |
+
|
| 22 |
+
if (sampling_rate_ != 8000 && sampling_rate_ != 16000) {
|
| 23 |
+
throw std::invalid_argument("Only support sampling rates of 8000 or 16000");
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
model_ = std::make_unique<OnnxVadWrapper>("path/to/vad.onnx", true); // 可配置路径
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
VadIteratorOnnx::~VadIteratorOnnx() = default;
|
| 30 |
+
|
| 31 |
+
void VadIteratorOnnx::reset_states() {
|
| 32 |
+
model_->reset_states();
|
| 33 |
+
triggered_ = false;
|
| 34 |
+
temp_end_ = 0;
|
| 35 |
+
current_sample_ = 0;
|
| 36 |
+
start_ = 0;
|
| 37 |
+
buffer_.clear();
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
std::unordered_map<std::string, double>
|
| 41 |
+
VadIteratorOnnx::operator()(const std::vector<float>& x, bool return_seconds) {
|
| 42 |
+
std::unordered_map<std::string, double> result;
|
| 43 |
+
|
| 44 |
+
int window_size_samples = (sampling_rate_ == 16000) ? 512 : 256;
|
| 45 |
+
|
| 46 |
+
// 将新音频追加到缓存中
|
| 47 |
+
buffer_.insert(buffer_.end(), x.begin(), x.end());
|
| 48 |
+
while (buffer_.size() > 0) {
|
| 49 |
+
std::unordered_map<std::string, double> tmp;
|
| 50 |
+
std::vector<float> chunk(buffer_.begin(), buffer_.begin() + std::min(static_cast<int>(x.size()), window_size_samples));
|
| 51 |
+
// 补零到固定长度
|
| 52 |
+
if (chunk.size() < static_cast<size_t>(window_size_samples)) {
|
| 53 |
+
chunk.resize(window_size_samples, 0.0f);
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
current_sample_ += window_size_samples;
|
| 57 |
+
|
| 58 |
+
// 推理得到语音概率
|
| 59 |
+
auto [output, _] = (*model_)(chunk, sampling_rate_);
|
| 60 |
+
float speech_prob = output[0];
|
| 61 |
+
|
| 62 |
+
if (speech_prob >= threshold_ && temp_end_ > 0) {
|
| 63 |
+
temp_end_ = 0;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
if (speech_prob >= threshold_ && !triggered_) {
|
| 67 |
+
triggered_ = true;
|
| 68 |
+
start_ = std::max(0.0, current_sample_ - speech_pad_samples_ - window_size_samples);
|
| 69 |
+
tmp["start"] = return_seconds ? start_ / sampling_rate_ : start_;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
if (speech_prob < (threshold_ - 0.15) && triggered_) {
|
| 73 |
+
if (temp_end_ == 0) {
|
| 74 |
+
temp_end_ = current_sample_;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
if (current_sample_ - temp_end_ >= min_silence_samples_) {
|
| 78 |
+
double speech_end = temp_end_ + speech_pad_samples_ - window_size_samples;
|
| 79 |
+
tmp["end"] = return_seconds ? speech_end / sampling_rate_ : speech_end;
|
| 80 |
+
temp_end_ = 0;
|
| 81 |
+
triggered_ = false;
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
// 移除已处理的数据
|
| 86 |
+
std::vector<float>(buffer_.begin() + window_size_samples, buffer_.end()).swap(buffer_);
|
| 87 |
+
|
| 88 |
+
if (result.empty()) {
|
| 89 |
+
result = tmp;
|
| 90 |
+
} else if (!tmp.empty()) {
|
| 91 |
+
// 如果当前结果有 'end',更新最终 end
|
| 92 |
+
if (tmp.find("end") != tmp.end()) {
|
| 93 |
+
result["end"] = tmp["end"];
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
// 如果有新的 start,但前一个有 end,则合并成连续语音段
|
| 97 |
+
if (tmp.find("start") != tmp.end() && result.find("end") != result.end()) {
|
| 98 |
+
result.erase("end");
|
| 99 |
+
}
|
| 100 |
+
}
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
return result;
|
| 104 |
+
}
|
reference/cpp/vad_iterator_onnx.h
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <vector>
|
| 4 |
+
#include <string>
|
| 5 |
+
#include <unordered_map>
|
| 6 |
+
|
| 7 |
+
#include "onnx_wrapper.h"
|
| 8 |
+
|
| 9 |
+
class OnnxVadWrapper; // 前向声明
|
| 10 |
+
|
| 11 |
+
class VadIteratorOnnx {
|
| 12 |
+
public:
|
| 13 |
+
explicit VadIteratorOnnx(float threshold = 0.5,
|
| 14 |
+
int sampling_rate = 16000,
|
| 15 |
+
int min_silence_duration_ms = 100,
|
| 16 |
+
float max_speech_duration_s = INFINITY,
|
| 17 |
+
int speech_pad_ms = 30);
|
| 18 |
+
|
| 19 |
+
virtual ~VadIteratorOnnx();
|
| 20 |
+
|
| 21 |
+
// 重置内部状态
|
| 22 |
+
virtual void reset_states();
|
| 23 |
+
|
| 24 |
+
// 输入音频块,返回语音事件(start/end)
|
| 25 |
+
virtual std::unordered_map<std::string, double> operator()(const std::vector<float>& x, bool return_seconds = false);
|
| 26 |
+
|
| 27 |
+
private:
|
| 28 |
+
std::unique_ptr<OnnxVadWrapper> model_;
|
| 29 |
+
std::vector<float> buffer_; // 缓冲区用于保存未处理完的音频
|
| 30 |
+
float threshold_;
|
| 31 |
+
int sampling_rate_;
|
| 32 |
+
double min_silence_samples_;
|
| 33 |
+
double speech_pad_samples_;
|
| 34 |
+
bool triggered_;
|
| 35 |
+
double temp_end_;
|
| 36 |
+
double current_sample_;
|
| 37 |
+
double start_;
|
| 38 |
+
};
|
reference/python/__pycache__/audio_utils.cpython-312.pyc
ADDED
|
Binary file (2.21 kB). View file
|
|
|
reference/python/audio_utils.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import soundfile as sf
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
def audio_stream_generator(audio_file_path, chunk_size=4096, simulate_realtime=True):
|
| 6 |
+
"""
|
| 7 |
+
音频流生成器,从音频文件中读取数据并以流的方式输出
|
| 8 |
+
|
| 9 |
+
参数:
|
| 10 |
+
audio_file_path: 音频文件路径
|
| 11 |
+
chunk_size: 每个数据块的大小(采样点数)
|
| 12 |
+
simulate_realtime: 是否模拟实时流处理的速度
|
| 13 |
+
|
| 14 |
+
生成:
|
| 15 |
+
numpy.ndarray: 每次生成一个chunk_size大小的np.float32数据块
|
| 16 |
+
"""
|
| 17 |
+
# 加载音频文件
|
| 18 |
+
audio_data, sample_rate = sf.read(audio_file_path)
|
| 19 |
+
|
| 20 |
+
# 确保音频数据是float32类型
|
| 21 |
+
if audio_data.dtype != np.float32:
|
| 22 |
+
audio_data = audio_data.astype(np.float32)
|
| 23 |
+
|
| 24 |
+
# 如果是立体声,转换为单声道
|
| 25 |
+
if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
|
| 26 |
+
audio_data = audio_data.mean(axis=1)
|
| 27 |
+
|
| 28 |
+
print(f"已加载音频文件: {audio_file_path}")
|
| 29 |
+
print(f"采样率: {sample_rate} Hz")
|
| 30 |
+
print(f"音频长度: {len(audio_data)/sample_rate:.2f} 秒")
|
| 31 |
+
|
| 32 |
+
# 计算每个块的时长(秒)
|
| 33 |
+
chunk_duration = chunk_size / sample_rate if simulate_realtime else 0
|
| 34 |
+
|
| 35 |
+
# 按块生成数据
|
| 36 |
+
audio_len = len(audio_data)
|
| 37 |
+
for pos in range(0, audio_len, chunk_size):
|
| 38 |
+
# 获取当前块
|
| 39 |
+
end_pos = min(pos + chunk_size, audio_len)
|
| 40 |
+
chunk = audio_data[pos:end_pos]
|
| 41 |
+
|
| 42 |
+
# 如果块大小不足,用0填充
|
| 43 |
+
if len(chunk) < chunk_size:
|
| 44 |
+
padded_chunk = np.zeros(chunk_size, dtype=np.float32)
|
| 45 |
+
padded_chunk[:len(chunk)] = chunk
|
| 46 |
+
chunk = padded_chunk
|
| 47 |
+
|
| 48 |
+
# 模拟实时处理的延迟
|
| 49 |
+
if simulate_realtime:
|
| 50 |
+
time.sleep(chunk_duration)
|
| 51 |
+
|
| 52 |
+
yield chunk
|
| 53 |
+
|
| 54 |
+
print("音频流处理完成")
|
reference/python/test_vad.ipynb
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"from audio_utils import audio_stream_generator\n",
|
| 10 |
+
"import IPython.display as ipd\n",
|
| 11 |
+
"import sys\n"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "code",
|
| 16 |
+
"execution_count": 3,
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"sys.path.append(\"/Users/chenxiang/translator/core/vad_cpp/\")\n",
|
| 21 |
+
"from python.helpers.vadprocessor import FixedVADIterator\n"
|
| 22 |
+
]
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"cell_type": "code",
|
| 26 |
+
"execution_count": 4,
|
| 27 |
+
"metadata": {},
|
| 28 |
+
"outputs": [],
|
| 29 |
+
"source": [
|
| 30 |
+
"vac = FixedVADIterator(\n",
|
| 31 |
+
" threshold=0.5,\n",
|
| 32 |
+
" sampling_rate=16000,\n",
|
| 33 |
+
" # speech_pad_ms=10\n",
|
| 34 |
+
" min_silence_duration_ms = 100,\n",
|
| 35 |
+
" # speech_pad_ms = 30,\n",
|
| 36 |
+
" max_speech_duration_s=5.0,\n",
|
| 37 |
+
" )\n"
|
| 38 |
+
]
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"cell_type": "code",
|
| 42 |
+
"execution_count": 5,
|
| 43 |
+
"metadata": {},
|
| 44 |
+
"outputs": [],
|
| 45 |
+
"source": [
|
| 46 |
+
"SAMPLE_FILE_PATH = \"/Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3\"\n",
|
| 47 |
+
"SAMPLING_RATE = 16000\n",
|
| 48 |
+
"\n",
|
| 49 |
+
"chunks_generator = audio_stream_generator(SAMPLE_FILE_PATH, chunk_size=4096)\n",
|
| 50 |
+
"vac.reset_states()"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"cell_type": "code",
|
| 55 |
+
"execution_count": null,
|
| 56 |
+
"metadata": {},
|
| 57 |
+
"outputs": [
|
| 58 |
+
{
|
| 59 |
+
"name": "stdout",
|
| 60 |
+
"output_type": "stream",
|
| 61 |
+
"text": [
|
| 62 |
+
"已加载音频文件: /Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3\n",
|
| 63 |
+
"采样率: 44100 Hz\n",
|
| 64 |
+
"音频长度: 64.00 秒\n",
|
| 65 |
+
"None\n"
|
| 66 |
+
]
|
| 67 |
+
}
|
| 68 |
+
],
|
| 69 |
+
"source": [
|
| 70 |
+
"# speech_dict = vac(next(chunks_generator), return_seconds=False)\n",
|
| 71 |
+
"# print(speech_dict)\n"
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"cell_type": "code",
|
| 76 |
+
"execution_count": 6,
|
| 77 |
+
"metadata": {},
|
| 78 |
+
"outputs": [
|
| 79 |
+
{
|
| 80 |
+
"name": "stdout",
|
| 81 |
+
"output_type": "stream",
|
| 82 |
+
"text": [
|
| 83 |
+
"已加载音频文件: /Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3\n",
|
| 84 |
+
"采样率: 44100 Hz\n",
|
| 85 |
+
"音频长度: 64.00 秒\n",
|
| 86 |
+
"0 ->>> None\n",
|
| 87 |
+
"1 ->>> None\n",
|
| 88 |
+
"2 ->>> {'start': 10272}\n",
|
| 89 |
+
"3 ->>> None\n",
|
| 90 |
+
"4 ->>> None\n",
|
| 91 |
+
"5 ->>> None\n",
|
| 92 |
+
"6 ->>> None\n",
|
| 93 |
+
"7 ->>> None\n",
|
| 94 |
+
"8 ->>> None\n",
|
| 95 |
+
"9 ->>> None\n",
|
| 96 |
+
"10 ->>> None\n",
|
| 97 |
+
"11 ->>> None\n",
|
| 98 |
+
"12 ->>> None\n",
|
| 99 |
+
"13 ->>> {'end': 55264}\n",
|
| 100 |
+
"14 ->>> None\n",
|
| 101 |
+
"15 ->>> {'start': 60960}\n",
|
| 102 |
+
"16 ->>> None\n",
|
| 103 |
+
"17 ->>> None\n",
|
| 104 |
+
"18 ->>> None\n",
|
| 105 |
+
"19 ->>> None\n",
|
| 106 |
+
"20 ->>> {'end': 82912}\n",
|
| 107 |
+
"21 ->>> {'start': 89120}\n",
|
| 108 |
+
"22 ->>> None\n",
|
| 109 |
+
"23 ->>> None\n",
|
| 110 |
+
"24 ->>> None\n",
|
| 111 |
+
"25 ->>> None\n",
|
| 112 |
+
"26 ->>> None\n",
|
| 113 |
+
"27 ->>> None\n",
|
| 114 |
+
"28 ->>> None\n",
|
| 115 |
+
"29 ->>> None\n",
|
| 116 |
+
"30 ->>> None\n",
|
| 117 |
+
"31 ->>> None\n",
|
| 118 |
+
"32 ->>> None\n",
|
| 119 |
+
"33 ->>> None\n",
|
| 120 |
+
"34 ->>> None\n",
|
| 121 |
+
"35 ->>> None\n",
|
| 122 |
+
"36 ->>> None\n",
|
| 123 |
+
"37 ->>> None\n",
|
| 124 |
+
"38 ->>> None\n",
|
| 125 |
+
"39 ->>> None\n",
|
| 126 |
+
"40 ->>> None\n",
|
| 127 |
+
"41 ->>> None\n",
|
| 128 |
+
"42 ->>> None\n",
|
| 129 |
+
"43 ->>> None\n",
|
| 130 |
+
"44 ->>> None\n",
|
| 131 |
+
"45 ->>> None\n",
|
| 132 |
+
"46 ->>> None\n",
|
| 133 |
+
"47 ->>> None\n",
|
| 134 |
+
"48 ->>> None\n",
|
| 135 |
+
"49 ->>> None\n",
|
| 136 |
+
"50 ->>> {'end': 206816}\n",
|
| 137 |
+
"51 ->>> None\n",
|
| 138 |
+
"52 ->>> None\n",
|
| 139 |
+
"53 ->>> {'start': 219680}\n",
|
| 140 |
+
"54 ->>> None\n",
|
| 141 |
+
"55 ->>> None\n",
|
| 142 |
+
"56 ->>> None\n",
|
| 143 |
+
"57 ->>> None\n",
|
| 144 |
+
"58 ->>> None\n",
|
| 145 |
+
"59 ->>> None\n",
|
| 146 |
+
"60 ->>> None\n",
|
| 147 |
+
"61 ->>> None\n",
|
| 148 |
+
"62 ->>> None\n",
|
| 149 |
+
"63 ->>> None\n",
|
| 150 |
+
"64 ->>> None\n",
|
| 151 |
+
"65 ->>> None\n",
|
| 152 |
+
"66 ->>> None\n",
|
| 153 |
+
"67 ->>> None\n",
|
| 154 |
+
"68 ->>> None\n",
|
| 155 |
+
"69 ->>> None\n",
|
| 156 |
+
"70 ->>> None\n",
|
| 157 |
+
"71 ->>> None\n",
|
| 158 |
+
"72 ->>> None\n",
|
| 159 |
+
"73 ->>> None\n",
|
| 160 |
+
"74 ->>> None\n",
|
| 161 |
+
"75 ->>> None\n",
|
| 162 |
+
"76 ->>> None\n",
|
| 163 |
+
"77 ->>> None\n",
|
| 164 |
+
"78 ->>> None\n",
|
| 165 |
+
"79 ->>> None\n",
|
| 166 |
+
"80 ->>> None\n",
|
| 167 |
+
"81 ->>> None\n",
|
| 168 |
+
"82 ->>> None\n",
|
| 169 |
+
"83 ->>> None\n",
|
| 170 |
+
"84 ->>> None\n",
|
| 171 |
+
"85 ->>> None\n",
|
| 172 |
+
"86 ->>> None\n",
|
| 173 |
+
"87 ->>> None\n",
|
| 174 |
+
"88 ->>> None\n",
|
| 175 |
+
"89 ->>> None\n",
|
| 176 |
+
"90 ->>> None\n",
|
| 177 |
+
"91 ->>> None\n",
|
| 178 |
+
"92 ->>> None\n",
|
| 179 |
+
"93 ->>> None\n",
|
| 180 |
+
"94 ->>> None\n",
|
| 181 |
+
"95 ->>> None\n",
|
| 182 |
+
"96 ->>> {'end': 394720}\n",
|
| 183 |
+
"97 ->>> None\n",
|
| 184 |
+
"98 ->>> None\n",
|
| 185 |
+
"99 ->>> None\n",
|
| 186 |
+
"100 ->>> {'start': 410144}\n",
|
| 187 |
+
"101 ->>> None\n",
|
| 188 |
+
"102 ->>> None\n",
|
| 189 |
+
"103 ->>> None\n",
|
| 190 |
+
"104 ->>> None\n",
|
| 191 |
+
"105 ->>> None\n",
|
| 192 |
+
"106 ->>> None\n",
|
| 193 |
+
"107 ->>> None\n",
|
| 194 |
+
"108 ->>> None\n",
|
| 195 |
+
"109 ->>> None\n",
|
| 196 |
+
"110 ->>> None\n",
|
| 197 |
+
"111 ->>> None\n",
|
| 198 |
+
"112 ->>> None\n",
|
| 199 |
+
"113 ->>> None\n",
|
| 200 |
+
"114 ->>> None\n",
|
| 201 |
+
"115 ->>> None\n",
|
| 202 |
+
"116 ->>> None\n",
|
| 203 |
+
"117 ->>> None\n",
|
| 204 |
+
"118 ->>> None\n",
|
| 205 |
+
"119 ->>> None\n",
|
| 206 |
+
"120 ->>> None\n",
|
| 207 |
+
"121 ->>> None\n",
|
| 208 |
+
"122 ->>> {'end': 500192}\n",
|
| 209 |
+
"123 ->>> {'start': 503328}\n",
|
| 210 |
+
"124 ->>> {'end': 509920}\n",
|
| 211 |
+
"125 ->>> None\n",
|
| 212 |
+
"126 ->>> {'start': 519200}\n",
|
| 213 |
+
"127 ->>> None\n",
|
| 214 |
+
"128 ->>> None\n",
|
| 215 |
+
"129 ->>> None\n",
|
| 216 |
+
"130 ->>> None\n",
|
| 217 |
+
"131 ->>> None\n",
|
| 218 |
+
"132 ->>> None\n",
|
| 219 |
+
"133 ->>> None\n",
|
| 220 |
+
"134 ->>> None\n",
|
| 221 |
+
"135 ->>> {'end': 554976}\n",
|
| 222 |
+
"136 ->>> {'start': 556576}\n",
|
| 223 |
+
"137 ->>> None\n"
|
| 224 |
+
]
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"ename": "KeyboardInterrupt",
|
| 228 |
+
"evalue": "",
|
| 229 |
+
"output_type": "error",
|
| 230 |
+
"traceback": [
|
| 231 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 232 |
+
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
|
| 233 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m i = \u001b[32m0\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunks_generator\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# vad_iterator.reset_states()\u001b[39;49;00m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# audio_buffer = np.append(audio_buffer, chunk)\u001b[39;49;00m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mspeech_dict\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mvac\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_seconds\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mprint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mi\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m ->>> \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mspeech_dict\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
| 234 |
+
"\u001b[36mFile \u001b[39m\u001b[32m/Users/chenxiang/translator/core/vad_cpp/reference/归档/audio_utils.py:50\u001b[39m, in \u001b[36maudio_stream_generator\u001b[39m\u001b[34m(audio_file_path, chunk_size, simulate_realtime)\u001b[39m\n\u001b[32m 48\u001b[39m \u001b[38;5;66;03m# 模拟实时处理的延迟\u001b[39;00m\n\u001b[32m 49\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m simulate_realtime:\n\u001b[32m---> \u001b[39m\u001b[32m50\u001b[39m \u001b[43mtime\u001b[49m\u001b[43m.\u001b[49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk_duration\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 52\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m chunk\n\u001b[32m 54\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m音频流处理完成\u001b[39m\u001b[33m\"\u001b[39m)\n",
|
| 235 |
+
"\u001b[31mKeyboardInterrupt\u001b[39m: "
|
| 236 |
+
]
|
| 237 |
+
}
|
| 238 |
+
],
|
| 239 |
+
"source": [
|
| 240 |
+
"i = 0\n",
|
| 241 |
+
"for chunk in chunks_generator:\n",
|
| 242 |
+
" # vad_iterator.reset_states()\n",
|
| 243 |
+
" # audio_buffer = np.append(audio_buffer, chunk)\n",
|
| 244 |
+
" \n",
|
| 245 |
+
" speech_dict = vac(chunk, return_seconds=False)\n",
|
| 246 |
+
" print(f\"{i} ->>> {speech_dict}\")\n",
|
| 247 |
+
" # if speech_dict:\n",
|
| 248 |
+
" # print(speech_dict)\n",
|
| 249 |
+
" i+=1"
|
| 250 |
+
]
|
| 251 |
+
},
|
| 252 |
+
{
|
| 253 |
+
"cell_type": "code",
|
| 254 |
+
"execution_count": null,
|
| 255 |
+
"metadata": {},
|
| 256 |
+
"outputs": [],
|
| 257 |
+
"source": [
|
| 258 |
+
"audio_data, sample_rate = sf.read(audio_file_path)\n",
|
| 259 |
+
"\n",
|
| 260 |
+
"# 确保音频数据是float32类型\n",
|
| 261 |
+
"if audio_data.dtype != np.float32:\n",
|
| 262 |
+
" audio_data = audio_data.astype(np.float32)\n",
|
| 263 |
+
"\n",
|
| 264 |
+
"# 如果是立体声,转换为单声道\n",
|
| 265 |
+
"if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:\n",
|
| 266 |
+
" audio_data = audio_data.mean(axis=1)\n",
|
| 267 |
+
" \n",
|
| 268 |
+
"print(f\"已加载音频文件: {audio_file_path}\")\n",
|
| 269 |
+
"print(f\"采样率: {sample_rate} Hz\")\n",
|
| 270 |
+
"print(f\"音频长度: {len(audio_data)/sample_rate:.2f} 秒\")"
|
| 271 |
+
]
|
| 272 |
+
}
|
| 273 |
+
],
|
| 274 |
+
"metadata": {
|
| 275 |
+
"kernelspec": {
|
| 276 |
+
"display_name": "base",
|
| 277 |
+
"language": "python",
|
| 278 |
+
"name": "python3"
|
| 279 |
+
},
|
| 280 |
+
"language_info": {
|
| 281 |
+
"codemirror_mode": {
|
| 282 |
+
"name": "ipython",
|
| 283 |
+
"version": 3
|
| 284 |
+
},
|
| 285 |
+
"file_extension": ".py",
|
| 286 |
+
"mimetype": "text/x-python",
|
| 287 |
+
"name": "python",
|
| 288 |
+
"nbconvert_exporter": "python",
|
| 289 |
+
"pygments_lexer": "ipython3",
|
| 290 |
+
"version": "3.12.2"
|
| 291 |
+
}
|
| 292 |
+
},
|
| 293 |
+
"nbformat": 4,
|
| 294 |
+
"nbformat_minor": 2
|
| 295 |
+
}
|
silero_vad_onnx/CMakeLists.txt
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cmake_minimum_required(VERSION 3.16)
project(VadOnnx)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Add the ONNX Runtime include path.
# NOTE(review): ONNXRUNTIME_DIR is expected to be passed in by the caller
# (e.g. -DONNXRUNTIME_DIR=...) — it is not defined in this file.
include_directories(${ONNXRUNTIME_DIR}/include)

# Add this project's own header directory.
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

add_library(silero_vad_onnx SHARED ${CMAKE_CURRENT_SOURCE_DIR}/vad_iterator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/time_stamp.cpp)

# Library output-name override (cross-platform) — intentionally disabled,
# kept for reference:
# set_target_properties(silero_vad_onnx PROPERTIES
#     PREFIX ""
#     SUFFIX ".so"
#     LIBRARY_OUTPUT_NAME_DEBUG "silero_vad_onnx"
#     LIBRARY_OUTPUT_NAME_RELEASE "silero_vad_onnx"
# )

# Link the ONNX Runtime library, per platform.
if(APPLE)
    # macOS: link the dylib.
    target_link_libraries(silero_vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.dylib)
elseif(UNIX)
    # Linux: link the shared object.
    target_link_libraries(silero_vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.so)
elseif(WIN32)
    # Windows: link the import library; the runtime onnxruntime.dll must be
    # discoverable at load time.
    target_link_libraries(silero_vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/onnxruntime.lib)
    set_target_properties(silero_vad_onnx PROPERTIES SUFFIX ".dll")
else()
    message(WARNING "Unknown platform, no ONNX Runtime linking applied.")
endif()
|
| 39 |
+
|
silero_vad_onnx/time_stamp.cpp
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "time_stamp.h"
|
| 2 |
+
#include <cstdio>
|
| 3 |
+
#include <cstdarg>
|
| 4 |
+
#include <memory>
|
| 5 |
+
#include <string>
|
| 6 |
+
|
| 7 |
+
// Construct a timestamp; the defaults (-1, -1) represent an unset segment.
timestamp_t::timestamp_t(int s, int e)
    : start(s), end(e) {}
|
| 9 |
+
|
| 10 |
+
// Memberwise copy-assignment. Plain int copies are safe even under
// self-assignment, so no self-check is required.
timestamp_t& timestamp_t::operator=(const timestamp_t& other) {
    start = other.start;
    end = other.end;
    return *this;
}
|
| 17 |
+
|
| 18 |
+
// Two timestamps are equal when both endpoints match.
bool timestamp_t::operator==(const timestamp_t& other) const {
    return start == other.start && end == other.end;
}
|
| 21 |
+
|
| 22 |
+
// Render the timestamp as "{start:XXXXXXXX, end:XXXXXXXX}" (zero-padded
// sample indices).
// NOTE(review): despite the C-style name, this returns a std::string,
// not a const char*.
std::string timestamp_t::c_str() const {
    return format("{start:%08d, end:%08d}", start, end);
}
|
| 25 |
+
|
| 26 |
+
// printf-style formatting helper used by c_str().
//
// Formats into a fixed stack buffer first; if the result does not fit,
// re-formats into a heap-allocated string of exactly the required length.
// Returns an empty string on encoding error (vsnprintf < 0).
//
// Bug fix: the C++17 fallback previously constructed the result as
// std::string(len + 1, '\0') and returned it unchanged, so the returned
// string's size() was len + 1 and it carried a trailing embedded NUL.
// The string is now sized to exactly len characters.
std::string timestamp_t::format(const char* fmt, ...) const {
    char buf[256];
    va_list args;
    va_start(args, fmt);
    const auto r = std::vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);

    if (r < 0)
        return {};

    const size_t len = r;
    if (len < sizeof(buf))
        return std::string(buf, len);

#if __cplusplus >= 201703L
    // In C++17, data() is writable and vsnprintf may write the terminating
    // NUL into the string's own terminator slot at position len.
    std::string s(len, '\0');
    va_start(args, fmt);
    std::vsnprintf(s.data(), len + 1, fmt, args);
    va_end(args);
    return s;
#else
    std::unique_ptr<char[]> vbuf(new char[len + 1]);
    va_start(args, fmt);
    std::vsnprintf(vbuf.get(), len + 1, fmt, args);
    va_end(args);
    return std::string(vbuf.get(), len);
#endif
}
|
silero_vad_onnx/time_stamp.h
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef TIME_STAMP_H
|
| 2 |
+
#define TIME_STAMP_H
|
| 3 |
+
|
| 4 |
+
#include <string>
|
| 5 |
+
|
| 6 |
+
// timestamp_t class: stores the start and end (in samples) of a speech segment.
|
| 7 |
+
// timestamp_t: stores the start and end (in samples) of one detected
// speech segment. -1 marks an unset endpoint.
class timestamp_t {
public:
    int start;  // first sample of the segment, -1 when unset
    int end;    // last sample of the segment, -1 when unset

    // Defaults (-1, -1) represent an empty/unset segment.
    timestamp_t(int start = -1, int end = -1);

    // Memberwise copy-assignment.
    timestamp_t& operator=(const timestamp_t& a);

    // Equality: both endpoints match.
    bool operator==(const timestamp_t& a) const;

    // Returns a formatted string of the timestamp.
    // NOTE(review): returns std::string despite the C-style name.
    std::string c_str() const;

private:
    // printf-style helper used by c_str() for formatting.
    std::string format(const char* fmt, ...) const;
};
|
| 25 |
+
|
| 26 |
+
#endif // TIME_STAMP_H
|
silero_vad_onnx/vad_iterator.cpp
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "vad_iterator.h"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <limits>
#include <memory>
| 7 |
+
|
| 8 |
+
// Load the ONNX model from model_path and create the inference session.
// Session options (threads, graph optimization) must be configured before
// the session is constructed, hence the call order here.
void VadIterator::init_onnx_model(const std::string& model_path) {
    init_engine_threads(1, 1);  // single-threaded: VAD windows are tiny
    session = std::make_shared<Ort::Session>(env, model_path.c_str(), session_options);
}
|
| 12 |
+
|
| 13 |
+
// Configure the ONNX Runtime session options: full graph optimization and
// the requested inter-/intra-op thread-pool sizes. The three setters are
// independent of one another.
void VadIterator::init_engine_threads(int inter_threads, int intra_threads) {
    session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
    session_options.SetInterOpNumThreads(inter_threads);
    session_options.SetIntraOpNumThreads(intra_threads);
}
|
| 18 |
+
|
| 19 |
+
// Reset all streaming state so the iterator can process a fresh audio stream.
void VadIterator::reset_states() {
    // Zero the model's recurrent state and the inter-window context buffer.
    std::fill(_state.begin(), _state.end(), 0.0f);
    std::fill(_context.begin(), _context.end(), 0.0f);

    // Reset the speech/silence state machine.
    triggered = false;
    temp_end = 0;
    current_sample = 0;
    prev_end = 0;
    next_start = 0;

    // Drop any previously detected or in-progress segments.
    speeches.clear();
    current_speech = timestamp_t();
}
|
| 29 |
+
|
| 30 |
+
// Run the Silero VAD model on one window of audio and advance the streaming
// speech/silence state machine. data_chunk must hold window_size_samples
// samples; detected segments are appended to `speeches`.
void VadIterator::predict(const std::vector<float>& data_chunk) {
    // Prepend the saved context (tail of the previous window) so the model
    // sees effective_window_size = context_samples + window_size_samples.
    std::vector<float> new_data(effective_window_size, 0.0f);
    std::copy(_context.begin(), _context.end(), new_data.begin());
    std::copy(data_chunk.begin(), data_chunk.end(), new_data.begin() + context_samples);
    input = new_data;

    // Wrap input, recurrent state and sample rate as ORT tensors
    // (views over the existing buffers — no copies).
    Ort::Value input_ort = Ort::Value::CreateTensor<float>(
        memory_info, input.data(), input.size(), input_node_dims, 2);
    Ort::Value state_ort = Ort::Value::CreateTensor<float>(
        memory_info, _state.data(), _state.size(), state_node_dims, 3);
    Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
        memory_info, sr.data(), sr.size(), sr_node_dims, 1);

    ort_inputs.clear();
    ort_inputs.emplace_back(std::move(input_ort));
    ort_inputs.emplace_back(std::move(state_ort));
    ort_inputs.emplace_back(std::move(sr_ort));

    // Inference: outputs are [speech probability, updated recurrent state].
    ort_outputs = session->Run(
        Ort::RunOptions{nullptr},
        input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
        output_node_names.data(), output_node_names.size());

    float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
    float* stateN = ort_outputs[1].GetTensorMutableData<float>();
    // Carry the recurrent state forward to the next window.
    std::memcpy(_state.data(), stateN, size_state * sizeof(float));

    current_sample += static_cast<unsigned int>(window_size_samples);

    // Case 1: confident speech — open a segment if not already in one.
    if (speech_prob >= threshold) {
        if (temp_end != 0) {
            // Speech resumed before the silence timeout: cancel tentative end.
            temp_end = 0;
            if (next_start < prev_end)
                next_start = current_sample - window_size_samples;
        }
        if (!triggered) {
            triggered = true;
            // Segment starts at the beginning of this window.
            current_speech.start = current_sample - window_size_samples;
        }
        // Save the window tail as context for the next call.
        std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
        return;
    }

    // Case 2: segment exceeded max_speech_samples — force a split.
    if (triggered && ((current_sample - current_speech.start) > max_speech_samples)) {
        if (prev_end > 0) {
            // Split at the last tentative silence point.
            current_speech.end = prev_end;
            speeches.push_back(current_speech);
            current_speech = timestamp_t();
            if (next_start < prev_end)
                triggered = false;
            else
                current_speech.start = next_start;  // continue a new segment
            prev_end = 0;
            next_start = 0;
            temp_end = 0;
        } else {
            // No silence point available: split right here.
            current_speech.end = current_sample;
            speeches.push_back(current_speech);
            current_speech = timestamp_t();
            prev_end = 0;
            next_start = 0;
            temp_end = 0;
            triggered = false;
        }
        std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
        return;
    }

    // Case 3: hysteresis band (threshold - 0.15 .. threshold) — hold state.
    if ((speech_prob >= (threshold - 0.15)) && (speech_prob < threshold)) {
        std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
        return;
    }

    // Case 4: confident silence — close the segment once silence has lasted
    // min_silence_samples and the segment is long enough to keep.
    if (speech_prob < (threshold - 0.15)) {
        if (triggered) {
            if (temp_end == 0)
                temp_end = current_sample;  // mark tentative segment end
            // Remember a split point usable by the max-duration logic above.
            if (current_sample - temp_end > min_silence_samples_at_max_speech)
                prev_end = temp_end;
            if ((current_sample - temp_end) >= min_silence_samples) {
                current_speech.end = temp_end;
                // Discard segments shorter than min_speech_samples.
                if (current_speech.end - current_speech.start > min_speech_samples) {
                    speeches.push_back(current_speech);
                    current_speech = timestamp_t();
                    prev_end = 0;
                    next_start = 0;
                    temp_end = 0;
                    triggered = false;
                }
            }
        }
        std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
        return;
    }
}
|
| 125 |
+
|
| 126 |
+
void VadIterator::process(const std::vector<float>& input_wav) {
|
| 127 |
+
reset_states();
|
| 128 |
+
audio_length_samples = static_cast<int>(input_wav.size());
|
| 129 |
+
|
| 130 |
+
for (size_t j = 0; j < static_cast<size_t>(audio_length_samples); j += static_cast<size_t>(window_size_samples)) {
|
| 131 |
+
if (j + static_cast<size_t>(window_size_samples) > static_cast<size_t>(audio_length_samples))
|
| 132 |
+
break;
|
| 133 |
+
std::vector<float> chunk(&input_wav[j], &input_wav[j] + window_size_samples);
|
| 134 |
+
predict(chunk);
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
if (current_speech.start >= 0) {
|
| 138 |
+
current_speech.end = audio_length_samples;
|
| 139 |
+
speeches.push_back(current_speech);
|
| 140 |
+
current_speech = timestamp_t();
|
| 141 |
+
prev_end = 0;
|
| 142 |
+
next_start = 0;
|
| 143 |
+
temp_end = 0;
|
| 144 |
+
triggered = false;
|
| 145 |
+
}
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
// Segments detected by the most recent process() call, in sample indices.
// The reference remains valid until the next process()/reset() call.
const std::vector<timestamp_t>& VadIterator::get_speech_timestamps() const {
    return speeches;
}
|
| 151 |
+
|
| 152 |
+
// Public reset: clears model state, the state machine, and stored segments.
void VadIterator::reset() {
    reset_states();
}
|
| 155 |
+
|
| 156 |
+
// 构造函数实现
|
| 157 |
+
VadIterator::VadIterator(const std::string ModelPath,
|
| 158 |
+
int Sample_rate,
|
| 159 |
+
int windows_frame_size,
|
| 160 |
+
float Threshold,
|
| 161 |
+
int min_silence_duration_ms,
|
| 162 |
+
int speech_pad_ms,
|
| 163 |
+
int min_speech_duration_ms,
|
| 164 |
+
float max_speech_duration_s)
|
| 165 |
+
: sample_rate(Sample_rate),
|
| 166 |
+
threshold(Threshold),
|
| 167 |
+
speech_pad_samples(speech_pad_ms),
|
| 168 |
+
prev_end(0),
|
| 169 |
+
memory_info(Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtDeviceAllocator, OrtMemType::OrtMemTypeDefault))
|
| 170 |
+
{
|
| 171 |
+
|
| 172 |
+
sr_per_ms = sample_rate / 1000;
|
| 173 |
+
window_size_samples = windows_frame_size * sr_per_ms;
|
| 174 |
+
effective_window_size = window_size_samples + context_samples;
|
| 175 |
+
|
| 176 |
+
input_node_dims[0] = 1;
|
| 177 |
+
input_node_dims[1] = effective_window_size;
|
| 178 |
+
|
| 179 |
+
_state.resize(size_state);
|
| 180 |
+
sr.resize(1);
|
| 181 |
+
sr[0] = sample_rate;
|
| 182 |
+
_context.assign(context_samples, 0.0f);
|
| 183 |
+
|
| 184 |
+
min_speech_samples = sr_per_ms * min_speech_duration_ms;
|
| 185 |
+
|
| 186 |
+
if (max_speech_duration_s < 0) {
|
| 187 |
+
max_speech_samples = std::numeric_limits<float>::infinity();
|
| 188 |
+
} else {
|
| 189 |
+
max_speech_samples = (sample_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples);
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
min_silence_samples = sr_per_ms * min_silence_duration_ms;
|
| 193 |
+
min_silence_samples_at_max_speech = sr_per_ms * 98;
|
| 194 |
+
|
| 195 |
+
init_onnx_model(ModelPath);
|
| 196 |
+
}
|
silero_vad_onnx/vad_iterator.h
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef VAD_ITERATOR_H
|
| 2 |
+
#define VAD_ITERATOR_H
|
| 3 |
+
|
| 4 |
+
#include "time_stamp.h"
|
| 5 |
+
#include <vector>
|
| 6 |
+
#include <string>
|
| 7 |
+
#if defined(__APPLE__)
|
| 8 |
+
#include <onnxruntime/onnxruntime_cxx_api.h>
|
| 9 |
+
#else
|
| 10 |
+
#include "onnxruntime_run_options_config_keys.h"
|
| 11 |
+
#include "onnxruntime_cxx_api.h"
|
| 12 |
+
#endif
|
| 13 |
+
// 前向声明 timestamp_t
|
| 14 |
+
class timestamp_t;
|
| 15 |
+
|
| 16 |
+
// Streaming Silero VAD wrapper around an ONNX Runtime session.
// Feed a whole waveform to process() and read segments from
// get_speech_timestamps(). Not thread-safe: all state is per-instance
// and mutated during inference.
class VadIterator {
public:
    // Construct from a model path and detection parameters.
    // max_speech_duration_s < 0 means "no limit".
    VadIterator(const std::string ModelPath,
                int Sample_rate = 16000,
                int windows_frame_size = 32,
                float Threshold = 0.5,
                int min_silence_duration_ms = 100,
                int speech_pad_ms = 30,
                int min_speech_duration_ms = 250,
                float max_speech_duration_s = -1); // -1 means unlimited

    // Run VAD over a complete waveform (resets state first).
    void process(const std::vector<float>& input_wav);
    // Segments (in samples) found by the last process() call.
    const std::vector<timestamp_t>& get_speech_timestamps() const;
    // Clear model state, the state machine, and stored segments.
    void reset();

private:
    // ONNX Runtime resources.
    Ort::Env env;
    Ort::SessionOptions session_options;
    std::shared_ptr<Ort::Session> session = nullptr;
    Ort::AllocatorWithDefaultOptions allocator;
    Ort::MemoryInfo memory_info;

    // Context carried between windows (tail of the previous window that is
    // prepended to the next model input).
    const int context_samples = 64;
    std::vector<float> _context;
    int window_size_samples;   // samples per analysis window
    int effective_window_size; // window + context fed to the model
    int sr_per_ms;             // samples per millisecond

    // ONNX input/output plumbing.
    std::vector<Ort::Value> ort_inputs;
    std::vector<const char*> input_node_names = {"input", "state", "sr"};
    std::vector<float> input;
    unsigned int size_state = 2 * 1 * 128; // recurrent state element count
    std::vector<float> _state;
    std::vector<int64_t> sr;
    int64_t input_node_dims[2];
    const int64_t state_node_dims[3] = {2, 1, 128};
    const int64_t sr_node_dims[1] = {1};
    std::vector<Ort::Value> ort_outputs;
    std::vector<const char*> output_node_names = {"output", "stateN"};

    // Detection parameters (sample-based thresholds derived from the
    // millisecond constructor arguments).
    int sample_rate;
    float threshold;
    int min_silence_samples;
    int min_silence_samples_at_max_speech;
    int min_speech_samples;
    float max_speech_samples;
    int speech_pad_samples;
    int audio_length_samples;

    // Streaming state-machine variables.
    bool triggered = false;          // currently inside a speech segment
    unsigned int temp_end = 0;       // tentative segment end during silence
    unsigned int current_sample = 0; // absolute position in the stream
    int prev_end;                    // last viable split point (samples)
    int next_start = 0;              // start of a continuation segment
    std::vector<timestamp_t> speeches; // completed segments
    timestamp_t current_speech;        // segment under construction

    // Internal helpers.
    void init_onnx_model(const std::string& model_path);
    void init_engine_threads(int inter_threads, int intra_threads);
    void reset_states();
    void predict(const std::vector<float>& data_chunk);
};
|
| 86 |
+
|
| 87 |
+
#endif // VAD_ITERATOR_H
|
silero_vad_onnx/wav.h
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Copyright (c) 2016 Personal (Binbin Zhang)
|
| 2 |
+
//
|
| 3 |
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
// you may not use this file except in compliance with the License.
|
| 5 |
+
// You may obtain a copy of the License at
|
| 6 |
+
//
|
| 7 |
+
// http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
//
|
| 9 |
+
// Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
// See the License for the specific language governing permissions and
|
| 13 |
+
// limitations under the License.
|
| 14 |
+
|
| 15 |
+
#ifndef FRONTEND_WAV_H_
|
| 16 |
+
#define FRONTEND_WAV_H_
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
#include <assert.h>
|
| 20 |
+
#include <stdint.h>
|
| 21 |
+
#include <stdio.h>
|
| 22 |
+
#include <stdlib.h>
|
| 23 |
+
#include <string.h>
|
| 24 |
+
|
| 25 |
+
#include <string>
|
| 26 |
+
|
| 27 |
+
#include <iostream>
|
| 28 |
+
|
| 29 |
+
// #include "utils/log.h"
|
| 30 |
+
|
| 31 |
+
namespace wav {
|
| 32 |
+
|
| 33 |
+
// Canonical 44-byte RIFF/WAVE header layout. WavWriter memcpy's a 44-byte
// byte template over this struct, so it must stay exactly 44 bytes with no
// padding — all fields are naturally aligned on common ABIs, but this is an
// implicit assumption (NOTE(review): a static_assert(sizeof(WavHeader) == 44)
// elsewhere would lock it in).
struct WavHeader {
  char riff[4];                 // chunk id, "RIFF"
  unsigned int size;            // file size minus 8 bytes
  char wav[4];                  // format tag, "WAVE"
  char fmt[4];                  // sub-chunk id, "fmt "
  unsigned int fmt_size;        // size of the fmt sub-chunk (16 for plain PCM)
  uint16_t format;              // 1 = integer PCM, 3 = IEEE float
  uint16_t channels;            // number of interleaved channels
  unsigned int sample_rate;     // frames per second
  unsigned int bytes_per_second;// sample_rate * block_size
  uint16_t block_size;          // bytes per frame (channels * bytes/sample)
  uint16_t bit;                 // bits per sample (8 / 16 / 32)
  char data[4];                 // sub-chunk id, "data" (reused as a scratch
                                // buffer while skipping other sub-chunks)
  unsigned int data_size;       // payload size of the current sub-chunk
};
|
| 48 |
+
|
| 49 |
+
class WavReader {
|
| 50 |
+
public:
|
| 51 |
+
WavReader() : data_(nullptr) {}
|
| 52 |
+
explicit WavReader(const std::string& filename) { Open(filename); }
|
| 53 |
+
|
| 54 |
+
bool Open(const std::string& filename) {
|
| 55 |
+
FILE* fp = fopen(filename.c_str(), "rb"); //文件读取
|
| 56 |
+
if (NULL == fp) {
|
| 57 |
+
std::cout << "Error in read " << filename;
|
| 58 |
+
return false;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
WavHeader header;
|
| 62 |
+
fread(&header, 1, sizeof(header), fp);
|
| 63 |
+
if (header.fmt_size < 16) {
|
| 64 |
+
printf("WaveData: expect PCM format data "
|
| 65 |
+
"to have fmt chunk of at least size 16.\n");
|
| 66 |
+
return false;
|
| 67 |
+
} else if (header.fmt_size > 16) {
|
| 68 |
+
int offset = 44 - 8 + header.fmt_size - 16;
|
| 69 |
+
fseek(fp, offset, SEEK_SET);
|
| 70 |
+
fread(header.data, 8, sizeof(char), fp);
|
| 71 |
+
}
|
| 72 |
+
// check "riff" "WAVE" "fmt " "data"
|
| 73 |
+
|
| 74 |
+
// Skip any sub-chunks between "fmt" and "data". Usually there will
|
| 75 |
+
// be a single "fact" sub chunk, but on Windows there can also be a
|
| 76 |
+
// "list" sub chunk.
|
| 77 |
+
while (0 != strncmp(header.data, "data", 4)) {
|
| 78 |
+
// We will just ignore the data in these chunks.
|
| 79 |
+
fseek(fp, header.data_size, SEEK_CUR);
|
| 80 |
+
// read next sub chunk
|
| 81 |
+
fread(header.data, 8, sizeof(char), fp);
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
if (header.data_size == 0) {
|
| 85 |
+
int offset = ftell(fp);
|
| 86 |
+
fseek(fp, 0, SEEK_END);
|
| 87 |
+
header.data_size = ftell(fp) - offset;
|
| 88 |
+
fseek(fp, offset, SEEK_SET);
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
num_channel_ = header.channels;
|
| 92 |
+
sample_rate_ = header.sample_rate;
|
| 93 |
+
bits_per_sample_ = header.bit;
|
| 94 |
+
int num_data = header.data_size / (bits_per_sample_ / 8);
|
| 95 |
+
data_ = new float[num_data]; // Create 1-dim array
|
| 96 |
+
num_samples_ = num_data / num_channel_;
|
| 97 |
+
|
| 98 |
+
std::cout << "num_channel_ :" << num_channel_ << std::endl;
|
| 99 |
+
std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
|
| 100 |
+
std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
|
| 101 |
+
std::cout << "num_samples :" << num_data << std::endl;
|
| 102 |
+
std::cout << "num_data_size :" << header.data_size << std::endl;
|
| 103 |
+
|
| 104 |
+
switch (bits_per_sample_) {
|
| 105 |
+
case 8: {
|
| 106 |
+
char sample;
|
| 107 |
+
for (int i = 0; i < num_data; ++i) {
|
| 108 |
+
fread(&sample, 1, sizeof(char), fp);
|
| 109 |
+
data_[i] = static_cast<float>(sample) / 32768;
|
| 110 |
+
}
|
| 111 |
+
break;
|
| 112 |
+
}
|
| 113 |
+
case 16: {
|
| 114 |
+
int16_t sample;
|
| 115 |
+
for (int i = 0; i < num_data; ++i) {
|
| 116 |
+
fread(&sample, 1, sizeof(int16_t), fp);
|
| 117 |
+
data_[i] = static_cast<float>(sample) / 32768;
|
| 118 |
+
}
|
| 119 |
+
break;
|
| 120 |
+
}
|
| 121 |
+
case 32:
|
| 122 |
+
{
|
| 123 |
+
if (header.format == 1) //S32
|
| 124 |
+
{
|
| 125 |
+
int sample;
|
| 126 |
+
for (int i = 0; i < num_data; ++i) {
|
| 127 |
+
fread(&sample, 1, sizeof(int), fp);
|
| 128 |
+
data_[i] = static_cast<float>(sample) / 32768;
|
| 129 |
+
}
|
| 130 |
+
}
|
| 131 |
+
else if (header.format == 3) // IEEE-float
|
| 132 |
+
{
|
| 133 |
+
float sample;
|
| 134 |
+
for (int i = 0; i < num_data; ++i) {
|
| 135 |
+
fread(&sample, 1, sizeof(float), fp);
|
| 136 |
+
data_[i] = static_cast<float>(sample);
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
else {
|
| 140 |
+
printf("unsupported quantization bits\n");
|
| 141 |
+
}
|
| 142 |
+
break;
|
| 143 |
+
}
|
| 144 |
+
default:
|
| 145 |
+
printf("unsupported quantization bits\n");
|
| 146 |
+
break;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
fclose(fp);
|
| 150 |
+
return true;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
int num_channel() const { return num_channel_; }
|
| 154 |
+
int sample_rate() const { return sample_rate_; }
|
| 155 |
+
int bits_per_sample() const { return bits_per_sample_; }
|
| 156 |
+
int num_samples() const { return num_samples_; }
|
| 157 |
+
|
| 158 |
+
~WavReader() {
|
| 159 |
+
delete[] data_;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
const float* data() const { return data_; }
|
| 163 |
+
|
| 164 |
+
private:
|
| 165 |
+
int num_channel_;
|
| 166 |
+
int sample_rate_;
|
| 167 |
+
int bits_per_sample_;
|
| 168 |
+
int num_samples_; // sample points per channel
|
| 169 |
+
float* data_;
|
| 170 |
+
};
|
| 171 |
+
|
| 172 |
+
class WavWriter {
|
| 173 |
+
public:
|
| 174 |
+
WavWriter(const float* data, int num_samples, int num_channel,
|
| 175 |
+
int sample_rate, int bits_per_sample)
|
| 176 |
+
: data_(data),
|
| 177 |
+
num_samples_(num_samples),
|
| 178 |
+
num_channel_(num_channel),
|
| 179 |
+
sample_rate_(sample_rate),
|
| 180 |
+
bits_per_sample_(bits_per_sample) {}
|
| 181 |
+
|
| 182 |
+
void Write(const std::string& filename) {
|
| 183 |
+
FILE* fp = fopen(filename.c_str(), "w");
|
| 184 |
+
// init char 'riff' 'WAVE' 'fmt ' 'data'
|
| 185 |
+
WavHeader header;
|
| 186 |
+
char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
|
| 187 |
+
0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
|
| 188 |
+
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
| 189 |
+
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
| 190 |
+
0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
|
| 191 |
+
memcpy(&header, wav_header, sizeof(header));
|
| 192 |
+
header.channels = num_channel_;
|
| 193 |
+
header.bit = bits_per_sample_;
|
| 194 |
+
header.sample_rate = sample_rate_;
|
| 195 |
+
header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
|
| 196 |
+
header.size = sizeof(header) - 8 + header.data_size;
|
| 197 |
+
header.bytes_per_second =
|
| 198 |
+
sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
|
| 199 |
+
header.block_size = num_channel_ * (bits_per_sample_ / 8);
|
| 200 |
+
|
| 201 |
+
fwrite(&header, 1, sizeof(header), fp);
|
| 202 |
+
|
| 203 |
+
for (int i = 0; i < num_samples_; ++i) {
|
| 204 |
+
for (int j = 0; j < num_channel_; ++j) {
|
| 205 |
+
switch (bits_per_sample_) {
|
| 206 |
+
case 8: {
|
| 207 |
+
char sample = static_cast<char>(data_[i * num_channel_ + j]);
|
| 208 |
+
fwrite(&sample, 1, sizeof(sample), fp);
|
| 209 |
+
break;
|
| 210 |
+
}
|
| 211 |
+
case 16: {
|
| 212 |
+
int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
|
| 213 |
+
fwrite(&sample, 1, sizeof(sample), fp);
|
| 214 |
+
break;
|
| 215 |
+
}
|
| 216 |
+
case 32: {
|
| 217 |
+
int sample = static_cast<int>(data_[i * num_channel_ + j]);
|
| 218 |
+
fwrite(&sample, 1, sizeof(sample), fp);
|
| 219 |
+
break;
|
| 220 |
+
}
|
| 221 |
+
}
|
| 222 |
+
}
|
| 223 |
+
}
|
| 224 |
+
fclose(fp);
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
private:
|
| 228 |
+
const float* data_;
|
| 229 |
+
int num_samples_; // total float points in data_
|
| 230 |
+
int num_channel_;
|
| 231 |
+
int sample_rate_;
|
| 232 |
+
int bits_per_sample_;
|
| 233 |
+
};
|
| 234 |
+
|
| 235 |
+
} // namespace wav
|
| 236 |
+
|
| 237 |
+
#endif // FRONTEND_WAV_H_
|
vad_onnx/CMakeLists.txt
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cmake_minimum_required(VERSION 3.16)
project(VadOnnx)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

add_library(vad_onnx SHARED ${CMAKE_CURRENT_SOURCE_DIR}/vad_onnx.cpp)

# Scope include paths to this target instead of the whole directory
# (modern-CMake replacement for the old include_directories() calls).
target_include_directories(vad_onnx
    PRIVATE
        ${ONNXRUNTIME_DIR}/include    # ONNX Runtime headers
        ${CMAKE_CURRENT_SOURCE_DIR}   # project headers (vad_onnx.h)
)

# Link the platform-specific ONNX Runtime shared library.
if(APPLE)
    # macOS links against the dylib
    target_link_libraries(vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.dylib)
elseif(UNIX)
    # Linux links against the so
    target_link_libraries(vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.so)
elseif(WIN32)
    # Windows links against the import lib; runtime is the dll
    target_link_libraries(vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/onnxruntime.lib)
    set_target_properties(vad_onnx PROPERTIES SUFFIX ".dll")
else()
    message(WARNING "Unknown platform, no ONNX Runtime linking applied.")
endif()
vad_onnx/vad_onnx.cpp
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <stdexcept>
|
| 2 |
+
#include <cmath>
|
| 3 |
+
#include <iostream>
|
| 4 |
+
|
| 5 |
+
#include "vad_onnx.h"
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
// Collect the session's input tensor names. The std::string vector owns the
// storage; the const char* vector mirrors it for onnxruntime's C-style API,
// so both vectors must stay alive (and unmodified) together.
static void get_input_names(Ort::Session* session, std::vector<std::string> &input_names_str,
                            std::vector<const char *> &input_names_char) {
    Ort::AllocatorWithDefaultOptions allocator;
    const size_t count = session->GetInputCount();
    input_names_str.resize(count);
    input_names_char.resize(count);

    for (size_t idx = 0; idx < count; ++idx) {
        // Copy the allocated name into our own string before the
        // AllocatedStringPtr temporary releases it.
        input_names_str[idx] = session->GetInputNameAllocated(idx, allocator).get();
        input_names_char[idx] = input_names_str[idx].c_str();
    }
}
|
| 21 |
+
|
| 22 |
+
// Collect the session's output tensor names, mirroring get_input_names:
// strings own the storage, the char* vector aliases them for the C API.
static void get_output_names(Ort::Session* session, std::vector<std::string> &output_names_,
                             std::vector<const char *> &vad_out_names_) {
    Ort::AllocatorWithDefaultOptions allocator;
    const size_t count = session->GetOutputCount();
    output_names_.resize(count);
    vad_out_names_.resize(count);

    for (size_t idx = 0; idx < count; ++idx) {
        // Copy out of the AllocatedStringPtr before it frees the buffer.
        output_names_[idx] = session->GetOutputNameAllocated(idx, allocator).get();
        vad_out_names_[idx] = output_names_[idx].c_str();
    }
}
|
| 34 |
+
|
| 35 |
+
// Builds the VAD engine: loads the ONNX model, resolves its tensor names,
// and sizes the recurrent-state / context buffers.
//
//   model_path              path to the silero-style VAD ONNX model
//   batch_size              batch dimension of the RNN state (usually 1)
//   thread_num              stored in thread_num_ but never applied —
//                           NOTE(review): init_onnx_model hard-codes 1/1
//                           threads; confirm intended use
//   threshold               speech-probability trigger threshold
//   sampling_rate           sample rate in Hz (16000 uses a 64-sample context,
//                           anything else 32)
//   min_silence_duration_ms silence required before a segment is closed
//   max_speech_duration_s   accepted but unused in the visible code — TODO
//   speech_pad_ms           converted to speech_pad_samples_ (unused in the
//                           visible detection code)
VadOnnx::VadOnnx(const std::string& model_path,
                 int batch_size,
                 int thread_num,
                 float threshold,
                 int sampling_rate,
                 int min_silence_duration_ms,
                 float max_speech_duration_s,
                 int speech_pad_ms)
    : batch_size_(batch_size),
      thread_num_(thread_num),
      threshold_(threshold),
      sample_rates_(sampling_rate),
      // Durations are converted to sample counts once, up front.
      min_silence_samples_(sampling_rate * min_silence_duration_ms / 1000.0),
      speech_pad_samples_(sampling_rate * speech_pad_ms / 1000.0),
      triggered_(false),
      temp_end_(0),
      current_sample_(0),
      start_(0),
      memory_info(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU))
{
    // Session must exist before its input/output names can be queried.
    init_onnx_model(model_path);

    get_input_names(session.get(), input_names_, vad_in_names_);
    get_output_names(session.get(), output_names_, vad_out_names_);

    // Sample-rate tensor is a single int64 value.
    sr.resize(1);
    sr[0] = sample_rates_;

    // Default state shape {2, 1, 128} covers batch_size == 1; otherwise the
    // batch dimension is widened. NOTE(review): input_node_shape[0] below
    // stays 1 even when batch_size_ != 1 — confirm batched inference works.
    if (batch_size_ != 1) {
        state_shape = {2, batch_size_, 128};
        state_size = 2 * batch_size_ * 128;
    }
    state_.resize(state_size);

    // Silero prepends a short context from the previous window: 64 samples
    // at 16 kHz, 32 otherwise.
    context_size = (sample_rates_ == 16000) ? 64 : 32;
    context_.resize(context_size);

    // Model input is context + window. NOTE(review): window_size_samples
    // keeps its 512 default regardless of rate, though the header comment
    // says 8000 Hz -> 256 — confirm for 8 kHz input.
    effective_window_size = window_size_samples + context_size;
    input_node_shape[0] = 1;
    input_node_shape[1] = effective_window_size;

    reset_states();
}

// All members are RAII (vectors, unique_ptr, Ort handles); nothing manual
// to release.
VadOnnx::~VadOnnx() = default;
|
| 81 |
+
|
| 82 |
+
// Clear all inference state so the next audio starts a fresh utterance:
// zero the RNN state and context window, and reset the streaming
// speech-segment bookkeeping.
void VadOnnx::reset_states() {
    std::fill(state_.begin(), state_.end(), 0.0f);
    std::fill(context_.begin(), context_.end(), 0.0f);

    current_sample_ = 0;
    temp_end_ = 0;
    start_ = 0;
    triggered_ = false;

    last_batch_size_ = 0;
    last_sr_ = 0;
}
|
| 92 |
+
|
| 93 |
+
// Runs one model step on a single audio window.
// `data_chunk` is expected to hold window_size_samples samples; the tail of
// the previous window (context_) is prepended so the model sees
// effective_window_size samples in total. Updates the recurrent state and
// context buffers in place and returns the window's speech probability.
float VadOnnx::forward_infer(std::vector<float>& data_chunk) {
    // Concatenate context and the new chunk into one model input buffer.
    std::vector<float> x_with_context(effective_window_size, 0.0f);
    std::copy(context_.begin(), context_.end(), x_with_context.begin());
    std::copy(data_chunk.begin(), data_chunk.end(), x_with_context.begin() + context_size);
    // `input` is a member so its storage outlives the tensor view below.
    input = x_with_context;

    // Prepare inputs — CreateTensor wraps existing memory, it does not copy,
    // so input/state_/sr must stay alive until Run() returns.
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
        memory_info, input.data(), input.size(), input_node_shape.data(), 2);
    Ort::Value state_tensor = Ort::Value::CreateTensor<float>(
        memory_info, state_.data(), state_.size(), state_shape.data(), 3);
    Ort::Value sr_tensor = Ort::Value::CreateTensor<int64_t>(
        memory_info, sr.data(), 1, sr_shape.data(), 1);

    ort_inputs.clear();
    ort_inputs.emplace_back(std::move(input_tensor));
    ort_inputs.emplace_back(std::move(state_tensor));
    ort_inputs.emplace_back(std::move(sr_tensor));

    // Run inference
    ort_outputs = session->Run(
        Ort::RunOptions{nullptr}, vad_in_names_.data(), ort_inputs.data(),
        ort_inputs.size(), vad_out_names_.data(), vad_out_names_.size());

    // Output 0 is assumed to be the scalar speech probability — per the
    // silero model signature; confirm against the model being loaded.
    float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];

    // Output 1 is assumed to be the updated RNN state; copy it back so the
    // next call continues from here.
    float* stateN = ort_outputs[1].GetTensorMutableData<float>();
    std::memcpy(state_.data(), stateN, state_size * sizeof(float));

    // Keep the last context_size samples as context for the next window.
    std::copy(x_with_context.end() - context_size, x_with_context.end(), context_.begin());

    return speech_prob;
}
|
| 131 |
+
|
| 132 |
+
std::vector<float> VadOnnx::vad_dectect(std::vector<float>& audio) {
|
| 133 |
+
std::vector<float> result;
|
| 134 |
+
|
| 135 |
+
// Pad to multiple of num_samples
|
| 136 |
+
int pad_num = (window_size_samples - (audio.size() % window_size_samples)) % window_size_samples;
|
| 137 |
+
audio.insert(audio.end(), pad_num, 0.0f);
|
| 138 |
+
|
| 139 |
+
for (size_t i = 0; i < audio.size(); i += window_size_samples) {
|
| 140 |
+
std::vector<float> chunk(audio.begin() + i, audio.begin() + i + window_size_samples);
|
| 141 |
+
auto prob = forward_infer(chunk);
|
| 142 |
+
result.emplace_back(prob);
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
return result;
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
// Streaming speech-segment detection. Audio may arrive in arbitrary-sized
// pieces; it is appended to an internal buffer and consumed one
// window_size_samples chunk per iteration (the final partial chunk is
// zero-padded). Returns a map with optional "start" / "end" keys — as
// seconds if return_seconds, otherwise as sample offsets. Detector state
// (triggered_, temp_end_, current_sample_) persists across calls.
std::map<std::string, double> VadOnnx::vad_dectect(std::vector<float>& audio, bool return_seconds) {
    std::map<std::string, double> result;

    // Append the new audio to the carry-over buffer.
    buffer_.insert(buffer_.end(), audio.begin(), audio.end());

    while (buffer_.size() > 0) {
        // Events (start/end) detected for this one window.
        std::map<std::string, double> tmp;
        std::vector<float> chunk(buffer_.begin(), buffer_.begin() + std::min(static_cast<int>(buffer_.size()), window_size_samples));
        // Zero-pad a trailing partial chunk to the fixed window length.
        if (chunk.size() < static_cast<size_t>(window_size_samples)) {
            chunk.resize(window_size_samples, 0.0f);
        }

        current_sample_ += window_size_samples;

        // Run the model to get this window's speech probability.
        float speech_prob = forward_infer(chunk);

        // Speech resumed before the silence timeout — cancel the pending end.
        if (speech_prob >= threshold_ && temp_end_ > 0) {
            temp_end_ = 0;
        }

        // Rising edge: record the segment start (start of this window).
        if (speech_prob >= threshold_ && !triggered_) {
            triggered_ = true;
            start_ = std::max(0.0, current_sample_ - window_size_samples);
            tmp["start"] = return_seconds ? start_ / sample_rates_ : start_;
        }

        // Falling edge uses a 0.15 hysteresis below the trigger threshold.
        if (speech_prob < (threshold_ - 0.15) && triggered_) {
            if (temp_end_ == 0) {
                temp_end_ = current_sample_;  // remember where silence began
            }

            // Close the segment only after enough continuous silence.
            if (current_sample_ - temp_end_ >= min_silence_samples_) {
                double speech_end = temp_end_;
                tmp["end"] = return_seconds ? speech_end / sample_rates_ : speech_end;
                temp_end_ = 0;
                triggered_ = false;
            }
        }

        // Drop the consumed window from the front of the buffer.
        if (window_size_samples >= buffer_.size()) {
            buffer_.clear();  // everything consumed (incl. padded tail)
        } else {
            std::copy(buffer_.begin() + window_size_samples, buffer_.end(), buffer_.begin());
            buffer_.resize(buffer_.size() - window_size_samples);
        }

        // Merge this window's events into the overall result.
        if (result.empty()) {
            result = tmp;
        } else if (!tmp.empty()) {
            // A new "end" supersedes any earlier one.
            if (tmp.find("end") != tmp.end()) {
                result["end"] = tmp["end"];
            }

            // A new "start" after an "end" means speech resumed: treat it as
            // one continuous segment by discarding the intermediate end.
            if (tmp.find("start") != tmp.end() && result.find("end") != result.end()) {
                result.erase("end");
            }
        }
    }

    return result;
}
|
| 216 |
+
|
| 217 |
+
// Creates the ONNX Runtime environment and session for `model_path`.
// NOTE(review): threading is hard-coded to 1 inter / 1 intra op thread —
// the thread_num_ member set by the constructor is never used; confirm
// whether it was meant to be passed here.
void VadOnnx::init_onnx_model(const std::string& model_path) {
    init_engine_threads(1, 1);
    init_exec_provider();

    // Create the ONNX session. ORTCHAR converts the path to wide chars on
    // Windows and is a pass-through elsewhere.
    env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "VadOnnx");
    session = std::make_unique<Ort::Session>(env_, ORTCHAR(model_path.c_str()), session_options);
}
|
| 225 |
+
|
| 226 |
+
// Configure ONNX Runtime threading and graph optimization on the shared
// session_options. Must run before the Ort::Session is created.
void VadOnnx::init_engine_threads(int inter_threads, int intra_threads) {
    // Intra-op: threads used inside a single operator.
    session_options.SetIntraOpNumThreads(intra_threads);
    // Inter-op: threads used across independent operators.
    session_options.SetInterOpNumThreads(inter_threads);
    // Enable the full graph-optimization pipeline.
    session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
}
|
| 231 |
+
|
| 232 |
+
void VadOnnx::init_exec_provider() {
|
| 233 |
+
// 获取所有可用的 Execution Providers
|
| 234 |
+
std::vector<std::string> providers = Ort::GetAvailableProviders();
|
| 235 |
+
// 根据支持情况添加 Execution Provider
|
| 236 |
+
if (std::find(providers.begin(), providers.end(), "CUDAExecutionProvider") != providers.end()) {
|
| 237 |
+
OrtCUDAProviderOptions cuda_options{};
|
| 238 |
+
session_options.AppendExecutionProvider_CUDA(cuda_options);
|
| 239 |
+
}
|
| 240 |
+
// #if defined(__APPLE__)
|
| 241 |
+
// if (std::find(providers.begin(), providers.end(), "CoreMLExecutionProvider") != providers.end()) {
|
| 242 |
+
// session_options.AppendExecutionProvider_CoreML();
|
| 243 |
+
// }
|
| 244 |
+
// #endif
|
| 245 |
+
}
|
vad_onnx/vad_onnx.h
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// This header previously had no include guard: a second inclusion would
// redefine class VadOnnx and fail to compile.
#pragma once

#include <array>    // std::array members of VadOnnx
#include <cmath>    // INFINITY used as a default argument
#include <map>
#include <memory>   // std::unique_ptr<Ort::Session>
#include <string>
#include <vector>

#if defined(__APPLE__)
#include <onnxruntime/onnxruntime_cxx_api.h>
#else
#include "onnxruntime_run_options_config_keys.h"
#include "onnxruntime_cxx_api.h"
#endif

#ifdef _WIN32

// Required by the wstring helpers below; previously missing, so this
// branch did not compile on its own.
#include <codecvt>
#include <locale>

// onnxruntime expects wide-char paths on Windows; these helpers convert.
// NOTE: ORTCHAR yields a pointer into a temporary std::wstring — only use
// it directly inside a full expression (e.g. a constructor call).
#define ORTSTRING(str) StrToWstr(str)
#define ORTCHAR(str) StrToWstr(str).c_str()

// Converts a narrow string to wide using the named locale.
// (std::wstring_convert is deprecated in C++17 but still available;
// MultiByteToWideChar would be the Windows-native replacement.)
inline std::wstring String2wstring(const std::string& str, const std::string& locale)
{
    typedef std::codecvt_byname<wchar_t, char, std::mbstate_t> F;
    std::wstring_convert<F> strCnv(new F(locale));
    return strCnv.from_bytes(str);
}

// Convenience wrapper fixed to the "zh-CN" locale used by this project.
inline std::wstring StrToWstr(std::string str) {
    if (str.length() == 0)
        return L"";
    return String2wstring(str, "zh-CN");
}

#else

// Non-Windows: onnxruntime takes narrow-char paths, pass through unchanged.
#define ORTSTRING(str) str
#define ORTCHAR(str) str

#endif
|
| 37 |
+
|
| 38 |
+
// Silero-style voice-activity detector backed by an ONNX Runtime session.
// Supports whole-utterance scoring (probability per window) and streaming
// segment detection with hysteresis and a minimum-silence timeout.
class VadOnnx {

public:
    // Loads the model and prepares all inference buffers.
    //   max_speech_duration_s is accepted but unused in the visible
    //   implementation — TODO confirm.
    explicit VadOnnx(const std::string& model_path,
                     int batch_size = 1,
                     int thread_num = 1,
                     float threshold = 0.5,
                     int sampling_rate = 16000,
                     int min_silence_duration_ms = 100,
                     float max_speech_duration_s = INFINITY,
                     int speech_pad_ms = 30);
    ~VadOnnx();

    // Scores one fixed-length window (16000 Hz -> 512 samples,
    // 8000 Hz -> 256 per the original note) and returns its speech
    // probability. Updates internal RNN state/context.
    float forward_infer(std::vector<float>& data_chunk);

    // Scores a whole utterance; returns one probability per window.
    // Pads `audio` in place to a whole number of windows.
    std::vector<float> vad_dectect(std::vector<float>& audio);

    // Streaming detection; returns a map with optional "start"/"end" keys
    // (seconds if return_seconds, sample offsets otherwise).
    std::map<std::string, double> vad_dectect(std::vector<float>& audio, bool return_seconds);

    // Resets RNN state, context, and the streaming detector bookkeeping.
    void reset_states();

private:
    // --- ONNX Runtime resources ---
    Ort::Env env_;
    Ort::SessionOptions session_options;
    std::unique_ptr<Ort::Session> session = nullptr;
    Ort::AllocatorWithDefaultOptions allocator;
    Ort::MemoryInfo memory_info;
    int thread_num_;  // stored but not applied — see init_onnx_model

    // --- ONNX input/output plumbing ---
    std::vector<Ort::Value> ort_inputs, ort_outputs;
    std::vector<std::string> input_names_, output_names_;   // own the name storage
    std::vector<const char *> vad_in_names_;   // C-API views into input_names_
    std::vector<const char *> vad_out_names_;  // C-API views into output_names_

    int window_size_samples = 512;  // samples fed per inference step
    int effective_window_size;      // window + context, set in the ctor
    std::array<int64_t, 2> input_node_shape;  // {1, effective_window_size}
    std::vector<float> input;       // model input buffer (context + chunk)
    std::array<int64_t, 3> state_shape = {2, 1, 128};  // widened if batch != 1
    int state_size = 2 * 1 * 128;
    std::vector<float> state_; // RNN State
    int context_size;          // 64 at 16 kHz, 32 otherwise
    std::vector<float> context_; // Context buffer (tail of previous window)
    std::array<int64_t, 1> sr_shape = {1};
    std::vector<int64_t> sr;   // sample-rate tensor payload (one int64)

    // --- VAD/streaming state ---
    std::vector<float> buffer_;     // carry-over audio not yet consumed
    double min_silence_samples_;    // silence needed to close a segment
    double speech_pad_samples_;     // from speech_pad_ms; unused in visible code
    double temp_end_;               // candidate segment end (samples)
    double current_sample_;         // running absolute sample counter
    double start_;                  // last detected segment start (samples)
    float threshold_;               // trigger threshold (release is -0.15 lower)
    bool triggered_;                // currently inside a speech segment
    int batch_size_;
    int sample_rates_;
    int last_sr_ = 0;               // reset only; not otherwise used here
    int last_batch_size_ = 0;       // reset only; not otherwise used here

    void init_onnx_model(const std::string& model_path);
    void init_engine_threads(int inter_threads, int intra_threads);
    void init_exec_provider();
};
|