hzeng412 xianglarry committed on
Commit
d21d362
·
0 Parent(s):

Duplicate from MoYoYoTech/vad_cpp

Browse files

Co-authored-by: chenxiang <xianglarry@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
CMakeLists.txt ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Top-level build for the VAD ONNX wrappers.
# Configure with: cmake -DONNXRUNTIME_DIR=/path/to/onnxruntime ..
cmake_minimum_required(VERSION 3.16)
project(VadOnnx LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Optional: report endianness (relevant for raw audio sample handling).
include(TestBigEndian)
test_big_endian(BIG_ENDIAN)
if(BIG_ENDIAN)
  message(STATUS "Big endian system")
else()
  message(STATUS "Little endian system")
endif()

# Locate the ONNX Runtime installation; the caller must provide it.
# (${ONNXRUNTIME_DIR} is quoted so an unset/space-containing value
# does not break the EXISTS test.)
if(NOT DEFINED ONNXRUNTIME_DIR OR NOT EXISTS "${ONNXRUNTIME_DIR}")
  message(FATAL_ERROR "Please specify ONNXRUNTIME_DIR when configuring, e.g. cmake -DONNXRUNTIME_DIR=/path/to/onnxruntime ..")
endif()

# libsndfile is resolved through pkg-config; IMPORTED_TARGET exposes it
# as PkgConfig::SNDFILE for target_link_libraries in subdirectories.
find_package(PkgConfig REQUIRED)
pkg_check_modules(SNDFILE REQUIRED IMPORTED_TARGET sndfile)

# Wrapper following https://github.com/snakers4/silero-vad examples/cpp
add_subdirectory(silero_vad_onnx)

# Wrapper following moyoyo/translator python/helpers/vadprocessor.py
add_subdirectory(vad_onnx)

# Test / demo executables.
add_subdirectory(bin)
README.md ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ ## 简介
6
+ 这是一个对 silero_vad (https://github.com/snakers4/silero-vad) 的简易封装,便于开发。
7
+
8
+ ## 目录简介
9
+
10
+ ## 环境安装
11
+ ### 系统环境
12
+
13
+ > 1. 在macOS 14.5 版本已验证执行OK。
14
+ > 2. mac上安装 onnxruntime(brew 安装未使能 CoreMl, 需下载源码, 手动编译使能, 并安装)
15
+ ```bash
16
+ brew install onnxruntime
17
+ ```
18
+ > 3. 下载onnxruntime源码, 手动编译使能, 并安装
19
+ ```bash
20
+ brew install cmake protobuf python # 可选
21
+ git clone https://github.com/microsoft/onnxruntime
22
+ cd onnxruntime
23
+ ./build.sh --config Release --enable_coreml
24
+ # 或者 ./build.sh --config Release --enable_coreml --build_wheel --parallel
25
+ sudo ./install_to_system.sh # 如果不安装,库目录build/Release,头文件目录 build/Release/include/
26
+ ```
27
+ ## 目录简介
28
+
29
+ ```
30
+ .
31
+ ├── README.md
32
+ ├── bin/
33
+ │ ├──main_silero.cpp // 参照 silero_vad中cpp的example封装代码进行测试
34
+ │ ├── main.cpp // 参照 translator中FixedVADIterator封装代码进行测试
35
+ │ ├── wav.h // 定义读取 wav 文件类
36
+ │ └── ...
37
+ ├── python/
38
+ │ ├── processing.py // translator中FixedVADIterator的python脚本
39
+ │ └── ...
40
+ ├── reference/ // python、cpp参考代码
41
+ ├── silero_vad_onnx/ // 参照 silero_vad中cpp的封装
42
+ │ ├── time_stamp.cpp
43
+ │ ├── time_stamp.h
44
+ │ ├── vad_iterator.cpp
45
+ │ ├── vad_iterator.h
46
+ │ └── ...
47
+ ├── vad_onnx/ // 参照 translator中FixedVADIterator封装
48
+ │ ├── vad_onnx.cpp
49
+ │ ├── vad_onnx.h
50
+ │ └── ...
51
+ └── ...
52
+ ```
53
+
54
+ ## 编译
55
+ ```bash
56
+ git clone https://huggingface.co/MoYoYoTech/vad_cpp
57
+ cd vad_cpp
58
+ mkdir build
59
+ cd build
60
+ cmake .. -DONNXRUNTIME_DIR=/opt/homebrew/Cellar/onnxruntime/1.21.1 # 或者指定源码编译后的路径,包括 include 和 lib目录
61
+ make
62
+ ```
63
+
64
+ ## 运行&使用
65
+ ### 接口调用和使用参考 main_silero.cpp 和 main.cpp
66
+ ```bash
67
+ cd bin
68
+ # silero_vad_onnx.dylib 测试程序
69
+ ./main_silero "/Users/.../Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx" "/Users/xxx/zh.wav"
70
+ # vad_onnx.dylib 测试程序
71
+ ./main "/Users/.../Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx" "/Users/xxx/zh.wav"
72
+ ```
73
+
74
+ ### translator中FixedVADIterator的python测试脚本
75
+ ```bash
76
+ cd vad_cpp
77
+ # python/processing.py中配置 wav_path ; 在python/helpers/vadprocessor.py中配置 VAD_MODEL_PATH
78
+ python -m python.processing
79
+ # 结果显示
80
+ ....
81
+ 935936 ->>> {'start': 935456} -> {'start': 5664}
82
+ 984576 ->>> {'end': 983008} -> {'start': 5664, 'end': 983008}
83
+ strat: 5664 end: 983008
84
+ ```
bin/CMakeLists.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
set(CMAKE_CXX_STANDARD 17)

if(WIN32)
  # MSVC: treat both the source and execution character sets as UTF-8.
  add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/execution-charset:utf-8>")
  add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/source-charset:utf-8>")
endif()

# ONNX Runtime (imported target onnxruntime::onnxruntime).
find_package(onnxruntime REQUIRED)

# Demo for the vad_onnx wrapper (FixedVADIterator port).
add_executable(main "main.cpp")
# Target-scoped includes instead of directory-scoped include_directories().
target_include_directories(main PRIVATE
  "${ONNXRUNTIME_DIR}/include"
  "${PROJECT_SOURCE_DIR}/vad_onnx")
if(UNIX AND NOT APPLE)
  # Keep the ONNX Runtime shared library on the link line even if the
  # linker considers it unused.
  target_link_options(main PRIVATE "-Wl,--no-as-needed")
endif()
target_link_libraries(main PUBLIC vad_onnx onnxruntime::onnxruntime PkgConfig::SNDFILE)

# Demo for the silero_vad_onnx wrapper (silero-vad cpp example port).
add_executable(main_silero "main_silero.cpp")
target_include_directories(main_silero PRIVATE
  "${ONNXRUNTIME_DIR}/include"
  "${PROJECT_SOURCE_DIR}/silero_vad_onnx")
if(UNIX AND NOT APPLE)
  # BUG FIX: this option was previously applied to `main` a second time
  # instead of to `main_silero` (copy-paste error).
  target_link_options(main_silero PRIVATE "-Wl,--no-as-needed")
endif()
target_link_libraries(main_silero PUBLIC silero_vad_onnx onnxruntime::onnxruntime)
bin/main.cpp ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "vad_onnx.h"
2
+ #include <iostream>
3
+ #include <sndfile.h>
4
+
5
+
6
+ int main(int argc, char* argv[]) {
7
+ if (argc < 3) {
8
+ std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_file_absolute_path>" << std::endl;
9
+ return 1;
10
+ }
11
+
12
+ // 获取命令行传入的音频文件路径
13
+ std::string model_path = argv[1];
14
+ std::string wav_path = argv[2];
15
+
16
+ // std::string model_path = "/Users/chenxiang/translator/Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx";
17
+ // std::string wav_path = "/Users/chenxiang/translator/core/whisper_wrapper/bin/zh.wav";
18
+
19
+ // 加载音频文件
20
+ SF_INFO sf_info;
21
+ SNDFILE* file = sf_open(wav_path.c_str(), SFM_READ, &sf_info);
22
+
23
+ int samplerate = sf_info.samplerate;
24
+ int channels = sf_info.channels;
25
+ int frames = sf_info.frames;
26
+
27
+ std::vector<float> audio(frames * channels);
28
+ sf_readf_float(file, audio.data(), sf_info.frames);
29
+ sf_close(file);
30
+
31
+ // 创建目标 buffer 来保存 512 帧音频数据
32
+ std::vector<float> audio_512frames(audio.begin(), audio.begin() + 512);
33
+
34
+ try {
35
+ VadOnnx vad_model = VadOnnx(model_path);
36
+
37
+ // 输入一段音频数据(512 samples)
38
+ float result_512 = vad_model.forward_infer(audio_512frames);
39
+ std::cout << "result_512 = " << result_512 << std::endl;
40
+
41
+
42
+ std::vector<float> result_1 = vad_model.vad_dectect(audio);
43
+ if (!result_1.empty()) {
44
+ std::cout << "result_1.size = " << result_1.size() << std::endl;
45
+ for (int i = 0; i < 5 && i < result_1.size(); ++i) {
46
+ std::cout << result_1[i] << ", ";
47
+ }
48
+ std::cout << "(only show 5)" << std::endl;
49
+ }
50
+
51
+ std::map<std::string, double> result_map;
52
+ result_map = vad_model.vad_dectect(audio, false);
53
+ std::cerr << "result: " << std::endl;
54
+ if (!result_map.empty()) {
55
+ for (const auto& pair : result_map) {
56
+ std::cout << pair.first << " : " << pair.second << std::endl;
57
+ }
58
+ }
59
+
60
+ } catch (const std::exception& ex) {
61
+ std::cerr << "Error: " << ex.what() << std::endl;
62
+ }
63
+ // // 输出音频信息
64
+ // std::cout << "========= 音频信息 =========" << std::endl;
65
+ // std::cout << "采样率: " << samplerate << " Hz" << std::endl;
66
+ // std::cout << "通道数: " << channels << std::endl;
67
+ // std::cout << "总帧数: " << frames << std::endl;
68
+ // std::cout << "===========================" << std::endl;
69
+
70
+ return 0;
71
+ }
bin/main_silero.cpp ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include <vector>
3
+ #include <cmath>
4
+ #include <iomanip> // std::fixed, std::setprecision
5
+
6
+ // 自定义头文件
7
+ #include "wav.h" // 包含 wav::WavReader 定义
8
+ #include "time_stamp.h" // 包含 timestamp_t 定义
9
+ #include "vad_iterator.h" // 包含 VadIterator 类声明
10
+
11
+
12
+ int main(int argc, char* argv[]) {
13
+ if (argc < 3) {
14
+ std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_file_absolute_path>" << std::endl;
15
+ return 1;
16
+ }
17
+
18
+ // 获取命令行传入的音频文件路径
19
+ std::string model_path = argv[1];
20
+ std::string wav_path = argv[2];
21
+
22
+ // std::string model_path = "/Users/chenxiang/translator/Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx";
23
+ // std::string wav_path = "/Users/chenxiang/translator/core/whisper_wrapper/bin/zh.wav";
24
+
25
+ // Read the WAV file (expects 16000 Hz, mono, PCM).
26
+ wav::WavReader wav_reader(wav_path); // File located in the "audio" folder.
27
+ int numSamples = wav_reader.num_samples();
28
+ std::vector<float> input_wav(static_cast<size_t>(numSamples));
29
+ for (size_t i = 0; i < static_cast<size_t>(numSamples); i++) {
30
+ input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
31
+ }
32
+ // Initialize the VadIterator.
33
+ VadIterator vad(model_path);
34
+
35
+ // Process the audio.
36
+ vad.process(input_wav);
37
+
38
+ // Retrieve the speech timestamps (in samples).
39
+ std::vector<timestamp_t> stamps = vad.get_speech_timestamps();
40
+
41
+ // Convert timestamps to seconds and round to one decimal place (for 16000 Hz).
42
+ const float sample_rate_float = 16000.0f;
43
+ for (size_t i = 0; i < stamps.size(); i++) {
44
+ float start_sec = std::rint((stamps[i].start / sample_rate_float) * 10.0f) / 10.0f;
45
+ float end_sec = std::rint((stamps[i].end / sample_rate_float) * 10.0f) / 10.0f;
46
+ std::cout << "Speech detected from "
47
+ << std::fixed << std::setprecision(1) << start_sec
48
+ << " s to "
49
+ << std::fixed << std::setprecision(1) << end_sec
50
+ << " s"
51
+ << " [ " << stamps[i].start << " " << stamps[i].end <<" ]"
52
+ << std::endl;
53
+ }
54
+
55
+ // Optionally, reset the internal state.
56
+ vad.reset();
57
+
58
+ return 0;
59
+ }
bin/test_main.cpp ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "vad_onnx.h"
2
+ #include <iostream>
3
+ #include <sndfile.h>
4
+ #include <vector>
5
+ #include <map>
6
+ #include <fstream>
7
+ #include <string>
8
+
9
+ int main(int argc, char* argv[]) {
10
+ if (argc < 3) {
11
+ std::cerr << "Usage: " << argv[0] << " <model_absolute_path> <audio_list_absolute_path>" << std::endl;
12
+ return 1;
13
+ }
14
+
15
+ // 获取命令行传入的模型路径和音频列表文件路径
16
+ std::string model_path = argv[1];
17
+ std::string audio_list_path = argv[2];
18
+
19
+ // 打开 audio_list.txt 文件
20
+ std::ifstream audio_list_file(audio_list_path);
21
+ if (!audio_list_file.is_open()) {
22
+ std::cerr << "Error: Unable to open audio list file: " << audio_list_path << std::endl;
23
+ return 1;
24
+ }
25
+
26
+ try {
27
+ VadOnnx vad_model = VadOnnx(model_path);
28
+
29
+ // 逐行读取音频文件路径并处理
30
+ std::string wav_path;
31
+ while (std::getline(audio_list_file, wav_path)) {
32
+ if (wav_path.empty()) {
33
+ continue; // 跳过空行
34
+ }
35
+ vad_model.reset_states(); // 重置状态
36
+
37
+ std::cout << wav_path << std::endl;
38
+
39
+ // 加载音频文件
40
+ SF_INFO sf_info;
41
+ SNDFILE* file = sf_open(wav_path.c_str(), SFM_READ, &sf_info);
42
+ if (!file) {
43
+ std::cerr << "Error: Unable to open audio file: " << wav_path << std::endl;
44
+ continue; // 跳过无法打开的文件
45
+ }
46
+
47
+ int samplerate = sf_info.samplerate;
48
+ int channels = sf_info.channels;
49
+ int frames = sf_info.frames;
50
+
51
+ std::vector<float> audio_buffer(4096 * channels); // 用于存储每次读取的 4096 帧音频数据
52
+
53
+ try {
54
+ // 循环读取音频文件,每次读取 4096 帧
55
+ int read_frames = 0;
56
+ while ((read_frames = sf_readf_float(file, audio_buffer.data(), 4096)) > 0) {
57
+ // 如果实际读取的帧数小于 4096,则调整 buffer 大小
58
+ audio_buffer.resize(read_frames * channels);
59
+
60
+ // 推理
61
+ std::map<std::string, double> result_map = vad_model.vad_dectect(audio_buffer, false);
62
+
63
+ // 打印推理结果
64
+ if (!result_map.empty()) {
65
+ for (const auto& pair : result_map) {
66
+ std::cout << pair.first << ", " << pair.second << std::endl;
67
+ }
68
+ }
69
+ }
70
+
71
+ sf_close(file);
72
+
73
+ } catch (const std::exception& ex) {
74
+ std::cerr << "Error processing file " << wav_path << ": " << ex.what() << std::endl;
75
+ sf_close(file);
76
+ }
77
+ }
78
+
79
+ audio_list_file.close();
80
+
81
+ } catch (const std::exception& ex) {
82
+ std::cerr << "Error: " << ex.what() << std::endl;
83
+ return 1;
84
+ }
85
+
86
+ return 0;
87
+ }
bin/test_silero.cpp ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include <vector>
3
+ #include <cmath>
4
+ #include <iomanip> // std::fixed, std::setprecision
5
+ #include <fstream> // std::ifstream
6
+ #include <string> // std::string
7
+
8
+ // 自定义头文件
9
+ #include "wav.h" // 包含 wav::WavReader 定义
10
+ #include "time_stamp.h" // 包含 timestamp_t 定义
11
+ #include "vad_iterator.h" // 包含 VadIterator 类声明
12
+
13
+ int main(int argc, char* argv[]) {
14
+ if (argc < 3) {
15
+ std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_list_absolute_path>" << std::endl;
16
+ return 1;
17
+ }
18
+
19
+ // 获取命令行传入的模型路径和音频列表文件路径
20
+ std::string model_path = argv[1];
21
+ std::string audio_list_path = argv[2];
22
+
23
+ // 打开 audio_list.txt 文件
24
+ std::ifstream audio_list_file(audio_list_path);
25
+ if (!audio_list_file.is_open()) {
26
+ std::cerr << "Error: Unable to open audio list file: " << audio_list_path << std::endl;
27
+ return 1;
28
+ }
29
+
30
+ // 初始化 VadIterator
31
+ VadIterator vad(model_path);
32
+
33
+ // 逐行读取音频文件路径并处理
34
+ std::string wav_path;
35
+ while (std::getline(audio_list_file, wav_path)) {
36
+ if (wav_path.empty()) {
37
+ continue; // 跳过空行
38
+ }
39
+
40
+ std::cout << wav_path << std::endl;
41
+
42
+ try {
43
+ // 读取 WAV 文件 (expects 16000 Hz, mono, PCM)
44
+ wav::WavReader wav_reader(wav_path);
45
+ int numSamples = wav_reader.num_samples();
46
+ std::vector<float> input_wav(static_cast<size_t>(numSamples));
47
+ for (size_t i = 0; i < static_cast<size_t>(numSamples); i++) {
48
+ input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
49
+ }
50
+
51
+ // 处理音频
52
+ vad.process(input_wav);
53
+ // 获取语音时间戳 (以样本为单位)
54
+ std::vector<timestamp_t> stamps = vad.get_speech_timestamps();
55
+
56
+ // 将时间戳转换为秒并输出
57
+ const float sample_rate_float = 16000.0f;
58
+ for (size_t i = 0; i < stamps.size(); i++) {
59
+ float start_sec = std::rint((stamps[i].start / sample_rate_float) * 10.0f) / 10.0f;
60
+ float end_sec = std::rint((stamps[i].end / sample_rate_float) * 10.0f) / 10.0f;
61
+ // std::cout << "Speech detected from "
62
+ // << std::fixed << std::setprecision(1) << start_sec
63
+ // << " s to "
64
+ // << std::fixed << std::setprecision(1) << end_sec
65
+ // << " s"
66
+ // << " [ " << stamps[i].start << " " << stamps[i].end << " ]"
67
+ // << std::endl;
68
+ std::cout << stamps[i].start << ", " << stamps[i].end << std::endl;
69
+ }
70
+
71
+ // 重置内部状态
72
+ vad.reset();
73
+ } catch (const std::exception& e) {
74
+ std::cerr << "Error processing file " << wav_path << ": " << e.what() << std::endl;
75
+ }
76
+ }
77
+
78
+ audio_list_file.close();
79
+ return 0;
80
+ }
bin/wav.h ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
// Copyright (c) 2016 Personal (Binbin Zhang)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef FRONTEND_WAV_H_
#define FRONTEND_WAV_H_

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>

#include <iostream>

namespace wav {

// Canonical 44-byte RIFF/WAVE header layout (PCM, fmt chunk of size 16).
struct WavHeader {
  char riff[4];  // "RIFF"
  unsigned int size;
  char wav[4];  // "WAVE"
  char fmt[4];  // "fmt "
  unsigned int fmt_size;
  uint16_t format;  // 1 = integer PCM, 3 = IEEE float
  uint16_t channels;
  unsigned int sample_rate;
  unsigned int bytes_per_second;
  uint16_t block_size;
  uint16_t bit;  // bits per sample
  char data[4];  // "data"
  unsigned int data_size;
};

// Reads a WAV file into an interleaved float buffer.
// 16-bit samples are scaled by 1/32768 into roughly [-1, 1).
class WavReader {
 public:
  WavReader() : data_(nullptr) {}
  // BUG FIX: data_ was previously left uninitialized here; if Open() failed
  // before allocating, the destructor deleted a garbage pointer.
  explicit WavReader(const std::string& filename) : data_(nullptr) {
    Open(filename);
  }

  // Opens and fully decodes `filename`. Returns false on any error.
  bool Open(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "rb");  // binary read
    if (NULL == fp) {
      std::cout << "Error in read " << filename;
      return false;
    }

    WavHeader header;
    // BUG FIX: the fread result was ignored; a truncated file left `header`
    // uninitialized.
    if (fread(&header, 1, sizeof(header), fp) != sizeof(header)) {
      printf("WaveData: file too short for a WAV header.\n");
      fclose(fp);
      return false;
    }
    if (header.fmt_size < 16) {
      printf("WaveData: expect PCM format data "
             "to have fmt chunk of at least size 16.\n");
      fclose(fp);  // BUG FIX: fp was leaked on this early return
      return false;
    } else if (header.fmt_size > 16) {
      // Extended fmt chunk: seek past the extra bytes and re-read the
      // following sub-chunk id + size into header.data/data_size.
      int offset = 44 - 8 + header.fmt_size - 16;
      fseek(fp, offset, SEEK_SET);
      if (fread(header.data, 8, sizeof(char), fp) != sizeof(char)) {
        printf("WaveData: file ends before data chunk.\n");
        fclose(fp);
        return false;
      }
    }

    // Skip any sub-chunks between "fmt" and "data". Usually there will
    // be a single "fact" sub chunk, but on Windows there can also be a
    // "list" sub chunk.
    while (0 != strncmp(header.data, "data", 4)) {
      fseek(fp, header.data_size, SEEK_CUR);
      // BUG FIX: previously an ignored fread at EOF left header.data
      // unchanged and this loop never terminated.
      if (fread(header.data, 8, sizeof(char), fp) != sizeof(char)) {
        printf("WaveData: no data chunk found.\n");
        fclose(fp);
        return false;
      }
    }

    // Some writers leave data_size zero; infer it from the file length.
    if (header.data_size == 0) {
      int offset = ftell(fp);
      fseek(fp, 0, SEEK_END);
      header.data_size = ftell(fp) - offset;
      fseek(fp, offset, SEEK_SET);
    }

    num_channel_ = header.channels;
    sample_rate_ = header.sample_rate;
    bits_per_sample_ = header.bit;
    int num_data = header.data_size / (bits_per_sample_ / 8);
    delete[] data_;  // BUG FIX: previous buffer leaked when Open() was reused
    data_ = new float[num_data];  // interleaved samples, all channels
    num_samples_ = num_data / num_channel_;

    std::cout << "num_channel_ :" << num_channel_ << std::endl;
    std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
    std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
    std::cout << "num_samples :" << num_data << std::endl;
    std::cout << "num_data_size :" << header.data_size << std::endl;

    switch (bits_per_sample_) {
      case 8: {
        // BUG FIX: 8-bit WAV data is unsigned [0,255]; it was previously read
        // as signed char and scaled by 1/32768, yielding near-zero garbage.
        uint8_t sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(sample), fp);
          data_[i] = (static_cast<float>(sample) - 128.0f) / 128.0f;
        }
        break;
      }
      case 16: {
        int16_t sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(int16_t), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 32: {
        if (header.format == 1) {  // 32-bit integer PCM
          // BUG FIX: was scaled by 1/32768, producing values far outside
          // [-1,1]; use the full int32 range, consistent with the 16-bit path.
          int32_t sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(sample), fp);
            data_[i] = static_cast<float>(sample) / 2147483648.0f;
          }
        } else if (header.format == 3) {  // IEEE float, stored as-is
          float sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(float), fp);
            data_[i] = static_cast<float>(sample);
          }
        } else {
          printf("unsupported quantization bits\n");
        }
        break;
      }
      default:
        printf("unsupported quantization bits\n");
        break;
    }

    fclose(fp);
    return true;
  }

  int num_channel() const { return num_channel_; }
  int sample_rate() const { return sample_rate_; }
  int bits_per_sample() const { return bits_per_sample_; }
  int num_samples() const { return num_samples_; }

  ~WavReader() {
    delete[] data_;
  }

  const float* data() const { return data_; }

 private:
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
  int num_samples_;  // sample points per channel
  float* data_;      // interleaved decoded samples (owned)
};

// Writes an interleaved float buffer out as an integer-PCM WAV file.
// Samples are truncated to the target bit depth; no rescaling is applied.
class WavWriter {
 public:
  WavWriter(const float* data, int num_samples, int num_channel,
            int sample_rate, int bits_per_sample)
      : data_(data),
        num_samples_(num_samples),
        num_channel_(num_channel),
        sample_rate_(sample_rate),
        bits_per_sample_(bits_per_sample) {}

  void Write(const std::string& filename) {
    // BUG FIX: was opened with "w"; text mode corrupts binary data on
    // Windows via CRLF translation.
    FILE* fp = fopen(filename.c_str(), "wb");
    if (NULL == fp) {
      printf("Error in write %s\n", filename.c_str());
      return;
    }
    // Template header with 'RIFF' 'WAVE' 'fmt ' 'data' tags pre-filled;
    // the numeric fields are patched below.
    WavHeader header;
    char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
                           0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
                           0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
    memcpy(&header, wav_header, sizeof(header));
    header.channels = num_channel_;
    header.bit = bits_per_sample_;
    header.sample_rate = sample_rate_;
    header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
    header.size = sizeof(header) - 8 + header.data_size;
    header.bytes_per_second =
        sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
    header.block_size = num_channel_ * (bits_per_sample_ / 8);

    fwrite(&header, 1, sizeof(header), fp);

    for (int i = 0; i < num_samples_; ++i) {
      for (int j = 0; j < num_channel_; ++j) {
        switch (bits_per_sample_) {
          case 8: {
            char sample = static_cast<char>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 16: {
            int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 32: {
            int sample = static_cast<int>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
        }
      }
    }
    fclose(fp);
  }

 private:
  const float* data_;   // caller-owned interleaved samples (raw PCM values)
  int num_samples_;     // frames per channel
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
};

}  // namespace wav

#endif  // FRONTEND_WAV_H_
python/__inip__.py ADDED
File without changes
python/__pycache__/processing.cpython-312.pyc ADDED
Binary file (3.25 kB). View file
 
python/helpers/__init__.py ADDED
File without changes
python/helpers/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (164 Bytes). View file
 
python/helpers/__pycache__/vadprocessor.cpython-312.pyc ADDED
Binary file (27.6 kB). View file
 
python/helpers/vadprocessor.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ from time import time
3
+ # from config import VAD_MODEL_PATH
4
+ # from silero_vad import load_silero_vad
5
+ import numpy as np
6
+ import onnxruntime
7
+ import logging
8
+ from datetime import timedelta
9
+ import gc
10
+ # from pydub import AudioSegment
11
+ from collections import deque
12
+
13
+ VAD_MODEL_PATH = "/Users/xxx/Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx"
14
+
15
class AdaptiveSilenceController:
    """Adapts the VAD silence cutoff to the speaker's recent pacing.

    Tracks the durations (ms) of the last 20 speech and silence segments
    and derives a silence threshold clamped to [min_ms, max_ms]: fast
    speakers (short speech bursts) get a shorter cutoff.
    """

    def __init__(self, base_silence_ms=120, min_ms=50, max_ms=600):
        self.base = base_silence_ms
        self.min = min_ms
        self.max = max_ms
        # Rolling windows of the most recent segment durations (ms).
        self.recent_silences = deque(maxlen=20)
        self.recent_speeches = deque(maxlen=20)

    def update_silence(self, duration_ms):
        """Record the duration (ms) of a completed silence segment."""
        self.recent_silences.append(duration_ms)

    def update_speech(self, duration_ms):
        """Record the duration (ms) of a completed speech segment."""
        self.recent_speeches.append(duration_ms)

    def get_adaptive_silence_ms(self):
        """Return the current silence cutoff in ms, clamped to [min, max]."""
        # Fall back to the base duration until any samples have arrived.
        avg_speech = np.mean(self.recent_speeches) if self.recent_speeches else self.base
        avg_silence = np.mean(self.recent_silences) if self.recent_silences else self.base

        # Short average speech bursts (fast talker) shrink the cutoff.
        if avg_speech < 300:
            speed_factor = 0.5
        elif avg_speech < 600:
            speed_factor = 0.8
        else:
            speed_factor = 1.0
        logging.warning(f"Avg speech :{avg_speech}, Avg silence: {avg_silence}")

        # Blend the recent-silence trend into the scaled base, then clamp.
        adaptive = self.base * speed_factor + 0.3 * avg_silence
        return int(max(self.min, min(self.max, adaptive)))
45
+
46
+
47
class OnnxWrapper():
    """ONNX Runtime wrapper for the silero-vad model.

    Holds the recurrent state (2, batch, 128) and a trailing audio context
    between calls so consecutive fixed-size chunks are processed as a
    continuous stream. Only 16000 Hz (or integer multiples, which are
    decimated) is accepted; 512-sample chunks at 16 kHz.
    """

    def __init__(self, path, force_onnx_cpu=False):
        # Single-threaded session: one intra/inter op thread each.
        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        # Pin to CPU only when requested AND the CPU provider is available.
        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
        else:
            self.session = onnxruntime.InferenceSession(path, sess_options=opts)

        self.reset_states()
        self.sample_rates = [16000]

    def _validate_input(self, x: np.ndarray, sr: int):
        """Normalize input to shape (batch, samples) and sr to 16000.

        Raises ValueError for >2-D input, unsupported sample rates, or
        chunks shorter than 32 ms (sr / samples > 31.25).
        """
        if x.ndim == 1:
            x = x[None]
        if x.ndim > 2:
            raise ValueError(f"Too many dimensions for input audio chunk {x.ndim}")

        # Integer multiples of 16 kHz are decimated by simple striding.
        if sr != 16000 and (sr % 16000 == 0):
            step = sr // 16000
            x = x[:, ::step]
            sr = 16000

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)")
        if sr / x.shape[1] > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        """Clear the recurrent state and trailing context between streams."""
        # Model recurrent state: shape (2, batch, 128), float32.
        self._state = np.zeros((2, batch_size, 128)).astype(np.float32)
        # Empty context; lazily re-allocated on the next __call__.
        self._context = np.zeros(0)
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        """Run one chunk through the model; returns the session's first output.

        The chunk must be exactly 512 samples at 16 kHz (256 at 8 kHz).
        State is reset automatically when the sample rate or batch size
        changes between calls.
        """

        x, sr = self._validate_input(x, sr)
        num_samples = 512 if sr == 16000 else 256

        if x.shape[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")

        batch_size = x.shape[0]
        # 64 trailing samples of the previous chunk are prepended at 16 kHz.
        context_size = 64 if sr == 16000 else 32

        # Reset on first call or whenever sr / batch size changed.
        if not self._last_batch_size:
            self.reset_states(batch_size)
        if (self._last_sr) and (self._last_sr != sr):
            self.reset_states(batch_size)
        if (self._last_batch_size) and (self._last_batch_size != batch_size):
            self.reset_states(batch_size)

        # First call after a reset: start from a silent context.
        if not len(self._context):
            self._context = np.zeros((batch_size, context_size)).astype(np.float32)

        # Prepend the trailing context of the previous chunk.
        x = np.concatenate([self._context, x], axis=1)
        if sr in [8000, 16000]:
            ort_inputs = {'input': x, 'state': self._state, 'sr': np.array(sr, dtype='int64')}
            ort_outs = self.session.run(None, ort_inputs)
            out, state = ort_outs
            self._state = state  # carry the recurrent state forward
        else:
            raise ValueError()

        # Keep the last context_size samples for the next call.
        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        # out = torch.from_numpy(out)
        return out

    def audio_forward(self, audio: np.ndarray, sr: int):
        """Run the model over a whole buffer, chunk by chunk.

        Resets state first, zero-pads the tail to a multiple of the chunk
        size, and returns the per-chunk outputs concatenated on axis 1.
        """
        outs = []
        x, sr = self._validate_input(audio, sr)
        self.reset_states()
        num_samples = 512 if sr == 16000 else 256

        # Zero-pad so the buffer splits into whole chunks.
        if x.shape[1] % num_samples:
            pad_num = num_samples - (x.shape[1] % num_samples)
            x = np.pad(x, ((0, 0), (0, pad_num)), 'constant', constant_values=(0.0, 0.0))

        for i in range(0, x.shape[1], num_samples):
            wavs_batch = x[:, i:i + num_samples]
            out_chunk = self.__call__(wavs_batch, sr)
            outs.append(out_chunk)

        stacked = np.concatenate(outs, axis=1)
        return stacked
141
+
142
+
143
+ class VADIteratorOnnx:
144
+ def __init__(self,
145
+ threshold: float = 0.5,
146
+ sampling_rate: int = 16000,
147
+ min_silence_duration_ms: int = 100,
148
+ max_speech_duration_s: float = float('inf'),
149
+ speech_pad_ms: int = 30
150
+ ):
151
+ self.model = OnnxWrapper(VAD_MODEL_PATH, True)
152
+ self.threshold = threshold
153
+ self.sampling_rate = sampling_rate
154
+
155
+ if sampling_rate not in [8000, 16000]:
156
+ raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
157
+
158
+ self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
159
+ # self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
160
+ self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
161
+ self.reset_states()
162
+
163
+ def reset_states(self):
164
+
165
+ self.model.reset_states()
166
+ self.triggered = False
167
+ self.temp_end = 0
168
+ self.current_sample = 0
169
+ self.start = 0
170
+
171
+ def __call__(self, x: np.ndarray, return_seconds=False):
172
+ """
173
+ x: np.ndarray
174
+ audio chunk (see examples in repo)
175
+
176
+ return_seconds: bool (default - False)
177
+ whether return timestamps in seconds (default - samples)
178
+ """
179
+
180
+ window_size_samples = 512 if self.sampling_rate == 16000 else 256
181
+ x = x[:window_size_samples]
182
+ if len(x) < window_size_samples:
183
+ x = np.pad(x, ((0, 0), (0, window_size_samples - len(x))), 'constant', constant_values=0.0)
184
+
185
+ self.current_sample += window_size_samples
186
+
187
+ speech_prob = self.model(x, self.sampling_rate)[0,0]
188
+
189
+
190
+ if (speech_prob >= self.threshold) and self.temp_end:
191
+ self.temp_end = 0
192
+
193
+ if (speech_prob >= self.threshold) and not self.triggered:
194
+ self.triggered = True
195
+ # speech_start = max(0, self.current_sample - window_size_samples)
196
+ speech_start = max(0, self.current_sample - self.speech_pad_samples - window_size_samples)
197
+ self.start = speech_start
198
+ return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
199
+
200
+ # if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
201
+ # if self.temp_end:
202
+ # self.temp_end = 0
203
+ # self.start = self.current_sample
204
+ # return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
205
+
206
+ if (speech_prob < self.threshold - 0.15) and self.triggered:
207
+ if not self.temp_end:
208
+ self.temp_end = self.current_sample
209
+ if self.current_sample - self.temp_end < self.min_silence_samples:
210
+ return None
211
+ else:
212
+ # speech_end = self.temp_end - window_size_samples
213
+ speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
214
+ self.temp_end = 0
215
+ self.triggered = False
216
+ return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
217
+
218
+ return None
219
+
220
+
221
+ class FixedVADIterator(VADIteratorOnnx):
222
+ '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
223
+ If audio to be processed at once is long and multiple voiced segments detected,
224
+ then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
225
+ '''
226
+
227
+ def reset_states(self):
228
+ super().reset_states()
229
+ self.buffer = np.array([],dtype=np.float32)
230
+
231
+ def __call__(self, x, return_seconds=False):
232
+ self.buffer = np.append(self.buffer, x)
233
+ # print(f"len(self.buffer): {len(self.buffer)}")
234
+ ret = None
235
+ i = 0
236
+ while len(self.buffer) >= 512:
237
+ # print(f"len(self.buffer): {len(self.buffer)}")
238
+ r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
239
+ # print(f"super() : {r}")
240
+ self.buffer = self.buffer[512:]
241
+ if ret is None:
242
+ print(f"{i*512} ->>> {r} ->=== {ret}")
243
+ ret = r
244
+ elif r is not None:
245
+ if 'end' in r:
246
+ ret['end'] = r['end'] # the latter end
247
+ print(f"{i*512} ->>> {r} -> {ret}")
248
+ if 'start' in r and 'end' in ret: # there is an earlier start.
249
+ # Remove end, merging this segment with the previous one.
250
+ # print(f"{i*512} ->>>del {r} -> {ret}")
251
+ del ret['end']
252
+ print(f"{i*512} ->>> {r} -> {ret}")
253
+ # else:
254
+ # # print(f"{i*512} ->>> {r} -> {ret}")
255
+ i += 1
256
+ # print(f"FixedVADIterator output : {ret}")
257
+ return ret if ret != {} else None
258
+
259
+ class VadV2:
260
+ def __init__(self,
261
+ threshold: float = 0.5,
262
+ sampling_rate: int = 16000,
263
+ min_silence_duration_ms: int = 100,
264
+ speech_pad_ms: int = 30,
265
+ max_speech_duration_s: float = float('inf')):
266
+ # self.vad_iterator = VADIterator(threshold, sampling_rate, min_silence_duration_ms)
267
+ self.vad_iterator = VADIteratorOnnx(threshold, sampling_rate, min_silence_duration_ms, max_speech_duration_s)
268
+ self.speech_pad_samples = int(sampling_rate * speech_pad_ms / 1000)
269
+ self.sampling_rate = sampling_rate
270
+ self.audio_buffer = np.array([], dtype=np.float32)
271
+ self.start = 0
272
+ self.end = 0
273
+ self.offset = 0
274
+ assert speech_pad_ms <= min_silence_duration_ms, "speech_pad_ms should be less than min_silence_duration_ms"
275
+ self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
276
+
277
+ self.silence_chunk_size = 0
278
+ self.silence_chunk_threshold = 60 / (512 / self.sampling_rate)
279
+
280
+ def reset(self):
281
+ self.audio_buffer = np.array([], dtype=np.float32)
282
+ self.start = 0
283
+ self.end = 0
284
+ self.offset = 0
285
+ self.vad_iterator.reset_states()
286
+
287
+ def __call__(self, x: np.ndarray = None):
288
+ if x is None:
289
+ if self.start:
290
+ start = max(self.offset, self.start - self.speech_pad_samples)
291
+ end = self.offset + len(self.audio_buffer)
292
+ start_ts = round(start / self.sampling_rate, 1)
293
+ end_ts = round(end / self.sampling_rate, 1)
294
+ audio_data = self.audio_buffer[start - self.offset: end - self.offset]
295
+ result = {
296
+ "start": start_ts,
297
+ "end": end_ts,
298
+ "audio": audio_data,
299
+ }
300
+ else:
301
+ result = None
302
+ self.reset()
303
+ return result
304
+
305
+ self.audio_buffer = np.append(self.audio_buffer, deepcopy(x))
306
+
307
+ result = self.vad_iterator(x)
308
+ if result is not None:
309
+ # self.start = result.get('start', self.start)
310
+ # self.end = result.get('end', self.end)
311
+ self.silence_chunk_size = 0
312
+
313
+ if 'start' in result:
314
+ self.start = result['start']
315
+ if 'end' in result:
316
+ self.end = result['end']
317
+ else:
318
+ self.silence_chunk_size += 1
319
+
320
+ if self.start == 0 and len(self.audio_buffer) > self.speech_pad_samples:
321
+ self.offset += len(self.audio_buffer) - self.speech_pad_samples
322
+ self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]
323
+
324
+ if self.silence_chunk_size >= self.silence_chunk_threshold:
325
+ self.offset += len(self.audio_buffer) - self.speech_pad_samples
326
+ self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]
327
+ self.silence_chunk_size = 0
328
+
329
+ if self.end > self.start:
330
+ start = max(self.offset, self.start - self.speech_pad_samples)
331
+ end = self.end + self.speech_pad_samples
332
+ start_ts = round(start / self.sampling_rate, 1)
333
+ end_ts = round(end / self.sampling_rate, 1)
334
+ audio_data = self.audio_buffer[start - self.offset: end - self.offset]
335
+ self.audio_buffer = self.audio_buffer[self.end - self.offset:]
336
+ self.offset = self.end
337
+ self.start = self.end
338
+ # self.start = 0
339
+ self.end = 0
340
+ result = {
341
+ "start": start_ts,
342
+ "end": end_ts,
343
+ "audio": audio_data,
344
+ }
345
+
346
+ return result
347
+ return None
348
+
349
+
350
+ class SileroVADProcessor:
351
+ """
352
+ A class for processing audio files using Silero VAD to detect voice activity
353
+ and extract voice segments from audio files.
354
+ """
355
+
356
+ def __init__(self,
357
+ activate_threshold=0.5,
358
+ fusion_threshold=0.3,
359
+ min_speech_duration=0.25,
360
+ max_speech_duration=20,
361
+ min_silence_duration=250,
362
+ sample_rate=16000,
363
+ ort_providers=None):
364
+ """
365
+ Initialize the SileroVADProcessor.
366
+ Args:
367
+ activate_threshold (float): Threshold for voice activity detection
368
+ fusion_threshold (float): Threshold for merging close speech segments (seconds)
369
+ min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
370
+ max_speech_duration (float): Maximum duration of speech (seconds)
371
+ min_silence_duration (int): Minimum silence duration (ms)
372
+ sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
373
+ ort_providers (list): ONNX Runtime providers for acceleration
374
+ """
375
+ # VAD parameters
376
+ self.activate_threshold = activate_threshold
377
+ self.fusion_threshold = fusion_threshold
378
+ self.min_speech_duration = min_speech_duration
379
+ self.max_speech_duration = max_speech_duration
380
+ self.min_silence_duration = min_silence_duration
381
+ self.sample_rate = sample_rate
382
+ self.ort_providers = ort_providers if ort_providers else []
383
+
384
+ # Initialize logger
385
+ self.logger = logging.getLogger(__name__)
386
+
387
+ # Load Silero VAD model
388
+ self._init_onnx_session()
389
+ self.silero_vad = load_silero_vad(onnx=True)
390
+
391
+ def _init_onnx_session(self):
392
+ """Initialize ONNX Runtime session with appropriate settings."""
393
+ session_opts = onnxruntime.SessionOptions()
394
+ session_opts.log_severity_level = 3
395
+ session_opts.inter_op_num_threads = 0
396
+ session_opts.intra_op_num_threads = 0
397
+ session_opts.enable_cpu_mem_arena = True
398
+ session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
399
+ session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
400
+
401
+ session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
402
+ session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
403
+ session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
404
+
405
+ # Set the session_opts to be used by silero_vad
406
+ # onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
407
+
408
+ def load_audio(self, audio_path):
409
+ """
410
+ Load audio file and prepare it for VAD processing.
411
+ Args:
412
+ audio_path (str): Path to the audio file
413
+ Returns:
414
+ numpy.ndarray: Audio data as numpy array
415
+ """
416
+ self.logger.info(f"Loading audio from {audio_path}")
417
+ audio_segment = AudioSegment.from_file(audio_path)
418
+ audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
419
+
420
+ # Convert to numpy array and normalize
421
+ dtype = np.float16 if self.use_gpu_fp16 else np.float32
422
+ audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578 # 1/32768
423
+
424
+ self.audio_segment = audio_segment # Store for later use
425
+ return audio_array
426
+
427
+ @property
428
+ def model(self):
429
+ return self.silero_vad
430
+
431
+ def process_timestamps(self, timestamps):
432
+ """
433
+ Process VAD timestamps: filter short segments and merge close segments.
434
+ Args:
435
+ timestamps (list): List of (start, end) tuples
436
+ Returns:
437
+ list: Processed list of (start, end) tuples
438
+ """
439
+ # Filter out short durations
440
+ filtered_timestamps = [(start, end) for start, end in timestamps
441
+ if (end - start) >= self.min_speech_duration]
442
+
443
+ # Fuse timestamps in two passes for better merging
444
+ fused_timestamps_1st = []
445
+ for start, end in filtered_timestamps:
446
+ if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
447
+ fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
448
+ else:
449
+ fused_timestamps_1st.append((start, end))
450
+
451
+ fused_timestamps_2nd = []
452
+ for start, end in fused_timestamps_1st:
453
+ if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
454
+ fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
455
+ else:
456
+ fused_timestamps_2nd.append((start, end))
457
+
458
+ return fused_timestamps_2nd
459
+
460
+ def format_time(self, seconds):
461
+ """
462
+ Convert seconds to VTT time format 'hh:mm:ss.mmm'.
463
+ Args:
464
+ seconds (float): Time in seconds
465
+ Returns:
466
+ str: Formatted time string
467
+ """
468
+ td = timedelta(seconds=seconds)
469
+ td_sec = td.total_seconds()
470
+ total_seconds = int(td_sec)
471
+ milliseconds = int((td_sec - total_seconds) * 1000)
472
+ hours = total_seconds // 3600
473
+ minutes = (total_seconds % 3600) // 60
474
+ seconds = total_seconds % 60
475
+ return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
476
+
477
+ def detect_speech(self, audio:np.array):
478
+ """
479
+ Run VAD on the audio file to detect speech segments.
480
+ Args:
481
+ audio_path (str): Path to the audio file
482
+ Returns:
483
+ list: List of processed timestamps as (start, end) tuples
484
+ """
485
+ self.logger.info("Starting VAD process")
486
+ start_time = time.time()
487
+ # Get speech timestamps
488
+ raw_timestamps = get_speech_timestamps(
489
+ audio,
490
+ model=self.silero_vad,
491
+ threshold=self.activate_threshold,
492
+ max_speech_duration_s=self.max_speech_duration,
493
+ min_speech_duration_ms=int(self.min_speech_duration * 1000),
494
+ min_silence_duration_ms=self.min_silence_duration,
495
+ return_seconds=True
496
+ )
497
+
498
+ # Convert to simple format and process
499
+ timestamps = [(item['start'], item['end']) for item in raw_timestamps]
500
+ processed_timestamps = self.process_timestamps(timestamps)
501
+
502
+ # Clean up
503
+ del audio
504
+ gc.collect()
505
+
506
+ self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
507
+ return processed_timestamps
508
+
509
+ """
510
+ Save timestamps in both second and sample indices formats.
511
+ Args:
512
+ timestamps (list): List of (start, end) tuples
513
+ output_prefix (str): Prefix for output files
514
+ """
515
+ # Save timestamps in seconds (VTT format)
516
+ seconds_path = f"{output_prefix}_timestamps_second.txt"
517
+ with open(seconds_path, "w", encoding='UTF-8') as file:
518
+ self.logger.info("Saving timestamps in seconds format")
519
+ for start, end in timestamps:
520
+ s_time = self.format_time(start)
521
+ e_time = self.format_time(end)
522
+ line = f"{s_time} --> {e_time}\n"
523
+ file.write(line)
524
+
525
+ # Save timestamps in sample indices
526
+ indices_path = f"{output_prefix}_timestamps_indices.txt"
527
+ with open(indices_path, "w", encoding='UTF-8') as file:
528
+ self.logger.info("Saving timestamps in indices format")
529
+ for start, end in timestamps:
530
+ line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
531
+ file.write(line)
532
+
533
+ self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
534
+
535
+ def extract_speech_segments(self, audio_segment, timestamps):
536
+ """
537
+ Extract speech segments from the audio and combine them into a single audio file.
538
+ Args:
539
+ timestamps (list): List of (start, end) tuples indicating speech segments
540
+ Returns:
541
+ AudioSegment: The combined speech segments
542
+ """
543
+ audio_segment = audio_segment.numpy()
544
+ combined_speech = np.array([], dtype=np.float32)
545
+
546
+ # Extract and combine each speech segment
547
+ for i, (start, end) in enumerate(timestamps):
548
+ # Convert seconds to milliseconds for pydub
549
+ start_ms = int(start * 1000)
550
+ end_ms = int(end * 1000)
551
+
552
+ # Ensure the end time does not exceed the length of the audio segment
553
+ if end_ms > len(audio_segment):
554
+ end_ms = len(audio_segment)
555
+
556
+ # Extract the segment
557
+ segment = audio_segment[start_ms:end_ms]
558
+
559
+ # Add to combined audio
560
+ combined_speech = np.append(combined_speech, segment)
561
+
562
+ return combined_speech
563
+
564
+ def process_audio(self, audio_array:np.array):
565
+ """
566
+ Complete processing pipeline: detect speech, save timestamps, and optionally extract speech.
567
+ Returns:
568
+ tuple: (timestamps, output_speech_path if extract_speech else None)
569
+ """
570
+
571
+ # Run VAD to detect speech
572
+ timestamps = self.detect_speech(audio_array)
573
+
574
+ combined_speech = self.extract_speech_segments(audio_array, timestamps)
575
+
576
+ return timestamps, combined_speech
577
+
578
+
579
+
580
+ class VadProcessor:
581
+ def __init__(
582
+ self,
583
+ prob_threshold=0.5,
584
+ silence_s=0.2,
585
+ cache_s=0.15,
586
+ sr=16000
587
+ ):
588
+ self.prob_threshold = prob_threshold
589
+ self.cache_s = cache_s
590
+ self.sr = sr
591
+ self.silence_s = silence_s
592
+
593
+ self.vad = VadV2(self.prob_threshold, self.sr, self.silence_s * 1000, self.cache_s * 1000, max_speech_duration_s=15)
594
+
595
+
596
+ def process_audio(self, audio_buffer: np.ndarray):
597
+ audio = np.array([], np.float32)
598
+ for i in range(0, len(audio_buffer), 512):
599
+ chunk = audio_buffer[i:i+512]
600
+ ret = self.vad(chunk)
601
+ if ret:
602
+ audio = np.append(audio, ret['audio'])
603
+ return audio
python/pipelines/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+
2
+ from .base import MetaItem
3
+ from .pipe_vad import VadPipe
python/pipelines/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (247 Bytes). View file
 
python/pipelines/__pycache__/base.cpython-312.pyc ADDED
Binary file (4.04 kB). View file
 
python/pipelines/__pycache__/pipe_vad.cpython-312.pyc ADDED
Binary file (4.14 kB). View file
 
python/pipelines/base.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from dataclasses import dataclass, field
3
+ from multiprocessing import Process, Queue
4
+ from multiprocessing import Event
5
+ from logging import getLogger
6
+
7
+ logger = getLogger(__name__)
8
+
9
+
10
+ @dataclass
11
+ class Segment:
12
+ t0: int
13
+ t1: int
14
+ text: str
15
+
16
+ @dataclass
17
+ class MetaItem:
18
+ segments: list[Segment] = field(default_factory=list)
19
+ source_audio: bytes = b""
20
+ audio: bytes = b''
21
+ transcribe_content: str = ''
22
+ translate_content: str = ''
23
+ source_language: str = 'zh'
24
+ destination_language: str = 'en'
25
+ speech_status: str = 'END' # "END", "START"
26
+
27
+
28
+ class BasePipe(Process):
29
+ def __init__(self, in_queue=None, out_queue=None) -> None:
30
+ super().__init__() # Initialize the Process class
31
+ self._in_queue = in_queue if in_queue else Queue()
32
+ self._out_queue = out_queue if out_queue else Queue()
33
+ self._ready = Event()
34
+
35
+ def set_ready(self):
36
+ self._ready.set()
37
+
38
+ def is_ready(self):
39
+ return self._ready.is_set()
40
+
41
+ def wait(self):
42
+ self._ready.wait()
43
+
44
+ @property
45
+ def output_queue(self):
46
+ return self._out_queue
47
+
48
+ @property
49
+ def input_queue(self):
50
+ return self._in_queue
51
+
52
+ def process(self, in_data: MetaItem) -> MetaItem:
53
+ raise NotImplementedError("Subclasses should implement this method.")
54
+
55
+
56
+ @classmethod
57
+ def init(cls):
58
+ raise NotImplementedError
59
+
60
+ def run(self):
61
+ logger.info(f"start initial {self.__class__.__name__}")
62
+ self.init()
63
+ logger.info(f"finish initial {self.__class__.__name__}")
64
+ self.set_ready()
65
+ while True:
66
+ item = self.input_queue.get()
67
+ if item is None: # Check for termination signal
68
+ break
69
+ out_item = self.process(item)
70
+ if out_item:
71
+ self.output_queue.put(out_item)
python/pipelines/pipe_vad.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from .base import MetaItem, BasePipe
3
+ from ..helpers.vadprocessor import FixedVADIterator
4
+
5
+ import numpy as np
6
+ import logging
7
+
8
+ # import noisereduce as nr
9
+
10
+
11
+ class VadPipe(BasePipe):
12
+ vac = None
13
+ sample_rate = 16000
14
+
15
+ def __init__(self, in_queue=None, out_queue=None) -> None:
16
+ super().__init__(in_queue, out_queue)
17
+ self._offset = 0 # 处理的frame size offset
18
+ self._status = 'END'
19
+
20
+
21
+ def reset(self):
22
+ self._offset = 0
23
+ self._status = 'END'
24
+
25
+ self.vac.reset_states()
26
+
27
+ @classmethod
28
+ def init(cls):
29
+ if cls.vac is None:
30
+ cls.vac = FixedVADIterator(
31
+ threshold=0.6,
32
+ sampling_rate=cls.sample_rate,
33
+ # speech_pad_ms=10
34
+ min_silence_duration_ms = 100,
35
+ # speech_pad_ms = 30,
36
+ )
37
+ cls.vac.reset_states()
38
+
39
+
40
+ # def reduce_noise(self, data):
41
+ # return nr.reduce_noise(y=data, sr=self.sample_rate)
42
+
43
+ def _process_speech_chunk(self, source_audio:np.ndarray):
44
+ speech_dict = self.vac(source_audio, return_seconds=False)
45
+ # print(f"speech_dict : {speech_dict}")
46
+ if speech_dict:
47
+ relative_start_frame = None
48
+ relative_end_frame = None
49
+ start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
50
+ if start_frame:
51
+ relative_start_frame =start_frame - self._offset
52
+ if end_frame:
53
+ relative_end_frame = end_frame - self._offset
54
+ return relative_start_frame, relative_end_frame
55
+
56
+ def process(self, in_data: MetaItem) -> MetaItem:
57
+ if self._offset == 0:
58
+ self.vac.reset_states()
59
+
60
+ # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
61
+ source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
62
+ print(f"source_audio.shape = {source_audio.shape}")
63
+ speech_data = self._process_speech_chunk(source_audio)
64
+
65
+
66
+ if speech_data: # 表示有音频的变化点出现
67
+ rel_start_frame, rel_end_frame = speech_data
68
+
69
+ if rel_start_frame is not None and rel_end_frame is None:
70
+ self._status = "START" # 语音开始
71
+ target_audio = source_audio[max(rel_start_frame-100, 0):]
72
+ logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
73
+ elif rel_start_frame is None and rel_end_frame is not None:
74
+ self._status = "END" # 音频结束
75
+ target_audio = source_audio[:rel_end_frame]
76
+ logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
77
+ else:
78
+ self._status = 'END'
79
+ target_audio = source_audio[max(rel_start_frame-100, 0):rel_end_frame]
80
+ logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
81
+ # logging.debug("❌ No valid speech segment detected, setting status to END")
82
+ else:
83
+ if self._status == 'START':
84
+ target_audio = source_audio
85
+ # logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
86
+ else: # end
87
+ target_audio = np.array([],dtype=np.float32)
88
+ # self._status = 'END'
89
+ # logging.debug("❌ No speech detected, setting status to END")
90
+ print(f"strat: {rel_start_frame} end: {rel_end_frame}")
91
+ self._offset += len(source_audio)
92
+
93
+ in_data.audio = target_audio.tobytes()
94
+ in_data.source_audio = b''
95
+ in_data.speech_status = self._status
96
+ return in_data
python/processing.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ current_dir = os.path.dirname(os.path.abspath(__file__))
4
+ parent_dir = os.path.dirname(current_dir)
5
+ sys.path.append(parent_dir)
6
+ # sys.path.append("/Users/chenxiang/translator/Translator/llama-cpp-python/llama_cpp")
7
+
8
+ from .pipelines import MetaItem, VadPipe
9
+
10
+ class ProcessingPipes:
11
+ def __init__(self) -> None:
12
+
13
+ self._process = []
14
+ # vad
15
+ self._vad_pipe = self._launch_process(VadPipe())
16
+
17
+ def _launch_process(self, process_obj):
18
+ process_obj.daemon = True
19
+ process_obj.start()
20
+ self._process.append(process_obj)
21
+ return process_obj
22
+
23
+ def wait_ready(self):
24
+ for p in self._process:
25
+ p.wait()
26
+
27
+ def voice_detect(self, audio_buffer: bytes) -> MetaItem:
28
+ item = MetaItem(source_audio=audio_buffer)
29
+ self._vad_pipe.input_queue.put(item)
30
+ return self._vad_pipe.output_queue.get()
31
+
32
+
33
+ if __name__ == "__main__":
34
+ import soundfile
35
+ import numpy as np
36
+
37
+ wav_path1 = "/Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3"
38
+ wav_path2 = "/Users/chenxiang/translator/core/whisper_wrapper/bin/zh.wav"
39
+
40
+ tp = ProcessingPipes()
41
+ audio, sr, = soundfile.read(wav_path2)
42
+
43
+ # 确保是单声道
44
+ if len(audio.shape) > 1:
45
+ print("不是单声道")
46
+ audio = audio.mean(axis=1)
47
+
48
+ # 重采样到 16kHz(如果需要)
49
+ if sr != 16000:
50
+ print("采样率不是 16000, 重新采样到 16kHz(如果需要)")
51
+ import resampy
52
+ audio = resampy.resample(audio, sr, 16000)
53
+
54
+ # 转换为 float32
55
+ print(f"original audio data type = {audio.dtype}")
56
+ audio = audio.astype(np.float32)
57
+
58
+ print(f"original audio data size = {audio.shape}")
59
+
60
+ result = tp.voice_detect(audio)
61
+ # print(f"{result.speech_status} {result.segments} {result.segments}")
62
+ print("********** END *************")
reference/.DS_Store ADDED
Binary file (6.15 kB). View file
 
reference/cpp/onnx_wrapper.cpp ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <stdexcept>
2
+ #include <cmath>
3
+ #include <iostream>
4
+
5
+ #include "onnx_wrapper.h"
6
+
7
+ static void get_input_names(Ort::Session* session, std::vector<std::string> &input_names_str,
8
+ std::vector<const char *> &input_names_char) {
9
+ Ort::AllocatorWithDefaultOptions allocator;
10
+ size_t nodes_num = session->GetInputCount();
11
+ input_names_str.resize(nodes_num);
12
+ input_names_char.resize(nodes_num);
13
+ for (size_t i = 0; i != nodes_num; ++i) {
14
+ auto t = session->GetInputNameAllocated(i, allocator);
15
+ input_names_str[i] = t.get();
16
+ input_names_char[i] = input_names_str[i].c_str();
17
+ }
18
+ }
19
+
20
+ static void get_output_names(Ort::Session* session, std::vector<std::string> &output_names_,
21
+ std::vector<const char *> &vad_out_names_) {
22
+ Ort::AllocatorWithDefaultOptions allocator;
23
+ size_t nodes_num = session->GetOutputCount();
24
+ output_names_.resize(nodes_num);
25
+ vad_out_names_.resize(nodes_num);
26
+ for (size_t i = 0; i != nodes_num; ++i) {
27
+ auto t = session->GetOutputNameAllocated(i, allocator);
28
+ output_names_[i] = t.get();
29
+ vad_out_names_[i] = output_names_[i].c_str();
30
+ }
31
+ }
32
+
33
+ OnnxVadWrapper::OnnxVadWrapper(const std::string& model_path, bool force_cpu, int thread_num)
34
+ : sample_rates_{16000}, model_path_(model_path) {
35
+ Ort::SessionOptions session_options;
36
+ session_options.SetIntraOpNumThreads(thread_num);
37
+ session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
38
+ session_options.DisableCpuMemArena();
39
+
40
+ // if (force_cpu && supports_cpu()) {
41
+ // session_options.AppendExecutionProvider_CPU();
42
+ // }
43
+
44
+ // 初始化 ONNX Session
45
+ try {
46
+ env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "OnnxVadWrapper");
47
+ session_ = std::make_unique<Ort::Session>(env_, ORTCHAR(model_path.c_str()), session_options);
48
+ std::cout << "Successfully load model from " << model_path << std::endl;
49
+ } catch (std::exception const &e) {
50
+ std::cout << "Error when load vad onnx model: " << e.what() << std::endl;
51
+ exit(-1);
52
+ }
53
+
54
+ get_input_names(session_.get(), input_names_, vad_in_names_);
55
+ get_output_names(session_.get(), output_names_, vad_out_names_);
56
+
57
+ reset_states();
58
+ }
59
+
60
+ OnnxVadWrapper::~OnnxVadWrapper() = default;
61
+
62
+ void OnnxVadWrapper::reset_states(int batch_size) {
63
+ int total_size = 2 * batch_size * 128;
64
+ state_.resize(total_size); /////
65
+ state_.assign(state_.size(), 0.0f);
66
+ context_.clear();
67
+ last_sr_ = 0;
68
+ last_batch_size_ = 0;
69
+ }
70
+
71
+ std::pair<std::vector<float>, std::vector<float>> OnnxVadWrapper::operator()(const std::vector<float>& x, int sr) {
72
+ validate_input(x, sr);
73
+
74
+ int num_samples = (sr == 16000) ? 512 : 256;
75
+ int context_size = (sr == 16000) ? 64 : 32;
76
+
77
+ int batch_size = 1; // 假设单通道输入
78
+ if (x.size() != num_samples) {
79
+ throw std::invalid_argument("Input must be exactly " + std::to_string(num_samples) + " samples.");
80
+ }
81
+
82
+ if (!last_batch_size_) reset_states(batch_size);
83
+ if (last_sr_ != 0 && last_sr_ != sr) reset_states(batch_size);
84
+ if (last_batch_size_ != 0 && last_batch_size_ != batch_size) reset_states(batch_size);
85
+
86
+ if (context_.empty()) {
87
+ context_.resize(batch_size * context_size, 0.0f);
88
+ }
89
+
90
+ // 合并 context 和 input
91
+ std::vector<float> x_with_context(context_.begin(), context_.end());
92
+ x_with_context.insert(x_with_context.end(), x.begin(), x.end());
93
+
94
+ // Prepare inputs
95
+ std::vector<Ort::Value> inputs;
96
+ auto mem_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
97
+ std::array<int64_t, 3> input_shape = {1, 1, static_cast<int64_t>(x_with_context.size())};
98
+ Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
99
+ mem_info, const_cast<float*>(x_with_context.data()), x_with_context.size(),
100
+ input_shape.data(), input_shape.size());
101
+ inputs.emplace_back(std::move(input_tensor));
102
+
103
+ std::array<int64_t, 3> state_shape = {2, batch_size, 128};
104
+ Ort::Value state_tensor = Ort::Value::CreateTensor<float>(
105
+ mem_info, state_.data(), state_.size(), state_shape.data(), state_shape.size());
106
+ inputs.emplace_back(std::move(state_tensor));
107
+
108
+ std::array<int64_t, 1> sr_shape = {1};
109
+ float sr_f = static_cast<float>(sr);
110
+ Ort::Value sr_tensor = Ort::Value::CreateTensor<float>(
111
+ mem_info, &sr_f, 1, sr_shape.data(), sr_shape.size());
112
+ inputs.emplace_back(std::move(sr_tensor));
113
+
114
+ // const char* input_names[] = {"input", "state", "sr"};
115
+ // std::vector<Ort::Value> inputs = {std::move(input_tensor), std::move(state_tensor), std::move(sr_tensor)};
116
+
117
+ // Run inference
118
+ std::vector<Ort::Value> outputs;
119
+ try {
120
+ outputs = session_->Run(
121
+ Ort::RunOptions{nullptr}, vad_in_names_.data(), inputs.data(),
122
+ inputs.size(), vad_out_names_.data(), vad_out_names_.size());
123
+ } catch (std::exception const &e) {
124
+ std::cout << "Error when run vad onnx forword: " << e.what() << std::endl;
125
+ exit(-1);
126
+ }
127
+
128
+ // Get output
129
+ float* out_data = outputs[0].GetTensorMutableData<float>();
130
+ size_t out_len = outputs[0].GetTensorTypeAndShapeInfo().GetElementCount();
131
+ std::vector<float> out(out_data, out_data + out_len);
132
+
133
+ // Update state and context
134
+ float* new_state = outputs[1].GetTensorMutableData<float>();
135
+ std::copy(new_state, new_state + state_.size(), state_.begin());
136
+
137
+ context_.assign(x_with_context.end() - context_size, x_with_context.end());
138
+
139
+ last_sr_ = sr;
140
+ last_batch_size_ = batch_size;
141
+
142
+ return {out, {}};
143
+ }
144
+
145
+ std::vector<float> OnnxVadWrapper::audio_forward(const std::vector<float>& audio, int sr) {
146
+ std::vector<float> x = audio;
147
+ reset_states();
148
+
149
+ int num_samples = (sr == 16000) ? 512 : 256;
150
+ std::vector<float> result;
151
+
152
+ // Pad to multiple of num_samples
153
+ int pad_num = (num_samples - (x.size() % num_samples)) % num_samples;
154
+ x.resize(x.size() + pad_num, 0.0f);
155
+
156
+ for (size_t i = 0; i < x.size(); i += num_samples) {
157
+ std::vector<float> chunk(x.begin() + i, x.begin() + i + num_samples);
158
+ auto [out, _] = (*this)(chunk, sr);
159
+ result.insert(result.end(), out.begin(), out.end());
160
+ }
161
+
162
+ return result;
163
+ }
164
+
165
+ bool OnnxVadWrapper::supports_cpu() {
166
+ auto providers = Ort::GetAvailableProviders();
167
+
168
+ for (const std::string& provider : providers) {
169
+ if (provider == "CPUExecutionProvider") {
170
+ return true;
171
+ }
172
+ }
173
+
174
+ return false;
175
+ }
176
+
177
+ void OnnxVadWrapper::validate_input(const std::vector<float>& x, int sr) {
178
+ if (sr != 16000 && sr % 16000 != 0) {
179
+ throw std::invalid_argument("Unsupported sampling rate: " + std::to_string(sr));
180
+ }
181
+
182
+ if ((sr / x.size()) > 31.25) {
183
+ throw std::invalid_argument("Input audio chunk is too short");
184
+ }
185
+ }
reference/cpp/onnx_wrapper.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <string>
3
+
4
+ #if defined(__APPLE__)
5
+ #include <onnxruntime/onnxruntime_cxx_api.h>
6
+ #else
7
+ #include "onnxruntime_run_options_config_keys.h"
8
+ #include "onnxruntime_cxx_api.h"
9
+ #endif
10
+
11
+ #ifdef _WIN32
12
+
13
+ #define ORTSTRING(str) StrToWstr(str)
14
+ #define ORTCHAR(str) StrToWstr(str).c_str()
15
+
16
+ inline std::wstring String2wstring(const std::string& str, const std::string& locale)
17
+ {
18
+ typedef std::codecvt_byname<wchar_t, char, std::mbstate_t> F;
19
+ std::wstring_convert<F> strCnv(new F(locale));
20
+ return strCnv.from_bytes(str);
21
+ }
22
+
23
+ inline std::wstring StrToWstr(std::string str) {
24
+ if (str.length() == 0)
25
+ return L"";
26
+ return String2wstring(str, "zh-CN");
27
+
28
+ }
29
+
30
+ #else
31
+
32
+ #define ORTSTRING(str) str
33
+ #define ORTCHAR(str) str
34
+
35
+ #endif
36
+
37
+ class OnnxVadWrapper {
38
+ public:
39
+ explicit OnnxVadWrapper(const std::string& model_path, bool force_cpu = false, int thread_num = 1);
40
+ ~OnnxVadWrapper();
41
+
42
+ // 重载 operator(),使得对象可以像函数一样调用
43
+ std::pair<std::vector<float>, std::vector<float>> operator()(const std::vector<float>& x, int sr);
44
+
45
+ // 批量处理整个音频
46
+ std::vector<float> audio_forward(const std::vector<float>& audio, int sr);
47
+
48
+ // 重置 RNN 状态
49
+ void reset_states(int batch_size = 1);
50
+
51
+ private:
52
+ Ort::Env env_;
53
+
54
+ std::unique_ptr<Ort::Session> session_;
55
+ std::vector<std::string> input_names_, output_names_;
56
+ std::vector<const char *> vad_in_names_;
57
+ std::vector<const char *> vad_out_names_;
58
+
59
+ std::vector<int> sample_rates_;
60
+ std::string model_path_;
61
+
62
+ std::vector<float> state_; // RNN State
63
+ std::vector<float> context_; // Context buffer
64
+ int last_sr_ = 0;
65
+ int last_batch_size_ = 0;
66
+
67
+ void read_model();
68
+ bool supports_cpu();
69
+ void validate_input(const std::vector<float>& x, int sr);
70
+ };
reference/cpp/vad_iterator_onnx.cpp ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cmath>
2
+ #include <stdexcept>
3
+ #include <unordered_map>
4
+ #include <vector>
5
+
6
+ #include "vad_iterator_onnx.h"
7
+
8
+ VadIteratorOnnx::VadIteratorOnnx(float threshold,
9
+ int sampling_rate,
10
+ int min_silence_duration_ms,
11
+ float max_speech_duration_s,
12
+ int speech_pad_ms)
13
+ : threshold_(threshold),
14
+ sampling_rate_(sampling_rate),
15
+ min_silence_samples_(sampling_rate_ * min_silence_duration_ms / 1000.0),
16
+ speech_pad_samples_(sampling_rate_ * speech_pad_ms / 1000.0),
17
+ triggered_(false),
18
+ temp_end_(0),
19
+ current_sample_(0),
20
+ start_(0) {
21
+
22
+ if (sampling_rate_ != 8000 && sampling_rate_ != 16000) {
23
+ throw std::invalid_argument("Only support sampling rates of 8000 or 16000");
24
+ }
25
+
26
+ model_ = std::make_unique<OnnxVadWrapper>("path/to/vad.onnx", true); // 可配置路径
27
+ }
28
+
29
+ VadIteratorOnnx::~VadIteratorOnnx() = default;
30
+
31
+ void VadIteratorOnnx::reset_states() {
32
+ model_->reset_states();
33
+ triggered_ = false;
34
+ temp_end_ = 0;
35
+ current_sample_ = 0;
36
+ start_ = 0;
37
+ buffer_.clear();
38
+ }
39
+
40
+ std::unordered_map<std::string, double>
41
+ VadIteratorOnnx::operator()(const std::vector<float>& x, bool return_seconds) {
42
+ std::unordered_map<std::string, double> result;
43
+
44
+ int window_size_samples = (sampling_rate_ == 16000) ? 512 : 256;
45
+
46
+ // 将新音频追加到缓存中
47
+ buffer_.insert(buffer_.end(), x.begin(), x.end());
48
+ while (buffer_.size() > 0) {
49
+ std::unordered_map<std::string, double> tmp;
50
+ std::vector<float> chunk(buffer_.begin(), buffer_.begin() + std::min(static_cast<int>(x.size()), window_size_samples));
51
+ // 补零到固定长度
52
+ if (chunk.size() < static_cast<size_t>(window_size_samples)) {
53
+ chunk.resize(window_size_samples, 0.0f);
54
+ }
55
+
56
+ current_sample_ += window_size_samples;
57
+
58
+ // 推理得到语音概率
59
+ auto [output, _] = (*model_)(chunk, sampling_rate_);
60
+ float speech_prob = output[0];
61
+
62
+ if (speech_prob >= threshold_ && temp_end_ > 0) {
63
+ temp_end_ = 0;
64
+ }
65
+
66
+ if (speech_prob >= threshold_ && !triggered_) {
67
+ triggered_ = true;
68
+ start_ = std::max(0.0, current_sample_ - speech_pad_samples_ - window_size_samples);
69
+ tmp["start"] = return_seconds ? start_ / sampling_rate_ : start_;
70
+ }
71
+
72
+ if (speech_prob < (threshold_ - 0.15) && triggered_) {
73
+ if (temp_end_ == 0) {
74
+ temp_end_ = current_sample_;
75
+ }
76
+
77
+ if (current_sample_ - temp_end_ >= min_silence_samples_) {
78
+ double speech_end = temp_end_ + speech_pad_samples_ - window_size_samples;
79
+ tmp["end"] = return_seconds ? speech_end / sampling_rate_ : speech_end;
80
+ temp_end_ = 0;
81
+ triggered_ = false;
82
+ }
83
+ }
84
+
85
+ // 移除已处理的数据
86
+ std::vector<float>(buffer_.begin() + window_size_samples, buffer_.end()).swap(buffer_);
87
+
88
+ if (result.empty()) {
89
+ result = tmp;
90
+ } else if (!tmp.empty()) {
91
+ // 如果当前结果有 'end',更新最终 end
92
+ if (tmp.find("end") != tmp.end()) {
93
+ result["end"] = tmp["end"];
94
+ }
95
+
96
+ // 如果有新的 start,但前一个有 end,则合并成连续语音段
97
+ if (tmp.find("start") != tmp.end() && result.find("end") != result.end()) {
98
+ result.erase("end");
99
+ }
100
+ }
101
+ }
102
+
103
+ return result;
104
+ }
reference/cpp/vad_iterator_onnx.h ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <vector>
4
+ #include <string>
5
+ #include <unordered_map>
6
+
7
+ #include "onnx_wrapper.h"
8
+
9
+ class OnnxVadWrapper; // 前向声明
10
+
11
+ class VadIteratorOnnx {
12
+ public:
13
+ explicit VadIteratorOnnx(float threshold = 0.5,
14
+ int sampling_rate = 16000,
15
+ int min_silence_duration_ms = 100,
16
+ float max_speech_duration_s = INFINITY,
17
+ int speech_pad_ms = 30);
18
+
19
+ virtual ~VadIteratorOnnx();
20
+
21
+ // 重置内部状态
22
+ virtual void reset_states();
23
+
24
+ // 输入音频块,返回语音事件(start/end)
25
+ virtual std::unordered_map<std::string, double> operator()(const std::vector<float>& x, bool return_seconds = false);
26
+
27
+ private:
28
+ std::unique_ptr<OnnxVadWrapper> model_;
29
+ std::vector<float> buffer_; // 缓冲区用于保存未处理完的音频
30
+ float threshold_;
31
+ int sampling_rate_;
32
+ double min_silence_samples_;
33
+ double speech_pad_samples_;
34
+ bool triggered_;
35
+ double temp_end_;
36
+ double current_sample_;
37
+ double start_;
38
+ };
reference/python/__pycache__/audio_utils.cpython-312.pyc ADDED
Binary file (2.21 kB). View file
 
reference/python/audio_utils.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import soundfile as sf
3
+ import time
4
+
5
+ def audio_stream_generator(audio_file_path, chunk_size=4096, simulate_realtime=True):
6
+ """
7
+ 音频流生成器,从音频文件中读取数据并以流的方式输出
8
+
9
+ 参数:
10
+ audio_file_path: 音频文件路径
11
+ chunk_size: 每个数据块的大小(采样点数)
12
+ simulate_realtime: 是否模拟实时流处理的速度
13
+
14
+ 生成:
15
+ numpy.ndarray: 每次生成一个chunk_size大小的np.float32数据块
16
+ """
17
+ # 加载音频文件
18
+ audio_data, sample_rate = sf.read(audio_file_path)
19
+
20
+ # 确保音频数据是float32类型
21
+ if audio_data.dtype != np.float32:
22
+ audio_data = audio_data.astype(np.float32)
23
+
24
+ # 如果是立体声,转换为单声道
25
+ if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
26
+ audio_data = audio_data.mean(axis=1)
27
+
28
+ print(f"已加载音频文件: {audio_file_path}")
29
+ print(f"采样率: {sample_rate} Hz")
30
+ print(f"音频长度: {len(audio_data)/sample_rate:.2f} 秒")
31
+
32
+ # 计算每个块的时长(秒)
33
+ chunk_duration = chunk_size / sample_rate if simulate_realtime else 0
34
+
35
+ # 按块生成数据
36
+ audio_len = len(audio_data)
37
+ for pos in range(0, audio_len, chunk_size):
38
+ # 获取当前块
39
+ end_pos = min(pos + chunk_size, audio_len)
40
+ chunk = audio_data[pos:end_pos]
41
+
42
+ # 如果块大小不足,用0填充
43
+ if len(chunk) < chunk_size:
44
+ padded_chunk = np.zeros(chunk_size, dtype=np.float32)
45
+ padded_chunk[:len(chunk)] = chunk
46
+ chunk = padded_chunk
47
+
48
+ # 模拟实时处理的延迟
49
+ if simulate_realtime:
50
+ time.sleep(chunk_duration)
51
+
52
+ yield chunk
53
+
54
+ print("音频流处理完成")
reference/python/test_vad.ipynb ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from audio_utils import audio_stream_generator\n",
10
+ "import IPython.display as ipd\n",
11
+ "import sys\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 3,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "sys.path.append(\"/Users/chenxiang/translator/core/vad_cpp/\")\n",
21
+ "from python.helpers.vadprocessor import FixedVADIterator\n"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 4,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "vac = FixedVADIterator(\n",
31
+ " threshold=0.5,\n",
32
+ " sampling_rate=16000,\n",
33
+ " # speech_pad_ms=10\n",
34
+ " min_silence_duration_ms = 100,\n",
35
+ " # speech_pad_ms = 30,\n",
36
+ " max_speech_duration_s=5.0,\n",
37
+ " )\n"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 5,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "SAMPLE_FILE_PATH = \"/Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3\"\n",
47
+ "SAMPLING_RATE = 16000\n",
48
+ "\n",
49
+ "chunks_generator = audio_stream_generator(SAMPLE_FILE_PATH, chunk_size=4096)\n",
50
+ "vac.reset_states()"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "name": "stdout",
60
+ "output_type": "stream",
61
+ "text": [
62
+ "已加载音频文件: /Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3\n",
63
+ "采样率: 44100 Hz\n",
64
+ "音频长度: 64.00 秒\n",
65
+ "None\n"
66
+ ]
67
+ }
68
+ ],
69
+ "source": [
70
+ "# speech_dict = vac(next(chunks_generator), return_seconds=False)\n",
71
+ "# print(speech_dict)\n"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 6,
77
+ "metadata": {},
78
+ "outputs": [
79
+ {
80
+ "name": "stdout",
81
+ "output_type": "stream",
82
+ "text": [
83
+ "已加载音频文件: /Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3\n",
84
+ "采样率: 44100 Hz\n",
85
+ "音频长度: 64.00 秒\n",
86
+ "0 ->>> None\n",
87
+ "1 ->>> None\n",
88
+ "2 ->>> {'start': 10272}\n",
89
+ "3 ->>> None\n",
90
+ "4 ->>> None\n",
91
+ "5 ->>> None\n",
92
+ "6 ->>> None\n",
93
+ "7 ->>> None\n",
94
+ "8 ->>> None\n",
95
+ "9 ->>> None\n",
96
+ "10 ->>> None\n",
97
+ "11 ->>> None\n",
98
+ "12 ->>> None\n",
99
+ "13 ->>> {'end': 55264}\n",
100
+ "14 ->>> None\n",
101
+ "15 ->>> {'start': 60960}\n",
102
+ "16 ->>> None\n",
103
+ "17 ->>> None\n",
104
+ "18 ->>> None\n",
105
+ "19 ->>> None\n",
106
+ "20 ->>> {'end': 82912}\n",
107
+ "21 ->>> {'start': 89120}\n",
108
+ "22 ->>> None\n",
109
+ "23 ->>> None\n",
110
+ "24 ->>> None\n",
111
+ "25 ->>> None\n",
112
+ "26 ->>> None\n",
113
+ "27 ->>> None\n",
114
+ "28 ->>> None\n",
115
+ "29 ->>> None\n",
116
+ "30 ->>> None\n",
117
+ "31 ->>> None\n",
118
+ "32 ->>> None\n",
119
+ "33 ->>> None\n",
120
+ "34 ->>> None\n",
121
+ "35 ->>> None\n",
122
+ "36 ->>> None\n",
123
+ "37 ->>> None\n",
124
+ "38 ->>> None\n",
125
+ "39 ->>> None\n",
126
+ "40 ->>> None\n",
127
+ "41 ->>> None\n",
128
+ "42 ->>> None\n",
129
+ "43 ->>> None\n",
130
+ "44 ->>> None\n",
131
+ "45 ->>> None\n",
132
+ "46 ->>> None\n",
133
+ "47 ->>> None\n",
134
+ "48 ->>> None\n",
135
+ "49 ->>> None\n",
136
+ "50 ->>> {'end': 206816}\n",
137
+ "51 ->>> None\n",
138
+ "52 ->>> None\n",
139
+ "53 ->>> {'start': 219680}\n",
140
+ "54 ->>> None\n",
141
+ "55 ->>> None\n",
142
+ "56 ->>> None\n",
143
+ "57 ->>> None\n",
144
+ "58 ->>> None\n",
145
+ "59 ->>> None\n",
146
+ "60 ->>> None\n",
147
+ "61 ->>> None\n",
148
+ "62 ->>> None\n",
149
+ "63 ->>> None\n",
150
+ "64 ->>> None\n",
151
+ "65 ->>> None\n",
152
+ "66 ->>> None\n",
153
+ "67 ->>> None\n",
154
+ "68 ->>> None\n",
155
+ "69 ->>> None\n",
156
+ "70 ->>> None\n",
157
+ "71 ->>> None\n",
158
+ "72 ->>> None\n",
159
+ "73 ->>> None\n",
160
+ "74 ->>> None\n",
161
+ "75 ->>> None\n",
162
+ "76 ->>> None\n",
163
+ "77 ->>> None\n",
164
+ "78 ->>> None\n",
165
+ "79 ->>> None\n",
166
+ "80 ->>> None\n",
167
+ "81 ->>> None\n",
168
+ "82 ->>> None\n",
169
+ "83 ->>> None\n",
170
+ "84 ->>> None\n",
171
+ "85 ->>> None\n",
172
+ "86 ->>> None\n",
173
+ "87 ->>> None\n",
174
+ "88 ->>> None\n",
175
+ "89 ->>> None\n",
176
+ "90 ->>> None\n",
177
+ "91 ->>> None\n",
178
+ "92 ->>> None\n",
179
+ "93 ->>> None\n",
180
+ "94 ->>> None\n",
181
+ "95 ->>> None\n",
182
+ "96 ->>> {'end': 394720}\n",
183
+ "97 ->>> None\n",
184
+ "98 ->>> None\n",
185
+ "99 ->>> None\n",
186
+ "100 ->>> {'start': 410144}\n",
187
+ "101 ->>> None\n",
188
+ "102 ->>> None\n",
189
+ "103 ->>> None\n",
190
+ "104 ->>> None\n",
191
+ "105 ->>> None\n",
192
+ "106 ->>> None\n",
193
+ "107 ->>> None\n",
194
+ "108 ->>> None\n",
195
+ "109 ->>> None\n",
196
+ "110 ->>> None\n",
197
+ "111 ->>> None\n",
198
+ "112 ->>> None\n",
199
+ "113 ->>> None\n",
200
+ "114 ->>> None\n",
201
+ "115 ->>> None\n",
202
+ "116 ->>> None\n",
203
+ "117 ->>> None\n",
204
+ "118 ->>> None\n",
205
+ "119 ->>> None\n",
206
+ "120 ->>> None\n",
207
+ "121 ->>> None\n",
208
+ "122 ->>> {'end': 500192}\n",
209
+ "123 ->>> {'start': 503328}\n",
210
+ "124 ->>> {'end': 509920}\n",
211
+ "125 ->>> None\n",
212
+ "126 ->>> {'start': 519200}\n",
213
+ "127 ->>> None\n",
214
+ "128 ->>> None\n",
215
+ "129 ->>> None\n",
216
+ "130 ->>> None\n",
217
+ "131 ->>> None\n",
218
+ "132 ->>> None\n",
219
+ "133 ->>> None\n",
220
+ "134 ->>> None\n",
221
+ "135 ->>> {'end': 554976}\n",
222
+ "136 ->>> {'start': 556576}\n",
223
+ "137 ->>> None\n"
224
+ ]
225
+ },
226
+ {
227
+ "ename": "KeyboardInterrupt",
228
+ "evalue": "",
229
+ "output_type": "error",
230
+ "traceback": [
231
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
232
+ "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
233
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m i = \u001b[32m0\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunks_generator\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# vad_iterator.reset_states()\u001b[39;49;00m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# audio_buffer = np.append(audio_buffer, chunk)\u001b[39;49;00m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mspeech_dict\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mvac\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_seconds\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mprint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mi\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m ->>> \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mspeech_dict\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
234
+ "\u001b[36mFile \u001b[39m\u001b[32m/Users/chenxiang/translator/core/vad_cpp/reference/归档/audio_utils.py:50\u001b[39m, in \u001b[36maudio_stream_generator\u001b[39m\u001b[34m(audio_file_path, chunk_size, simulate_realtime)\u001b[39m\n\u001b[32m 48\u001b[39m \u001b[38;5;66;03m# 模拟实时处理的延迟\u001b[39;00m\n\u001b[32m 49\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m simulate_realtime:\n\u001b[32m---> \u001b[39m\u001b[32m50\u001b[39m \u001b[43mtime\u001b[49m\u001b[43m.\u001b[49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk_duration\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 52\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m chunk\n\u001b[32m 54\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m音频流处理完成\u001b[39m\u001b[33m\"\u001b[39m)\n",
235
+ "\u001b[31mKeyboardInterrupt\u001b[39m: "
236
+ ]
237
+ }
238
+ ],
239
+ "source": [
240
+ "i = 0\n",
241
+ "for chunk in chunks_generator:\n",
242
+ " # vad_iterator.reset_states()\n",
243
+ " # audio_buffer = np.append(audio_buffer, chunk)\n",
244
+ " \n",
245
+ " speech_dict = vac(chunk, return_seconds=False)\n",
246
+ " print(f\"{i} ->>> {speech_dict}\")\n",
247
+ " # if speech_dict:\n",
248
+ " # print(speech_dict)\n",
249
+ " i+=1"
250
+ ]
251
+ },
252
+ {
253
+ "cell_type": "code",
254
+ "execution_count": null,
255
+ "metadata": {},
256
+ "outputs": [],
257
+ "source": [
258
+ "audio_data, sample_rate = sf.read(audio_file_path)\n",
259
+ "\n",
260
+ "# 确保音频数据是float32类型\n",
261
+ "if audio_data.dtype != np.float32:\n",
262
+ " audio_data = audio_data.astype(np.float32)\n",
263
+ "\n",
264
+ "# 如果是立体声,转换为单声道\n",
265
+ "if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:\n",
266
+ " audio_data = audio_data.mean(axis=1)\n",
267
+ " \n",
268
+ "print(f\"已加载音频文件: {audio_file_path}\")\n",
269
+ "print(f\"采样率: {sample_rate} Hz\")\n",
270
+ "print(f\"音频长度: {len(audio_data)/sample_rate:.2f} 秒\")"
271
+ ]
272
+ }
273
+ ],
274
+ "metadata": {
275
+ "kernelspec": {
276
+ "display_name": "base",
277
+ "language": "python",
278
+ "name": "python3"
279
+ },
280
+ "language_info": {
281
+ "codemirror_mode": {
282
+ "name": "ipython",
283
+ "version": 3
284
+ },
285
+ "file_extension": ".py",
286
+ "mimetype": "text/x-python",
287
+ "name": "python",
288
+ "nbconvert_exporter": "python",
289
+ "pygments_lexer": "ipython3",
290
+ "version": "3.12.2"
291
+ }
292
+ },
293
+ "nbformat": 4,
294
+ "nbformat_minor": 2
295
+ }
silero_vad_onnx/CMakeLists.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cmake_minimum_required(VERSION 3.16)
2
+ project(VadOnnx)
3
+
4
+ set(CMAKE_CXX_STANDARD 17)
5
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
6
+ set(CMAKE_CXX_EXTENSIONS OFF)
7
+
8
+ # 添加 ONNX Runtime include 路径
9
+ include_directories(${ONNXRUNTIME_DIR}/include)
10
+
11
+ # 添加项目头文件目录
12
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
13
+
14
+ add_library(silero_vad_onnx SHARED ${CMAKE_CURRENT_SOURCE_DIR}/vad_iterator.cpp
15
+ ${CMAKE_CURRENT_SOURCE_DIR}/time_stamp.cpp)
16
+
17
+ # 设置库输出名称(跨平台兼容)
18
+ # set_target_properties(silero_vad_onnx PROPERTIES
19
+ # PREFIX ""
20
+ # SUFFIX ".so"
21
+ # LIBRARY_OUTPUT_NAME_DEBUG "silero_vad_onnx"
22
+ # LIBRARY_OUTPUT_NAME_RELEASE "silero_vad_onnx"
23
+ # )
24
+
25
+ # 链接 ONNX Runtime 库
26
+ if(APPLE)
27
+ # macOS 上链接 dylib
28
+ target_link_libraries(silero_vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.dylib)
29
+ elseif(UNIX)
30
+ # Linux 上链接 so
31
+ target_link_libraries(silero_vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.so)
32
+ elseif(WIN32)
33
+ # Windows 上链接 dll + lib
34
+ target_link_libraries(silero_vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/onnxruntime.lib)
35
+ set_target_properties(silero_vad_onnx PROPERTIES SUFFIX ".dll")
36
+ else()
37
+ message(WARNING "Unknown platform, no ONNX Runtime linking applied.")
38
+ endif()
39
+
silero_vad_onnx/time_stamp.cpp ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "time_stamp.h"
2
+ #include <cstdio>
3
+ #include <cstdarg>
4
+ #include <memory>
5
+ #include <string>
6
+
7
+ timestamp_t::timestamp_t(int s, int e)
8
+ : start(s), end(e) {}
9
+
10
+ timestamp_t& timestamp_t::operator=(const timestamp_t& a) {
11
+ if (this != &a) {
12
+ start = a.start;
13
+ end = a.end;
14
+ }
15
+ return *this;
16
+ }
17
+
18
+ bool timestamp_t::operator==(const timestamp_t& a) const {
19
+ return (start == a.start && end == a.end);
20
+ }
21
+
22
+ std::string timestamp_t::c_str() const {
23
+ return format("{start:%08d, end:%08d}", start, end);
24
+ }
25
+
26
+ std::string timestamp_t::format(const char* fmt, ...) const {
27
+ char buf[256];
28
+ va_list args;
29
+ va_start(args, fmt);
30
+ const auto r = std::vsnprintf(buf, sizeof(buf), fmt, args);
31
+ va_end(args);
32
+
33
+ if (r < 0)
34
+ return {};
35
+
36
+ const size_t len = r;
37
+ if (len < sizeof(buf))
38
+ return std::string(buf, len);
39
+
40
+ #if __cplusplus >= 201703L
41
+ std::string s(len + 1, '\0');
42
+ va_start(args, fmt);
43
+ std::vsnprintf(s.data(), len + 1, fmt, args);
44
+ va_end(args);
45
+ return s;
46
+ #else
47
+ std::unique_ptr<char[]> vbuf(new char[len + 1]);
48
+ va_start(args, fmt);
49
+ std::vsnprintf(vbuf.get(), len + 1, fmt, args);
50
+ va_end(args);
51
+ return std::string(vbuf.get(), len);
52
+ #endif
53
+ }
silero_vad_onnx/time_stamp.h ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef TIME_STAMP_H
2
+ #define TIME_STAMP_H
3
+
4
+ #include <string>
5
+
6
+ // timestamp_t class: stores the start and end (in samples) of a speech segment.
7
+ class timestamp_t {
8
+ public:
9
+ int start;
10
+ int end;
11
+
12
+ timestamp_t(int start = -1, int end = -1);
13
+
14
+ timestamp_t& operator=(const timestamp_t& a);
15
+
16
+ bool operator==(const timestamp_t& a) const;
17
+
18
+ // Returns a formatted string of the timestamp.
19
+ std::string c_str() const;
20
+
21
+ private:
22
+ // Helper function for formatting.
23
+ std::string format(const char* fmt, ...) const;
24
+ };
25
+
26
+ #endif // TIME_STAMP_H
silero_vad_onnx/vad_iterator.cpp ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "vad_iterator.h"
2
+ #include <cmath>
3
+ #include <cstdio>
4
+ #include <cstring>
5
+ #include <memory>
6
+
7
+
8
+ void VadIterator::init_onnx_model(const std::string& model_path) {
9
+ init_engine_threads(1, 1);
10
+ session = std::make_shared<Ort::Session>(env, model_path.c_str(), session_options);
11
+ }
12
+
13
+ void VadIterator::init_engine_threads(int inter_threads, int intra_threads) {
14
+ session_options.SetIntraOpNumThreads(intra_threads);
15
+ session_options.SetInterOpNumThreads(inter_threads);
16
+ session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
17
+ }
18
+
19
+ void VadIterator::reset_states() {
20
+ std::memset(_state.data(), 0, _state.size() * sizeof(float));
21
+ triggered = false;
22
+ temp_end = 0;
23
+ current_sample = 0;
24
+ prev_end = next_start = 0;
25
+ speeches.clear();
26
+ current_speech = timestamp_t();
27
+ std::fill(_context.begin(), _context.end(), 0.0f);
28
+ }
29
+
30
+ void VadIterator::predict(const std::vector<float>& data_chunk) {
31
+ std::vector<float> new_data(effective_window_size, 0.0f);
32
+ std::copy(_context.begin(), _context.end(), new_data.begin());
33
+ std::copy(data_chunk.begin(), data_chunk.end(), new_data.begin() + context_samples);
34
+ input = new_data;
35
+
36
+ Ort::Value input_ort = Ort::Value::CreateTensor<float>(
37
+ memory_info, input.data(), input.size(), input_node_dims, 2);
38
+ Ort::Value state_ort = Ort::Value::CreateTensor<float>(
39
+ memory_info, _state.data(), _state.size(), state_node_dims, 3);
40
+ Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
41
+ memory_info, sr.data(), sr.size(), sr_node_dims, 1);
42
+
43
+ ort_inputs.clear();
44
+ ort_inputs.emplace_back(std::move(input_ort));
45
+ ort_inputs.emplace_back(std::move(state_ort));
46
+ ort_inputs.emplace_back(std::move(sr_ort));
47
+
48
+ ort_outputs = session->Run(
49
+ Ort::RunOptions{nullptr},
50
+ input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
51
+ output_node_names.data(), output_node_names.size());
52
+
53
+ float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
54
+ float* stateN = ort_outputs[1].GetTensorMutableData<float>();
55
+ std::memcpy(_state.data(), stateN, size_state * sizeof(float));
56
+
57
+ current_sample += static_cast<unsigned int>(window_size_samples);
58
+
59
+ if (speech_prob >= threshold) {
60
+ if (temp_end != 0) {
61
+ temp_end = 0;
62
+ if (next_start < prev_end)
63
+ next_start = current_sample - window_size_samples;
64
+ }
65
+ if (!triggered) {
66
+ triggered = true;
67
+ current_speech.start = current_sample - window_size_samples;
68
+ }
69
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
70
+ return;
71
+ }
72
+
73
+ if (triggered && ((current_sample - current_speech.start) > max_speech_samples)) {
74
+ if (prev_end > 0) {
75
+ current_speech.end = prev_end;
76
+ speeches.push_back(current_speech);
77
+ current_speech = timestamp_t();
78
+ if (next_start < prev_end)
79
+ triggered = false;
80
+ else
81
+ current_speech.start = next_start;
82
+ prev_end = 0;
83
+ next_start = 0;
84
+ temp_end = 0;
85
+ } else {
86
+ current_speech.end = current_sample;
87
+ speeches.push_back(current_speech);
88
+ current_speech = timestamp_t();
89
+ prev_end = 0;
90
+ next_start = 0;
91
+ temp_end = 0;
92
+ triggered = false;
93
+ }
94
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
95
+ return;
96
+ }
97
+
98
+ if ((speech_prob >= (threshold - 0.15)) && (speech_prob < threshold)) {
99
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
100
+ return;
101
+ }
102
+
103
+ if (speech_prob < (threshold - 0.15)) {
104
+ if (triggered) {
105
+ if (temp_end == 0)
106
+ temp_end = current_sample;
107
+ if (current_sample - temp_end > min_silence_samples_at_max_speech)
108
+ prev_end = temp_end;
109
+ if ((current_sample - temp_end) >= min_silence_samples) {
110
+ current_speech.end = temp_end;
111
+ if (current_speech.end - current_speech.start > min_speech_samples) {
112
+ speeches.push_back(current_speech);
113
+ current_speech = timestamp_t();
114
+ prev_end = 0;
115
+ next_start = 0;
116
+ temp_end = 0;
117
+ triggered = false;
118
+ }
119
+ }
120
+ }
121
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
122
+ return;
123
+ }
124
+ }
125
+
126
+ void VadIterator::process(const std::vector<float>& input_wav) {
127
+ reset_states();
128
+ audio_length_samples = static_cast<int>(input_wav.size());
129
+
130
+ for (size_t j = 0; j < static_cast<size_t>(audio_length_samples); j += static_cast<size_t>(window_size_samples)) {
131
+ if (j + static_cast<size_t>(window_size_samples) > static_cast<size_t>(audio_length_samples))
132
+ break;
133
+ std::vector<float> chunk(&input_wav[j], &input_wav[j] + window_size_samples);
134
+ predict(chunk);
135
+ }
136
+
137
+ if (current_speech.start >= 0) {
138
+ current_speech.end = audio_length_samples;
139
+ speeches.push_back(current_speech);
140
+ current_speech = timestamp_t();
141
+ prev_end = 0;
142
+ next_start = 0;
143
+ temp_end = 0;
144
+ triggered = false;
145
+ }
146
+ }
147
+
148
+ const std::vector<timestamp_t>& VadIterator::get_speech_timestamps() const {
149
+ return speeches;
150
+ }
151
+
152
+ void VadIterator::reset() {
153
+ reset_states();
154
+ }
155
+
156
+ // 构造函数实现
157
+ VadIterator::VadIterator(const std::string ModelPath,
158
+ int Sample_rate,
159
+ int windows_frame_size,
160
+ float Threshold,
161
+ int min_silence_duration_ms,
162
+ int speech_pad_ms,
163
+ int min_speech_duration_ms,
164
+ float max_speech_duration_s)
165
+ : sample_rate(Sample_rate),
166
+ threshold(Threshold),
167
+ speech_pad_samples(speech_pad_ms),
168
+ prev_end(0),
169
+ memory_info(Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtDeviceAllocator, OrtMemType::OrtMemTypeDefault))
170
+ {
171
+
172
+ sr_per_ms = sample_rate / 1000;
173
+ window_size_samples = windows_frame_size * sr_per_ms;
174
+ effective_window_size = window_size_samples + context_samples;
175
+
176
+ input_node_dims[0] = 1;
177
+ input_node_dims[1] = effective_window_size;
178
+
179
+ _state.resize(size_state);
180
+ sr.resize(1);
181
+ sr[0] = sample_rate;
182
+ _context.assign(context_samples, 0.0f);
183
+
184
+ min_speech_samples = sr_per_ms * min_speech_duration_ms;
185
+
186
+ if (max_speech_duration_s < 0) {
187
+ max_speech_samples = std::numeric_limits<float>::infinity();
188
+ } else {
189
+ max_speech_samples = (sample_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples);
190
+ }
191
+
192
+ min_silence_samples = sr_per_ms * min_silence_duration_ms;
193
+ min_silence_samples_at_max_speech = sr_per_ms * 98;
194
+
195
+ init_onnx_model(ModelPath);
196
+ }
silero_vad_onnx/vad_iterator.h ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef VAD_ITERATOR_H
2
+ #define VAD_ITERATOR_H
3
+
4
+ #include "time_stamp.h"
5
+ #include <vector>
6
+ #include <string>
7
+ #if defined(__APPLE__)
8
+ #include <onnxruntime/onnxruntime_cxx_api.h>
9
+ #else
10
+ #include "onnxruntime_run_options_config_keys.h"
11
+ #include "onnxruntime_cxx_api.h"
12
+ #endif
13
+ // 前向声明 timestamp_t
14
+ class timestamp_t;
15
+
16
+ class VadIterator {
17
+ public:
18
+ // 构造函数
19
+ VadIterator(const std::string ModelPath,
20
+ int Sample_rate = 16000,
21
+ int windows_frame_size = 32,
22
+ float Threshold = 0.5,
23
+ int min_silence_duration_ms = 100,
24
+ int speech_pad_ms = 30,
25
+ int min_speech_duration_ms = 250,
26
+ float max_speech_duration_s = -1); // -1 表示无穷大
27
+
28
+ // 公共方法
29
+ void process(const std::vector<float>& input_wav);
30
+ const std::vector<timestamp_t>& get_speech_timestamps() const;
31
+ void reset();
32
+
33
+ private:
34
+ // ONNX Runtime 资源
35
+ Ort::Env env;
36
+ Ort::SessionOptions session_options;
37
+ std::shared_ptr<Ort::Session> session = nullptr;
38
+ Ort::AllocatorWithDefaultOptions allocator;
39
+ Ort::MemoryInfo memory_info;
40
+
41
+ // Context 相关变量
42
+ const int context_samples = 64;
43
+ std::vector<float> _context;
44
+ int window_size_samples;
45
+ int effective_window_size;
46
+ int sr_per_ms;
47
+
48
+ // ONNX 输入输出相关
49
+ std::vector<Ort::Value> ort_inputs;
50
+ std::vector<const char*> input_node_names = {"input", "state", "sr"};
51
+ std::vector<float> input;
52
+ unsigned int size_state = 2 * 1 * 128;
53
+ std::vector<float> _state;
54
+ std::vector<int64_t> sr;
55
+ int64_t input_node_dims[2];
56
+ const int64_t state_node_dims[3] = {2, 1, 128};
57
+ const int64_t sr_node_dims[1] = {1};
58
+ std::vector<Ort::Value> ort_outputs;
59
+ std::vector<const char*> output_node_names = {"output", "stateN"};
60
+
61
+ // 模型参数
62
+ int sample_rate;
63
+ float threshold;
64
+ int min_silence_samples;
65
+ int min_silence_samples_at_max_speech;
66
+ int min_speech_samples;
67
+ float max_speech_samples;
68
+ int speech_pad_samples;
69
+ int audio_length_samples;
70
+
71
+ // 状态管理
72
+ bool triggered = false;
73
+ unsigned int temp_end = 0;
74
+ unsigned int current_sample = 0;
75
+ int prev_end;
76
+ int next_start = 0;
77
+ std::vector<timestamp_t> speeches;
78
+ timestamp_t current_speech;
79
+
80
+ // 私有方法
81
+ void init_onnx_model(const std::string& model_path);
82
+ void init_engine_threads(int inter_threads, int intra_threads);
83
+ void reset_states();
84
+ void predict(const std::vector<float>& data_chunk);
85
+ };
86
+
87
+ #endif // VAD_ITERATOR_H
silero_vad_onnx/wav.h ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) 2016 Personal (Binbin Zhang)
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #ifndef FRONTEND_WAV_H_
16
+ #define FRONTEND_WAV_H_
17
+
18
+
19
+ #include <assert.h>
20
+ #include <stdint.h>
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include <string>
26
+
27
+ #include <iostream>
28
+
29
+ // #include "utils/log.h"
30
+
31
+ namespace wav {
32
+
33
+ struct WavHeader {
34
+ char riff[4]; // "riff"
35
+ unsigned int size;
36
+ char wav[4]; // "WAVE"
37
+ char fmt[4]; // "fmt "
38
+ unsigned int fmt_size;
39
+ uint16_t format;
40
+ uint16_t channels;
41
+ unsigned int sample_rate;
42
+ unsigned int bytes_per_second;
43
+ uint16_t block_size;
44
+ uint16_t bit;
45
+ char data[4]; // "data"
46
+ unsigned int data_size;
47
+ };
48
+
49
+ class WavReader {
50
+ public:
51
+ WavReader() : data_(nullptr) {}
52
+ explicit WavReader(const std::string& filename) { Open(filename); }
53
+
54
+ bool Open(const std::string& filename) {
55
+ FILE* fp = fopen(filename.c_str(), "rb"); //文件读取
56
+ if (NULL == fp) {
57
+ std::cout << "Error in read " << filename;
58
+ return false;
59
+ }
60
+
61
+ WavHeader header;
62
+ fread(&header, 1, sizeof(header), fp);
63
+ if (header.fmt_size < 16) {
64
+ printf("WaveData: expect PCM format data "
65
+ "to have fmt chunk of at least size 16.\n");
66
+ return false;
67
+ } else if (header.fmt_size > 16) {
68
+ int offset = 44 - 8 + header.fmt_size - 16;
69
+ fseek(fp, offset, SEEK_SET);
70
+ fread(header.data, 8, sizeof(char), fp);
71
+ }
72
+ // check "riff" "WAVE" "fmt " "data"
73
+
74
+ // Skip any sub-chunks between "fmt" and "data". Usually there will
75
+ // be a single "fact" sub chunk, but on Windows there can also be a
76
+ // "list" sub chunk.
77
+ while (0 != strncmp(header.data, "data", 4)) {
78
+ // We will just ignore the data in these chunks.
79
+ fseek(fp, header.data_size, SEEK_CUR);
80
+ // read next sub chunk
81
+ fread(header.data, 8, sizeof(char), fp);
82
+ }
83
+
84
+ if (header.data_size == 0) {
85
+ int offset = ftell(fp);
86
+ fseek(fp, 0, SEEK_END);
87
+ header.data_size = ftell(fp) - offset;
88
+ fseek(fp, offset, SEEK_SET);
89
+ }
90
+
91
+ num_channel_ = header.channels;
92
+ sample_rate_ = header.sample_rate;
93
+ bits_per_sample_ = header.bit;
94
+ int num_data = header.data_size / (bits_per_sample_ / 8);
95
+ data_ = new float[num_data]; // Create 1-dim array
96
+ num_samples_ = num_data / num_channel_;
97
+
98
+ std::cout << "num_channel_ :" << num_channel_ << std::endl;
99
+ std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
100
+ std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
101
+ std::cout << "num_samples :" << num_data << std::endl;
102
+ std::cout << "num_data_size :" << header.data_size << std::endl;
103
+
104
+ switch (bits_per_sample_) {
105
+ case 8: {
106
+ char sample;
107
+ for (int i = 0; i < num_data; ++i) {
108
+ fread(&sample, 1, sizeof(char), fp);
109
+ data_[i] = static_cast<float>(sample) / 32768;
110
+ }
111
+ break;
112
+ }
113
+ case 16: {
114
+ int16_t sample;
115
+ for (int i = 0; i < num_data; ++i) {
116
+ fread(&sample, 1, sizeof(int16_t), fp);
117
+ data_[i] = static_cast<float>(sample) / 32768;
118
+ }
119
+ break;
120
+ }
121
+ case 32:
122
+ {
123
+ if (header.format == 1) //S32
124
+ {
125
+ int sample;
126
+ for (int i = 0; i < num_data; ++i) {
127
+ fread(&sample, 1, sizeof(int), fp);
128
+ data_[i] = static_cast<float>(sample) / 32768;
129
+ }
130
+ }
131
+ else if (header.format == 3) // IEEE-float
132
+ {
133
+ float sample;
134
+ for (int i = 0; i < num_data; ++i) {
135
+ fread(&sample, 1, sizeof(float), fp);
136
+ data_[i] = static_cast<float>(sample);
137
+ }
138
+ }
139
+ else {
140
+ printf("unsupported quantization bits\n");
141
+ }
142
+ break;
143
+ }
144
+ default:
145
+ printf("unsupported quantization bits\n");
146
+ break;
147
+ }
148
+
149
+ fclose(fp);
150
+ return true;
151
+ }
152
+
153
+ int num_channel() const { return num_channel_; }
154
+ int sample_rate() const { return sample_rate_; }
155
+ int bits_per_sample() const { return bits_per_sample_; }
156
+ int num_samples() const { return num_samples_; }
157
+
158
+ ~WavReader() {
159
+ delete[] data_;
160
+ }
161
+
162
+ const float* data() const { return data_; }
163
+
164
+ private:
165
+ int num_channel_;
166
+ int sample_rate_;
167
+ int bits_per_sample_;
168
+ int num_samples_; // sample points per channel
169
+ float* data_;
170
+ };
171
+
172
+ class WavWriter {
173
+ public:
174
+ WavWriter(const float* data, int num_samples, int num_channel,
175
+ int sample_rate, int bits_per_sample)
176
+ : data_(data),
177
+ num_samples_(num_samples),
178
+ num_channel_(num_channel),
179
+ sample_rate_(sample_rate),
180
+ bits_per_sample_(bits_per_sample) {}
181
+
182
+ void Write(const std::string& filename) {
183
+ FILE* fp = fopen(filename.c_str(), "w");
184
+ // init char 'riff' 'WAVE' 'fmt ' 'data'
185
+ WavHeader header;
186
+ char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
187
+ 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
188
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
189
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
190
+ 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
191
+ memcpy(&header, wav_header, sizeof(header));
192
+ header.channels = num_channel_;
193
+ header.bit = bits_per_sample_;
194
+ header.sample_rate = sample_rate_;
195
+ header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
196
+ header.size = sizeof(header) - 8 + header.data_size;
197
+ header.bytes_per_second =
198
+ sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
199
+ header.block_size = num_channel_ * (bits_per_sample_ / 8);
200
+
201
+ fwrite(&header, 1, sizeof(header), fp);
202
+
203
+ for (int i = 0; i < num_samples_; ++i) {
204
+ for (int j = 0; j < num_channel_; ++j) {
205
+ switch (bits_per_sample_) {
206
+ case 8: {
207
+ char sample = static_cast<char>(data_[i * num_channel_ + j]);
208
+ fwrite(&sample, 1, sizeof(sample), fp);
209
+ break;
210
+ }
211
+ case 16: {
212
+ int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
213
+ fwrite(&sample, 1, sizeof(sample), fp);
214
+ break;
215
+ }
216
+ case 32: {
217
+ int sample = static_cast<int>(data_[i * num_channel_ + j]);
218
+ fwrite(&sample, 1, sizeof(sample), fp);
219
+ break;
220
+ }
221
+ }
222
+ }
223
+ }
224
+ fclose(fp);
225
+ }
226
+
227
+ private:
228
+ const float* data_;
229
+ int num_samples_; // total float points in data_
230
+ int num_channel_;
231
+ int sample_rate_;
232
+ int bits_per_sample_;
233
+ };
234
+
235
+ } // namespace wav
236
+
237
+ #endif // FRONTEND_WAV_H_
vad_onnx/CMakeLists.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cmake_minimum_required(VERSION 3.16)
2
+ project(VadOnnx)
3
+
4
+ set(CMAKE_CXX_STANDARD 17)
5
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
6
+ set(CMAKE_CXX_EXTENSIONS OFF)
7
+
8
+ # 添加 ONNX Runtime include 路径
9
+ include_directories(${ONNXRUNTIME_DIR}/include)
10
+
11
+ # 添加项目头文件目录
12
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
13
+
14
+ add_library(vad_onnx SHARED ${CMAKE_CURRENT_SOURCE_DIR}/vad_onnx.cpp)
15
+
16
+ # 设置库输出名称(跨平台兼容)
17
+ # set_target_properties(vad_onnx PROPERTIES
18
+ # PREFIX ""
19
+ # SUFFIX ".so"
20
+ # LIBRARY_OUTPUT_NAME_DEBUG "vad_onnx"
21
+ # LIBRARY_OUTPUT_NAME_RELEASE "vad_onnx"
22
+ # )
23
+
24
+ # 链接 ONNX Runtime 库
25
+ if(APPLE)
26
+ # macOS 上链接 dylib
27
+ target_link_libraries(vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.dylib)
28
+ elseif(UNIX)
29
+ # Linux 上链接 so
30
+ target_link_libraries(vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.so)
31
+ elseif(WIN32)
32
+ # Windows 上链接 dll + lib
33
+ target_link_libraries(vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/onnxruntime.lib)
34
+ set_target_properties(vad_onnx PROPERTIES SUFFIX ".dll")
35
+ else()
36
+ message(WARNING "Unknown platform, no ONNX Runtime linking applied.")
37
+ endif()
38
+
39
+
40
+
vad_onnx/vad_onnx.cpp ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <stdexcept>
2
+ #include <cmath>
3
+ #include <iostream>
4
+
5
+ #include "vad_onnx.h"
6
+
7
+
8
+ static void get_input_names(Ort::Session* session, std::vector<std::string> &input_names_str,
9
+ std::vector<const char *> &input_names_char) {
10
+ Ort::AllocatorWithDefaultOptions allocator;
11
+ size_t nodes_num = session->GetInputCount();
12
+ input_names_str.resize(nodes_num);
13
+ input_names_char.resize(nodes_num);
14
+
15
+ for (size_t i = 0; i != nodes_num; ++i) {
16
+ auto t = session->GetInputNameAllocated(i, allocator);
17
+ input_names_str[i] = t.get();
18
+ input_names_char[i] = input_names_str[i].c_str();
19
+ }
20
+ }
21
+
22
+ static void get_output_names(Ort::Session* session, std::vector<std::string> &output_names_,
23
+ std::vector<const char *> &vad_out_names_) {
24
+ Ort::AllocatorWithDefaultOptions allocator;
25
+ size_t nodes_num = session->GetOutputCount();
26
+ output_names_.resize(nodes_num);
27
+ vad_out_names_.resize(nodes_num);
28
+ for (size_t i = 0; i != nodes_num; ++i) {
29
+ auto t = session->GetOutputNameAllocated(i, allocator);
30
+ output_names_[i] = t.get();
31
+ vad_out_names_[i] = output_names_[i].c_str();
32
+ }
33
+ }
34
+
35
+ VadOnnx::VadOnnx(const std::string& model_path,
36
+ int batch_size,
37
+ int thread_num,
38
+ float threshold,
39
+ int sampling_rate,
40
+ int min_silence_duration_ms,
41
+ float max_speech_duration_s,
42
+ int speech_pad_ms)
43
+ : batch_size_(batch_size),
44
+ thread_num_(thread_num),
45
+ threshold_(threshold),
46
+ sample_rates_(sampling_rate),
47
+ min_silence_samples_(sampling_rate * min_silence_duration_ms / 1000.0),
48
+ speech_pad_samples_(sampling_rate * speech_pad_ms / 1000.0),
49
+ triggered_(false),
50
+ temp_end_(0),
51
+ current_sample_(0),
52
+ start_(0),
53
+ memory_info(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU))
54
+ {
55
+
56
+ init_onnx_model(model_path);
57
+
58
+ get_input_names(session.get(), input_names_, vad_in_names_);
59
+ get_output_names(session.get(), output_names_, vad_out_names_);
60
+
61
+ sr.resize(1);
62
+ sr[0] = sample_rates_;
63
+
64
+ if (batch_size_ != 1) {
65
+ state_shape = {2, batch_size_, 128};
66
+ state_size = 2 * batch_size_ * 128;
67
+ }
68
+ state_.resize(state_size);
69
+
70
+ context_size = (sample_rates_ == 16000) ? 64 : 32;
71
+ context_.resize(context_size);
72
+
73
+ effective_window_size = window_size_samples + context_size;
74
+ input_node_shape[0] = 1;
75
+ input_node_shape[1] = effective_window_size;
76
+
77
+ reset_states();
78
+ }
79
+
80
+ VadOnnx::~VadOnnx() = default;
81
+
82
+ void VadOnnx::reset_states() {
83
+ std::memset(state_.data(), 0, state_.size() * sizeof(float));
84
+ std::fill(context_.begin(), context_.end(), 0.0f);
85
+ triggered_ = false;
86
+ temp_end_ = 0;
87
+ current_sample_ = 0;
88
+ start_ = 0;
89
+ last_sr_ = 0;
90
+ last_batch_size_ = 0;
91
+ }
92
+
93
+ float VadOnnx::forward_infer(std::vector<float>& data_chunk) {
94
+ // 合并 context 和 input
95
+ std::vector<float> x_with_context(effective_window_size, 0.0f);
96
+ std::copy(context_.begin(), context_.end(), x_with_context.begin());
97
+ std::copy(data_chunk.begin(), data_chunk.end(), x_with_context.begin() + context_size);
98
+ input = x_with_context;
99
+
100
+ // Prepare inputs
101
+ Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
102
+ memory_info, input.data(), input.size(), input_node_shape.data(), 2);
103
+ Ort::Value state_tensor = Ort::Value::CreateTensor<float>(
104
+ memory_info, state_.data(), state_.size(), state_shape.data(), 3);
105
+ Ort::Value sr_tensor = Ort::Value::CreateTensor<int64_t>(
106
+ memory_info, sr.data(), 1, sr_shape.data(), 1);
107
+
108
+ ort_inputs.clear();
109
+ ort_inputs.emplace_back(std::move(input_tensor));
110
+ ort_inputs.emplace_back(std::move(state_tensor));
111
+ ort_inputs.emplace_back(std::move(sr_tensor));
112
+
113
+ // Run inference
114
+ ort_outputs = session->Run(
115
+ Ort::RunOptions{nullptr}, vad_in_names_.data(), ort_inputs.data(),
116
+ ort_inputs.size(), vad_out_names_.data(), vad_out_names_.size());
117
+
118
+
119
+ // Get output
120
+ float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
121
+
122
+ // Update state
123
+ float* stateN = ort_outputs[1].GetTensorMutableData<float>();
124
+ std::memcpy(state_.data(), stateN, state_size * sizeof(float));
125
+
126
+ // Update context
127
+ std::copy(x_with_context.end() - context_size, x_with_context.end(), context_.begin());
128
+
129
+ return speech_prob;
130
+ }
131
+
132
+ std::vector<float> VadOnnx::vad_dectect(std::vector<float>& audio) {
133
+ std::vector<float> result;
134
+
135
+ // Pad to multiple of num_samples
136
+ int pad_num = (window_size_samples - (audio.size() % window_size_samples)) % window_size_samples;
137
+ audio.insert(audio.end(), pad_num, 0.0f);
138
+
139
+ for (size_t i = 0; i < audio.size(); i += window_size_samples) {
140
+ std::vector<float> chunk(audio.begin() + i, audio.begin() + i + window_size_samples);
141
+ auto prob = forward_infer(chunk);
142
+ result.emplace_back(prob);
143
+ }
144
+
145
+ return result;
146
+ }
147
+
148
+ std::map<std::string, double> VadOnnx::vad_dectect(std::vector<float>& audio, bool return_seconds) {
149
+ std::map<std::string, double> result;
150
+
151
+ // 将新音频追加到缓存中
152
+ buffer_.insert(buffer_.end(), audio.begin(), audio.end());
153
+
154
+ while (buffer_.size() > 0) {
155
+ std::map<std::string, double> tmp;
156
+ std::vector<float> chunk(buffer_.begin(), buffer_.begin() + std::min(static_cast<int>(buffer_.size()), window_size_samples));
157
+ // 补零到固定长度
158
+ if (chunk.size() < static_cast<size_t>(window_size_samples)) {
159
+ chunk.resize(window_size_samples, 0.0f);
160
+ }
161
+
162
+ current_sample_ += window_size_samples;
163
+
164
+ // 推理得到语音概率
165
+ float speech_prob = forward_infer(chunk);
166
+
167
+ if (speech_prob >= threshold_ && temp_end_ > 0) {
168
+ temp_end_ = 0;
169
+ }
170
+
171
+ if (speech_prob >= threshold_ && !triggered_) {
172
+ triggered_ = true;
173
+ start_ = std::max(0.0, current_sample_ - window_size_samples);
174
+ tmp["start"] = return_seconds ? start_ / sample_rates_ : start_;
175
+ }
176
+
177
+ if (speech_prob < (threshold_ - 0.15) && triggered_) {
178
+ if (temp_end_ == 0) {
179
+ temp_end_ = current_sample_;
180
+ }
181
+
182
+ if (current_sample_ - temp_end_ >= min_silence_samples_) {
183
+ double speech_end = temp_end_;
184
+ tmp["end"] = return_seconds ? speech_end / sample_rates_ : speech_end;
185
+ temp_end_ = 0;
186
+ triggered_ = false;
187
+ }
188
+ }
189
+
190
+ // 移除已处理的数据
191
+ if (window_size_samples >= buffer_.size()) {
192
+ buffer_.clear(); // 全部丢弃
193
+ } else {
194
+ std::copy(buffer_.begin() + window_size_samples, buffer_.end(), buffer_.begin());
195
+ buffer_.resize(buffer_.size() - window_size_samples);
196
+ }
197
+
198
+ // 合并检测结果
199
+ if (result.empty()) {
200
+ result = tmp;
201
+ } else if (!tmp.empty()) {
202
+ // 如果当前结果有 'end',更新最终 end
203
+ if (tmp.find("end") != tmp.end()) {
204
+ result["end"] = tmp["end"];
205
+ }
206
+
207
+ // 如果有新的 start,但前一个有 end,则合并成连续语音段
208
+ if (tmp.find("start") != tmp.end() && result.find("end") != result.end()) {
209
+ result.erase("end");
210
+ }
211
+ }
212
+ }
213
+
214
+ return result;
215
+ }
216
+
217
+ void VadOnnx::init_onnx_model(const std::string& model_path) {
218
+ init_engine_threads(1, 1);
219
+ init_exec_provider();
220
+
221
+ // 初始化 ONNX Session
222
+ env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "VadOnnx");
223
+ session = std::make_unique<Ort::Session>(env_, ORTCHAR(model_path.c_str()), session_options);
224
+ }
225
+
226
+ void VadOnnx::init_engine_threads(int inter_threads, int intra_threads) {
227
+ session_options.SetInterOpNumThreads(inter_threads);
228
+ session_options.SetIntraOpNumThreads(intra_threads);
229
+ session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
230
+ }
231
+
232
+ void VadOnnx::init_exec_provider() {
233
+ // 获取所有可用的 Execution Providers
234
+ std::vector<std::string> providers = Ort::GetAvailableProviders();
235
+ // 根据支持情况添加 Execution Provider
236
+ if (std::find(providers.begin(), providers.end(), "CUDAExecutionProvider") != providers.end()) {
237
+ OrtCUDAProviderOptions cuda_options{};
238
+ session_options.AppendExecutionProvider_CUDA(cuda_options);
239
+ }
240
+ // #if defined(__APPLE__)
241
+ // if (std::find(providers.begin(), providers.end(), "CoreMLExecutionProvider") != providers.end()) {
242
+ // session_options.AppendExecutionProvider_CoreML();
243
+ // }
244
+ // #endif
245
+ }
vad_onnx/vad_onnx.h ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <string>
3
+ #include <map>
4
+
5
+ #if defined(__APPLE__)
6
+ #include <onnxruntime/onnxruntime_cxx_api.h>
7
+ #else
8
+ #include "onnxruntime_run_options_config_keys.h"
9
+ #include "onnxruntime_cxx_api.h"
10
+ #endif
11
+
12
+ #ifdef _WIN32
13
+
14
+ #define ORTSTRING(str) StrToWstr(str)
15
+ #define ORTCHAR(str) StrToWstr(str).c_str()
16
+
17
+ inline std::wstring String2wstring(const std::string& str, const std::string& locale)
18
+ {
19
+ typedef std::codecvt_byname<wchar_t, char, std::mbstate_t> F;
20
+ std::wstring_convert<F> strCnv(new F(locale));
21
+ return strCnv.from_bytes(str);
22
+ }
23
+
24
+ inline std::wstring StrToWstr(std::string str) {
25
+ if (str.length() == 0)
26
+ return L"";
27
+ return String2wstring(str, "zh-CN");
28
+
29
+ }
30
+
31
+ #else
32
+
33
+ #define ORTSTRING(str) str
34
+ #define ORTCHAR(str) str
35
+
36
+ #endif
37
+
38
+ class VadOnnx {
39
+
40
+ public:
41
+ explicit VadOnnx(const std::string& model_path,
42
+ int batch_size = 1,
43
+ int thread_num = 1,
44
+ float threshold = 0.5,
45
+ int sampling_rate = 16000,
46
+ int min_silence_duration_ms = 100,
47
+ float max_speech_duration_s = INFINITY,
48
+ int speech_pad_ms = 30);
49
+ ~VadOnnx();
50
+
51
+ // 处理固定长度音频(16000 -> 512 , 8000 -> 256)
52
+ float forward_infer(std::vector<float>& data_chunk);
53
+
54
+ // 处理整个长音频,返回概率
55
+ std::vector<float> vad_dectect(std::vector<float>& audio);
56
+
57
+ // 处理整个长音频,返回有效音频区间
58
+ std::map<std::string, double> vad_dectect(std::vector<float>& audio, bool return_seconds);
59
+
60
+ // 重置 RNN 状态
61
+ void reset_states();
62
+
63
+ private:
64
+ // onnx资源参数
65
+ Ort::Env env_;
66
+ Ort::SessionOptions session_options;
67
+ std::unique_ptr<Ort::Session> session = nullptr;
68
+ Ort::AllocatorWithDefaultOptions allocator;
69
+ Ort::MemoryInfo memory_info;
70
+ int thread_num_;
71
+
72
+ // onnx输入输出相关
73
+ std::vector<Ort::Value> ort_inputs, ort_outputs;
74
+ std::vector<std::string> input_names_, output_names_;
75
+ std::vector<const char *> vad_in_names_;
76
+ std::vector<const char *> vad_out_names_;
77
+
78
+ int window_size_samples = 512;
79
+ int effective_window_size;
80
+ std::array<int64_t, 2> input_node_shape;
81
+ std::vector<float> input;
82
+ std::array<int64_t, 3> state_shape = {2, 1, 128};
83
+ int state_size = 2 * 1 * 128;
84
+ std::vector<float> state_; // RNN State
85
+ int context_size;
86
+ std::vector<float> context_; // Context buffer
87
+ std::array<int64_t, 1> sr_shape = {1};
88
+ std::vector<int64_t> sr;
89
+
90
+ // vad推理参数
91
+ std::vector<float> buffer_; // 缓冲区用于保存未处理完的音频
92
+ double min_silence_samples_;
93
+ double speech_pad_samples_;
94
+ double temp_end_;
95
+ double current_sample_;
96
+ double start_;
97
+ float threshold_;
98
+ bool triggered_;
99
+ int batch_size_;
100
+ int sample_rates_;
101
+ int last_sr_ = 0;
102
+ int last_batch_size_ = 0;
103
+
104
+ void init_onnx_model(const std::string& model_path);
105
+ void init_engine_threads(int inter_threads, int intra_threads);
106
+ void init_exec_provider();
107
+ };