hzeng412 xianglarry committed on
Commit
d21d362
·
0 Parent(s):

Duplicate from MoYoYoTech/vad_cpp

Browse files

Co-authored-by: chenxiang <xianglarry@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
CMakeLists.txt ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Top-level build for the VAD ONNX wrappers.
# Configure with: cmake -DONNXRUNTIME_DIR=/path/to/onnxruntime ..
cmake_minimum_required(VERSION 3.16)
project(VadOnnx LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Optional: report endianness (relevant for raw audio sample handling).
include(TestBigEndian)
test_big_endian(BIG_ENDIAN)
if(BIG_ENDIAN)
  message(STATUS "Big endian system")
else()
  message(STATUS "Little endian system")
endif()

# Locate the ONNX Runtime installation; the caller must provide it.
# (${ONNXRUNTIME_DIR} is quoted so an unset/space-containing value
# does not break the EXISTS test.)
if(NOT DEFINED ONNXRUNTIME_DIR OR NOT EXISTS "${ONNXRUNTIME_DIR}")
  message(FATAL_ERROR "Please specify ONNXRUNTIME_DIR when configuring, e.g. cmake -DONNXRUNTIME_DIR=/path/to/onnxruntime ..")
endif()

# libsndfile is resolved through pkg-config; IMPORTED_TARGET exposes it
# as PkgConfig::SNDFILE for target_link_libraries in subdirectories.
find_package(PkgConfig REQUIRED)
pkg_check_modules(SNDFILE REQUIRED IMPORTED_TARGET sndfile)

# Wrapper following https://github.com/snakers4/silero-vad examples/cpp
add_subdirectory(silero_vad_onnx)

# Wrapper following moyoyo/translator python/helpers/vadprocessor.py
add_subdirectory(vad_onnx)

# Test / demo executables.
add_subdirectory(bin)
README.md ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ ## 简介
6
+ 这是一个对 silero_vad (https://github.com/snakers4/silero-vad) 的简易封装,便于开发。
7
+
8
+ ## 目录简介
9
+
10
+ ## 环境安装
11
+ ### 系统环境
12
+
13
+ > 1. 在macOS 14.5 版本已验证执行OK。
14
+ > 2. mac上安装 onnxruntime(brew 安装未使能 CoreMl, 需下载源码, 手动编译使能, 并安装)
15
+ ```bash
16
+ brew install onnxruntime
17
+ ```
18
+ > 3. 下载onnxruntime源码, 手动编译使能, 并安装
19
+ ```bash
20
+ brew install cmake protobuf python # 可选
21
+ git clone https://github.com/microsoft/onnxruntime
22
+ cd onnxruntime
23
+ ./build.sh --config Release --enable_coreml
24
+ # 或者 ./build.sh --config Release --enable_coreml --build_wheel --parallel
25
+ sudo ./install_to_system.sh # 如果不安装,库目录build/Release,头文件目录 build/Release/include/
26
+ ```
27
+ ## 目录简介
28
+
29
+ ```
30
+ .
31
+ ├── README.md
32
+ ├── bin/
33
+ │ ├──main_silero.cpp // 参照 silero_vad中cpp的example封装代码进行测试
34
+ │ ├── main.cpp // 参照 translator中FixedVADIterator封装代码进行测试
35
+ │ ├── wav.h // 定义读取 wav 文件类
36
+ │ └── ...
37
+ ├── python/
38
+ │ ├── processing.py // translator中FixedVADIterator的python脚本
39
+ │ └── ...
40
+ ├── reference/ // python、cpp参考代码
41
+ ├── silero_vad_onnx/ // 参照 silero_vad中cpp的封装
42
+ │ ├── time_stamp.cpp
43
+ │ ├── time_stamp.h
44
+ │ ├── vad_iterator.cpp
45
+ │ ├── vad_iterator.h
46
+ │ └── ...
47
+ ├── vad_onnx/ // 参照 translator中FixedVADIterator封装
48
+ │ ├── vad_onnx.cpp
49
+ │ ├── vad_onnx.h
50
+ │ └── ...
51
+ └── ...
52
+ ```
53
+
54
+ ## 编译
55
+ ```bash
56
+ git clone https://huggingface.co/MoYoYoTech/vad_cpp
57
+ cd vad_cpp
58
+ mkdir build
59
+ cd build
60
+ cmake .. -DONNXRUNTIME_DIR=/opt/homebrew/Cellar/onnxruntime/1.21.1 # 或者指定源码编译后的路径,包括 include 和 lib目录
61
+ make
62
+ ```
63
+
64
+ ## 运行&使用
65
+ ### 接口调用和使用参考 main_silero.cpp 和 main.cpp
66
+ ```bash
67
+ cd bin
68
+ # silero_vad_onnx.dylib 测试程序
69
+ ./main_silero "/Users/.../Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx" "/Users/xxx/zh.wav"
70
+ # vad_onnx.dylib 测试程序
71
+ ./main "/Users/.../Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx" "/Users/xxx/zh.wav"
72
+ ```
73
+
74
+ ### translator中FixedVADIterator的python测试脚本
75
+ ```bash
76
+ cd vad_cpp
77
+ # python/processing.py中配置 wav_path ; 在python/helpers/vadprocessor.py中配置 VAD_MODEL_PATH
78
+ python -m python.processing
79
+ # 结果显示
80
+ ....
81
+ 935936 ->>> {'start': 935456} -> {'start': 5664}
82
+ 984576 ->>> {'end': 983008} -> {'start': 5664, 'end': 983008}
83
+ strat: 5664 end: 983008
84
+ ```
bin/CMakeLists.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
set(CMAKE_CXX_STANDARD 17)

if(WIN32)
  # MSVC: treat both the source and execution character sets as UTF-8.
  add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/execution-charset:utf-8>")
  add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/source-charset:utf-8>")
endif()

# ONNX Runtime (imported target onnxruntime::onnxruntime).
find_package(onnxruntime REQUIRED)

# Demo for the vad_onnx wrapper (FixedVADIterator port).
add_executable(main "main.cpp")
# Target-scoped includes instead of directory-scoped include_directories().
target_include_directories(main PRIVATE
  "${ONNXRUNTIME_DIR}/include"
  "${PROJECT_SOURCE_DIR}/vad_onnx")
if(UNIX AND NOT APPLE)
  # Keep the ONNX Runtime shared library on the link line even if the
  # linker considers it unused.
  target_link_options(main PRIVATE "-Wl,--no-as-needed")
endif()
target_link_libraries(main PUBLIC vad_onnx onnxruntime::onnxruntime PkgConfig::SNDFILE)

# Demo for the silero_vad_onnx wrapper (silero-vad cpp example port).
add_executable(main_silero "main_silero.cpp")
target_include_directories(main_silero PRIVATE
  "${ONNXRUNTIME_DIR}/include"
  "${PROJECT_SOURCE_DIR}/silero_vad_onnx")
if(UNIX AND NOT APPLE)
  # BUG FIX: this option was previously applied to `main` a second time
  # instead of to `main_silero` (copy-paste error).
  target_link_options(main_silero PRIVATE "-Wl,--no-as-needed")
endif()
target_link_libraries(main_silero PUBLIC silero_vad_onnx onnxruntime::onnxruntime)
bin/main.cpp ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "vad_onnx.h"
2
+ #include <iostream>
3
+ #include <sndfile.h>
4
+
5
+
6
+ int main(int argc, char* argv[]) {
7
+ if (argc < 3) {
8
+ std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_file_absolute_path>" << std::endl;
9
+ return 1;
10
+ }
11
+
12
+ // 获取命令行传入的音频文件路径
13
+ std::string model_path = argv[1];
14
+ std::string wav_path = argv[2];
15
+
16
+ // std::string model_path = "/Users/chenxiang/translator/Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx";
17
+ // std::string wav_path = "/Users/chenxiang/translator/core/whisper_wrapper/bin/zh.wav";
18
+
19
+ // 加载音频文件
20
+ SF_INFO sf_info;
21
+ SNDFILE* file = sf_open(wav_path.c_str(), SFM_READ, &sf_info);
22
+
23
+ int samplerate = sf_info.samplerate;
24
+ int channels = sf_info.channels;
25
+ int frames = sf_info.frames;
26
+
27
+ std::vector<float> audio(frames * channels);
28
+ sf_readf_float(file, audio.data(), sf_info.frames);
29
+ sf_close(file);
30
+
31
+ // 创建目标 buffer 来保存 512 帧音频数据
32
+ std::vector<float> audio_512frames(audio.begin(), audio.begin() + 512);
33
+
34
+ try {
35
+ VadOnnx vad_model = VadOnnx(model_path);
36
+
37
+ // 输入一段音频数据(512 samples)
38
+ float result_512 = vad_model.forward_infer(audio_512frames);
39
+ std::cout << "result_512 = " << result_512 << std::endl;
40
+
41
+
42
+ std::vector<float> result_1 = vad_model.vad_dectect(audio);
43
+ if (!result_1.empty()) {
44
+ std::cout << "result_1.size = " << result_1.size() << std::endl;
45
+ for (int i = 0; i < 5 && i < result_1.size(); ++i) {
46
+ std::cout << result_1[i] << ", ";
47
+ }
48
+ std::cout << "(only show 5)" << std::endl;
49
+ }
50
+
51
+ std::map<std::string, double> result_map;
52
+ result_map = vad_model.vad_dectect(audio, false);
53
+ std::cerr << "result: " << std::endl;
54
+ if (!result_map.empty()) {
55
+ for (const auto& pair : result_map) {
56
+ std::cout << pair.first << " : " << pair.second << std::endl;
57
+ }
58
+ }
59
+
60
+ } catch (const std::exception& ex) {
61
+ std::cerr << "Error: " << ex.what() << std::endl;
62
+ }
63
+ // // 输出音频信息
64
+ // std::cout << "========= 音频信息 =========" << std::endl;
65
+ // std::cout << "采样率: " << samplerate << " Hz" << std::endl;
66
+ // std::cout << "通道数: " << channels << std::endl;
67
+ // std::cout << "总帧数: " << frames << std::endl;
68
+ // std::cout << "===========================" << std::endl;
69
+
70
+ return 0;
71
+ }
bin/main_silero.cpp ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include <vector>
3
+ #include <cmath>
4
+ #include <iomanip> // std::fixed, std::setprecision
5
+
6
+ // 自定义头文件
7
+ #include "wav.h" // 包含 wav::WavReader 定义
8
+ #include "time_stamp.h" // 包含 timestamp_t 定义
9
+ #include "vad_iterator.h" // 包含 VadIterator 类声明
10
+
11
+
12
+ int main(int argc, char* argv[]) {
13
+ if (argc < 3) {
14
+ std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_file_absolute_path>" << std::endl;
15
+ return 1;
16
+ }
17
+
18
+ // 获取命令行传入的音频文件路径
19
+ std::string model_path = argv[1];
20
+ std::string wav_path = argv[2];
21
+
22
+ // std::string model_path = "/Users/chenxiang/translator/Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx";
23
+ // std::string wav_path = "/Users/chenxiang/translator/core/whisper_wrapper/bin/zh.wav";
24
+
25
+ // Read the WAV file (expects 16000 Hz, mono, PCM).
26
+ wav::WavReader wav_reader(wav_path); // File located in the "audio" folder.
27
+ int numSamples = wav_reader.num_samples();
28
+ std::vector<float> input_wav(static_cast<size_t>(numSamples));
29
+ for (size_t i = 0; i < static_cast<size_t>(numSamples); i++) {
30
+ input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
31
+ }
32
+ // Initialize the VadIterator.
33
+ VadIterator vad(model_path);
34
+
35
+ // Process the audio.
36
+ vad.process(input_wav);
37
+
38
+ // Retrieve the speech timestamps (in samples).
39
+ std::vector<timestamp_t> stamps = vad.get_speech_timestamps();
40
+
41
+ // Convert timestamps to seconds and round to one decimal place (for 16000 Hz).
42
+ const float sample_rate_float = 16000.0f;
43
+ for (size_t i = 0; i < stamps.size(); i++) {
44
+ float start_sec = std::rint((stamps[i].start / sample_rate_float) * 10.0f) / 10.0f;
45
+ float end_sec = std::rint((stamps[i].end / sample_rate_float) * 10.0f) / 10.0f;
46
+ std::cout << "Speech detected from "
47
+ << std::fixed << std::setprecision(1) << start_sec
48
+ << " s to "
49
+ << std::fixed << std::setprecision(1) << end_sec
50
+ << " s"
51
+ << " [ " << stamps[i].start << " " << stamps[i].end <<" ]"
52
+ << std::endl;
53
+ }
54
+
55
+ // Optionally, reset the internal state.
56
+ vad.reset();
57
+
58
+ return 0;
59
+ }
bin/test_main.cpp ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "vad_onnx.h"
2
+ #include <iostream>
3
+ #include <sndfile.h>
4
+ #include <vector>
5
+ #include <map>
6
+ #include <fstream>
7
+ #include <string>
8
+
9
+ int main(int argc, char* argv[]) {
10
+ if (argc < 3) {
11
+ std::cerr << "Usage: " << argv[0] << " <model_absolute_path> <audio_list_absolute_path>" << std::endl;
12
+ return 1;
13
+ }
14
+
15
+ // 获取命令行传入的模型路径和音频列表文件路径
16
+ std::string model_path = argv[1];
17
+ std::string audio_list_path = argv[2];
18
+
19
+ // 打开 audio_list.txt 文件
20
+ std::ifstream audio_list_file(audio_list_path);
21
+ if (!audio_list_file.is_open()) {
22
+ std::cerr << "Error: Unable to open audio list file: " << audio_list_path << std::endl;
23
+ return 1;
24
+ }
25
+
26
+ try {
27
+ VadOnnx vad_model = VadOnnx(model_path);
28
+
29
+ // 逐行读取音频文件路径并处理
30
+ std::string wav_path;
31
+ while (std::getline(audio_list_file, wav_path)) {
32
+ if (wav_path.empty()) {
33
+ continue; // 跳过空行
34
+ }
35
+ vad_model.reset_states(); // 重置状态
36
+
37
+ std::cout << wav_path << std::endl;
38
+
39
+ // 加载音频文件
40
+ SF_INFO sf_info;
41
+ SNDFILE* file = sf_open(wav_path.c_str(), SFM_READ, &sf_info);
42
+ if (!file) {
43
+ std::cerr << "Error: Unable to open audio file: " << wav_path << std::endl;
44
+ continue; // 跳过无法打开的文件
45
+ }
46
+
47
+ int samplerate = sf_info.samplerate;
48
+ int channels = sf_info.channels;
49
+ int frames = sf_info.frames;
50
+
51
+ std::vector<float> audio_buffer(4096 * channels); // 用于存储每次读取的 4096 帧音频数据
52
+
53
+ try {
54
+ // 循环读取音频文件,每次读取 4096 帧
55
+ int read_frames = 0;
56
+ while ((read_frames = sf_readf_float(file, audio_buffer.data(), 4096)) > 0) {
57
+ // 如果实际读取的帧数小于 4096,则调整 buffer 大小
58
+ audio_buffer.resize(read_frames * channels);
59
+
60
+ // 推理
61
+ std::map<std::string, double> result_map = vad_model.vad_dectect(audio_buffer, false);
62
+
63
+ // 打印推理结果
64
+ if (!result_map.empty()) {
65
+ for (const auto& pair : result_map) {
66
+ std::cout << pair.first << ", " << pair.second << std::endl;
67
+ }
68
+ }
69
+ }
70
+
71
+ sf_close(file);
72
+
73
+ } catch (const std::exception& ex) {
74
+ std::cerr << "Error processing file " << wav_path << ": " << ex.what() << std::endl;
75
+ sf_close(file);
76
+ }
77
+ }
78
+
79
+ audio_list_file.close();
80
+
81
+ } catch (const std::exception& ex) {
82
+ std::cerr << "Error: " << ex.what() << std::endl;
83
+ return 1;
84
+ }
85
+
86
+ return 0;
87
+ }
bin/test_silero.cpp ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include <vector>
3
+ #include <cmath>
4
+ #include <iomanip> // std::fixed, std::setprecision
5
+ #include <fstream> // std::ifstream
6
+ #include <string> // std::string
7
+
8
+ // 自定义头文件
9
+ #include "wav.h" // 包含 wav::WavReader 定义
10
+ #include "time_stamp.h" // 包含 timestamp_t 定义
11
+ #include "vad_iterator.h" // 包含 VadIterator 类声明
12
+
13
+ int main(int argc, char* argv[]) {
14
+ if (argc < 3) {
15
+ std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_list_absolute_path>" << std::endl;
16
+ return 1;
17
+ }
18
+
19
+ // 获取命令行传入的模型路径和音频列表文件路径
20
+ std::string model_path = argv[1];
21
+ std::string audio_list_path = argv[2];
22
+
23
+ // 打开 audio_list.txt 文件
24
+ std::ifstream audio_list_file(audio_list_path);
25
+ if (!audio_list_file.is_open()) {
26
+ std::cerr << "Error: Unable to open audio list file: " << audio_list_path << std::endl;
27
+ return 1;
28
+ }
29
+
30
+ // 初始化 VadIterator
31
+ VadIterator vad(model_path);
32
+
33
+ // 逐行读取音频文件路径并处理
34
+ std::string wav_path;
35
+ while (std::getline(audio_list_file, wav_path)) {
36
+ if (wav_path.empty()) {
37
+ continue; // 跳过空行
38
+ }
39
+
40
+ std::cout << wav_path << std::endl;
41
+
42
+ try {
43
+ // 读取 WAV 文件 (expects 16000 Hz, mono, PCM)
44
+ wav::WavReader wav_reader(wav_path);
45
+ int numSamples = wav_reader.num_samples();
46
+ std::vector<float> input_wav(static_cast<size_t>(numSamples));
47
+ for (size_t i = 0; i < static_cast<size_t>(numSamples); i++) {
48
+ input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
49
+ }
50
+
51
+ // 处理音频
52
+ vad.process(input_wav);
53
+ // 获取语音时间戳 (以样本为单位)
54
+ std::vector<timestamp_t> stamps = vad.get_speech_timestamps();
55
+
56
+ // 将时间戳转换为秒并输出
57
+ const float sample_rate_float = 16000.0f;
58
+ for (size_t i = 0; i < stamps.size(); i++) {
59
+ float start_sec = std::rint((stamps[i].start / sample_rate_float) * 10.0f) / 10.0f;
60
+ float end_sec = std::rint((stamps[i].end / sample_rate_float) * 10.0f) / 10.0f;
61
+ // std::cout << "Speech detected from "
62
+ // << std::fixed << std::setprecision(1) << start_sec
63
+ // << " s to "
64
+ // << std::fixed << std::setprecision(1) << end_sec
65
+ // << " s"
66
+ // << " [ " << stamps[i].start << " " << stamps[i].end << " ]"
67
+ // << std::endl;
68
+ std::cout << stamps[i].start << ", " << stamps[i].end << std::endl;
69
+ }
70
+
71
+ // 重置内部状态
72
+ vad.reset();
73
+ } catch (const std::exception& e) {
74
+ std::cerr << "Error processing file " << wav_path << ": " << e.what() << std::endl;
75
+ }
76
+ }
77
+
78
+ audio_list_file.close();
79
+ return 0;
80
+ }
bin/wav.h ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
// Copyright (c) 2016 Personal (Binbin Zhang)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef FRONTEND_WAV_H_
#define FRONTEND_WAV_H_

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>

#include <iostream>

namespace wav {

// Canonical 44-byte RIFF/WAVE header layout (PCM, fmt chunk of size 16).
struct WavHeader {
  char riff[4];  // "RIFF"
  unsigned int size;
  char wav[4];  // "WAVE"
  char fmt[4];  // "fmt "
  unsigned int fmt_size;
  uint16_t format;  // 1 = integer PCM, 3 = IEEE float
  uint16_t channels;
  unsigned int sample_rate;
  unsigned int bytes_per_second;
  uint16_t block_size;
  uint16_t bit;  // bits per sample
  char data[4];  // "data"
  unsigned int data_size;
};

// Reads a WAV file into an interleaved float buffer.
// 16-bit samples are scaled by 1/32768 into roughly [-1, 1).
class WavReader {
 public:
  WavReader() : data_(nullptr) {}
  // BUG FIX: data_ was previously left uninitialized here; if Open() failed
  // before allocating, the destructor deleted a garbage pointer.
  explicit WavReader(const std::string& filename) : data_(nullptr) {
    Open(filename);
  }

  // Opens and fully decodes `filename`. Returns false on any error.
  bool Open(const std::string& filename) {
    FILE* fp = fopen(filename.c_str(), "rb");  // binary read
    if (NULL == fp) {
      std::cout << "Error in read " << filename;
      return false;
    }

    WavHeader header;
    // BUG FIX: the fread result was ignored; a truncated file left `header`
    // uninitialized.
    if (fread(&header, 1, sizeof(header), fp) != sizeof(header)) {
      printf("WaveData: file too short for a WAV header.\n");
      fclose(fp);
      return false;
    }
    if (header.fmt_size < 16) {
      printf("WaveData: expect PCM format data "
             "to have fmt chunk of at least size 16.\n");
      fclose(fp);  // BUG FIX: fp was leaked on this early return
      return false;
    } else if (header.fmt_size > 16) {
      // Extended fmt chunk: seek past the extra bytes and re-read the
      // following sub-chunk id + size into header.data/data_size.
      int offset = 44 - 8 + header.fmt_size - 16;
      fseek(fp, offset, SEEK_SET);
      if (fread(header.data, 8, sizeof(char), fp) != sizeof(char)) {
        printf("WaveData: file ends before data chunk.\n");
        fclose(fp);
        return false;
      }
    }

    // Skip any sub-chunks between "fmt" and "data". Usually there will
    // be a single "fact" sub chunk, but on Windows there can also be a
    // "list" sub chunk.
    while (0 != strncmp(header.data, "data", 4)) {
      fseek(fp, header.data_size, SEEK_CUR);
      // BUG FIX: previously an ignored fread at EOF left header.data
      // unchanged and this loop never terminated.
      if (fread(header.data, 8, sizeof(char), fp) != sizeof(char)) {
        printf("WaveData: no data chunk found.\n");
        fclose(fp);
        return false;
      }
    }

    // Some writers leave data_size zero; infer it from the file length.
    if (header.data_size == 0) {
      int offset = ftell(fp);
      fseek(fp, 0, SEEK_END);
      header.data_size = ftell(fp) - offset;
      fseek(fp, offset, SEEK_SET);
    }

    num_channel_ = header.channels;
    sample_rate_ = header.sample_rate;
    bits_per_sample_ = header.bit;
    int num_data = header.data_size / (bits_per_sample_ / 8);
    delete[] data_;  // BUG FIX: previous buffer leaked when Open() was reused
    data_ = new float[num_data];  // interleaved samples, all channels
    num_samples_ = num_data / num_channel_;

    std::cout << "num_channel_ :" << num_channel_ << std::endl;
    std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
    std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
    std::cout << "num_samples :" << num_data << std::endl;
    std::cout << "num_data_size :" << header.data_size << std::endl;

    switch (bits_per_sample_) {
      case 8: {
        // BUG FIX: 8-bit WAV data is unsigned [0,255]; it was previously read
        // as signed char and scaled by 1/32768, yielding near-zero garbage.
        uint8_t sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(sample), fp);
          data_[i] = (static_cast<float>(sample) - 128.0f) / 128.0f;
        }
        break;
      }
      case 16: {
        int16_t sample;
        for (int i = 0; i < num_data; ++i) {
          fread(&sample, 1, sizeof(int16_t), fp);
          data_[i] = static_cast<float>(sample) / 32768;
        }
        break;
      }
      case 32: {
        if (header.format == 1) {  // 32-bit integer PCM
          // BUG FIX: was scaled by 1/32768, producing values far outside
          // [-1,1]; use the full int32 range, consistent with the 16-bit path.
          int32_t sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(sample), fp);
            data_[i] = static_cast<float>(sample) / 2147483648.0f;
          }
        } else if (header.format == 3) {  // IEEE float, stored as-is
          float sample;
          for (int i = 0; i < num_data; ++i) {
            fread(&sample, 1, sizeof(float), fp);
            data_[i] = static_cast<float>(sample);
          }
        } else {
          printf("unsupported quantization bits\n");
        }
        break;
      }
      default:
        printf("unsupported quantization bits\n");
        break;
    }

    fclose(fp);
    return true;
  }

  int num_channel() const { return num_channel_; }
  int sample_rate() const { return sample_rate_; }
  int bits_per_sample() const { return bits_per_sample_; }
  int num_samples() const { return num_samples_; }

  ~WavReader() {
    delete[] data_;
  }

  const float* data() const { return data_; }

 private:
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
  int num_samples_;  // sample points per channel
  float* data_;      // interleaved decoded samples (owned)
};

// Writes an interleaved float buffer out as an integer-PCM WAV file.
// Samples are truncated to the target bit depth; no rescaling is applied.
class WavWriter {
 public:
  WavWriter(const float* data, int num_samples, int num_channel,
            int sample_rate, int bits_per_sample)
      : data_(data),
        num_samples_(num_samples),
        num_channel_(num_channel),
        sample_rate_(sample_rate),
        bits_per_sample_(bits_per_sample) {}

  void Write(const std::string& filename) {
    // BUG FIX: was opened with "w"; text mode corrupts binary data on
    // Windows via CRLF translation.
    FILE* fp = fopen(filename.c_str(), "wb");
    if (NULL == fp) {
      printf("Error in write %s\n", filename.c_str());
      return;
    }
    // Template header with 'RIFF' 'WAVE' 'fmt ' 'data' tags pre-filled;
    // the numeric fields are patched below.
    WavHeader header;
    char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
                           0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
                           0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                           0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
    memcpy(&header, wav_header, sizeof(header));
    header.channels = num_channel_;
    header.bit = bits_per_sample_;
    header.sample_rate = sample_rate_;
    header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
    header.size = sizeof(header) - 8 + header.data_size;
    header.bytes_per_second =
        sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
    header.block_size = num_channel_ * (bits_per_sample_ / 8);

    fwrite(&header, 1, sizeof(header), fp);

    for (int i = 0; i < num_samples_; ++i) {
      for (int j = 0; j < num_channel_; ++j) {
        switch (bits_per_sample_) {
          case 8: {
            char sample = static_cast<char>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 16: {
            int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
          case 32: {
            int sample = static_cast<int>(data_[i * num_channel_ + j]);
            fwrite(&sample, 1, sizeof(sample), fp);
            break;
          }
        }
      }
    }
    fclose(fp);
  }

 private:
  const float* data_;   // caller-owned interleaved samples (raw PCM values)
  int num_samples_;     // frames per channel
  int num_channel_;
  int sample_rate_;
  int bits_per_sample_;
};

}  // namespace wav

#endif  // FRONTEND_WAV_H_
python/__inip__.py ADDED
File without changes
python/__pycache__/processing.cpython-312.pyc ADDED
Binary file (3.25 kB). View file
 
python/helpers/__init__.py ADDED
File without changes
python/helpers/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (164 Bytes). View file
 
python/helpers/__pycache__/vadprocessor.cpython-312.pyc ADDED
Binary file (27.6 kB). View file
 
python/helpers/vadprocessor.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ from time import time
3
+ # from config import VAD_MODEL_PATH
4
+ # from silero_vad import load_silero_vad
5
+ import numpy as np
6
+ import onnxruntime
7
+ import logging
8
+ from datetime import timedelta
9
+ import gc
10
+ # from pydub import AudioSegment
11
+ from collections import deque
12
+
13
+ VAD_MODEL_PATH = "/Users/xxx/Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx"
14
+
15
class AdaptiveSilenceController:
    """Adapts the VAD silence cutoff to the speaker's recent pacing.

    Tracks the durations (ms) of the last 20 speech and silence segments
    and derives a silence threshold clamped to [min_ms, max_ms]: fast
    speakers (short speech bursts) get a shorter cutoff.
    """

    def __init__(self, base_silence_ms=120, min_ms=50, max_ms=600):
        self.base = base_silence_ms
        self.min = min_ms
        self.max = max_ms
        # Rolling windows of the most recent segment durations (ms).
        self.recent_silences = deque(maxlen=20)
        self.recent_speeches = deque(maxlen=20)

    def update_silence(self, duration_ms):
        """Record the duration (ms) of a completed silence segment."""
        self.recent_silences.append(duration_ms)

    def update_speech(self, duration_ms):
        """Record the duration (ms) of a completed speech segment."""
        self.recent_speeches.append(duration_ms)

    def get_adaptive_silence_ms(self):
        """Return the current silence cutoff in ms, clamped to [min, max]."""
        # Fall back to the base duration until any samples have arrived.
        avg_speech = np.mean(self.recent_speeches) if self.recent_speeches else self.base
        avg_silence = np.mean(self.recent_silences) if self.recent_silences else self.base

        # Short average speech bursts (fast talker) shrink the cutoff.
        if avg_speech < 300:
            speed_factor = 0.5
        elif avg_speech < 600:
            speed_factor = 0.8
        else:
            speed_factor = 1.0
        logging.warning(f"Avg speech :{avg_speech}, Avg silence: {avg_silence}")

        # Blend the recent-silence trend into the scaled base, then clamp.
        adaptive = self.base * speed_factor + 0.3 * avg_silence
        return int(max(self.min, min(self.max, adaptive)))
45
+
46
+
47
class OnnxWrapper():
    """ONNX Runtime wrapper for the silero-vad model.

    Holds the recurrent state (2, batch, 128) and a trailing audio context
    between calls so consecutive fixed-size chunks are processed as a
    continuous stream. Only 16000 Hz (or integer multiples, which are
    decimated) is accepted; 512-sample chunks at 16 kHz.
    """

    def __init__(self, path, force_onnx_cpu=False):
        # Single-threaded session: one intra/inter op thread each.
        opts = onnxruntime.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        # Pin to CPU only when requested AND the CPU provider is available.
        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
        else:
            self.session = onnxruntime.InferenceSession(path, sess_options=opts)

        self.reset_states()
        self.sample_rates = [16000]

    def _validate_input(self, x: np.ndarray, sr: int):
        """Normalize input to shape (batch, samples) and sr to 16000.

        Raises ValueError for >2-D input, unsupported sample rates, or
        chunks shorter than 32 ms (sr / samples > 31.25).
        """
        if x.ndim == 1:
            x = x[None]
        if x.ndim > 2:
            raise ValueError(f"Too many dimensions for input audio chunk {x.ndim}")

        # Integer multiples of 16 kHz are decimated by simple striding.
        if sr != 16000 and (sr % 16000 == 0):
            step = sr // 16000
            x = x[:, ::step]
            sr = 16000

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)")
        if sr / x.shape[1] > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        """Clear the recurrent state and trailing context between streams."""
        # Model recurrent state: shape (2, batch, 128), float32.
        self._state = np.zeros((2, batch_size, 128)).astype(np.float32)
        # Empty context; lazily re-allocated on the next __call__.
        self._context = np.zeros(0)
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        """Run one chunk through the model; returns the session's first output.

        The chunk must be exactly 512 samples at 16 kHz (256 at 8 kHz).
        State is reset automatically when the sample rate or batch size
        changes between calls.
        """

        x, sr = self._validate_input(x, sr)
        num_samples = 512 if sr == 16000 else 256

        if x.shape[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")

        batch_size = x.shape[0]
        # 64 trailing samples of the previous chunk are prepended at 16 kHz.
        context_size = 64 if sr == 16000 else 32

        # Reset on first call or whenever sr / batch size changed.
        if not self._last_batch_size:
            self.reset_states(batch_size)
        if (self._last_sr) and (self._last_sr != sr):
            self.reset_states(batch_size)
        if (self._last_batch_size) and (self._last_batch_size != batch_size):
            self.reset_states(batch_size)

        # First call after a reset: start from a silent context.
        if not len(self._context):
            self._context = np.zeros((batch_size, context_size)).astype(np.float32)

        # Prepend the trailing context of the previous chunk.
        x = np.concatenate([self._context, x], axis=1)
        if sr in [8000, 16000]:
            ort_inputs = {'input': x, 'state': self._state, 'sr': np.array(sr, dtype='int64')}
            ort_outs = self.session.run(None, ort_inputs)
            out, state = ort_outs
            self._state = state  # carry the recurrent state forward
        else:
            raise ValueError()

        # Keep the last context_size samples for the next call.
        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        # out = torch.from_numpy(out)
        return out

    def audio_forward(self, audio: np.ndarray, sr: int):
        """Run the model over a whole buffer, chunk by chunk.

        Resets state first, zero-pads the tail to a multiple of the chunk
        size, and returns the per-chunk outputs concatenated on axis 1.
        """
        outs = []
        x, sr = self._validate_input(audio, sr)
        self.reset_states()
        num_samples = 512 if sr == 16000 else 256

        # Zero-pad so the buffer splits into whole chunks.
        if x.shape[1] % num_samples:
            pad_num = num_samples - (x.shape[1] % num_samples)
            x = np.pad(x, ((0, 0), (0, pad_num)), 'constant', constant_values=(0.0, 0.0))

        for i in range(0, x.shape[1], num_samples):
            wavs_batch = x[:, i:i + num_samples]
            out_chunk = self.__call__(wavs_batch, sr)
            outs.append(out_chunk)

        stacked = np.concatenate(outs, axis=1)
        return stacked
141
+
142
+
143
+ class VADIteratorOnnx:
144
+ def __init__(self,
145
+ threshold: float = 0.5,
146
+ sampling_rate: int = 16000,
147
+ min_silence_duration_ms: int = 100,
148
+ max_speech_duration_s: float = float('inf'),
149
+ speech_pad_ms: int = 30
150
+ ):
151
+ self.model = OnnxWrapper(VAD_MODEL_PATH, True)
152
+ self.threshold = threshold
153
+ self.sampling_rate = sampling_rate
154
+
155
+ if sampling_rate not in [8000, 16000]:
156
+ raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
157
+
158
+ self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
159
+ # self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
160
+ self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
161
+ self.reset_states()
162
+
163
+ def reset_states(self):
164
+
165
+ self.model.reset_states()
166
+ self.triggered = False
167
+ self.temp_end = 0
168
+ self.current_sample = 0
169
+ self.start = 0
170
+
171
+ def __call__(self, x: np.ndarray, return_seconds=False):
172
+ """
173
+ x: np.ndarray
174
+ audio chunk (see examples in repo)
175
+
176
+ return_seconds: bool (default - False)
177
+ whether return timestamps in seconds (default - samples)
178
+ """
179
+
180
+ window_size_samples = 512 if self.sampling_rate == 16000 else 256
181
+ x = x[:window_size_samples]
182
+ if len(x) < window_size_samples:
183
+ x = np.pad(x, ((0, 0), (0, window_size_samples - len(x))), 'constant', constant_values=0.0)
184
+
185
+ self.current_sample += window_size_samples
186
+
187
+ speech_prob = self.model(x, self.sampling_rate)[0,0]
188
+
189
+
190
+ if (speech_prob >= self.threshold) and self.temp_end:
191
+ self.temp_end = 0
192
+
193
+ if (speech_prob >= self.threshold) and not self.triggered:
194
+ self.triggered = True
195
+ # speech_start = max(0, self.current_sample - window_size_samples)
196
+ speech_start = max(0, self.current_sample - self.speech_pad_samples - window_size_samples)
197
+ self.start = speech_start
198
+ return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
199
+
200
+ # if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
201
+ # if self.temp_end:
202
+ # self.temp_end = 0
203
+ # self.start = self.current_sample
204
+ # return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
205
+
206
+ if (speech_prob < self.threshold - 0.15) and self.triggered:
207
+ if not self.temp_end:
208
+ self.temp_end = self.current_sample
209
+ if self.current_sample - self.temp_end < self.min_silence_samples:
210
+ return None
211
+ else:
212
+ # speech_end = self.temp_end - window_size_samples
213
+ speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
214
+ self.temp_end = 0
215
+ self.triggered = False
216
+ return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
217
+
218
+ return None
219
+
220
+
221
+ class FixedVADIterator(VADIteratorOnnx):
222
+ '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
223
+ If audio to be processed at once is long and multiple voiced segments detected,
224
+ then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
225
+ '''
226
+
227
+ def reset_states(self):
228
+ super().reset_states()
229
+ self.buffer = np.array([],dtype=np.float32)
230
+
231
+ def __call__(self, x, return_seconds=False):
232
+ self.buffer = np.append(self.buffer, x)
233
+ # print(f"len(self.buffer): {len(self.buffer)}")
234
+ ret = None
235
+ i = 0
236
+ while len(self.buffer) >= 512:
237
+ # print(f"len(self.buffer): {len(self.buffer)}")
238
+ r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
239
+ # print(f"super() : {r}")
240
+ self.buffer = self.buffer[512:]
241
+ if ret is None:
242
+ print(f"{i*512} ->>> {r} ->=== {ret}")
243
+ ret = r
244
+ elif r is not None:
245
+ if 'end' in r:
246
+ ret['end'] = r['end'] # the latter end
247
+ print(f"{i*512} ->>> {r} -> {ret}")
248
+ if 'start' in r and 'end' in ret: # there is an earlier start.
249
+ # Remove end, merging this segment with the previous one.
250
+ # print(f"{i*512} ->>>del {r} -> {ret}")
251
+ del ret['end']
252
+ print(f"{i*512} ->>> {r} -> {ret}")
253
+ # else:
254
+ # # print(f"{i*512} ->>> {r} -> {ret}")
255
+ i += 1
256
+ # print(f"FixedVADIterator output : {ret}")
257
+ return ret if ret != {} else None
258
+
259
+ class VadV2:
260
+ def __init__(self,
261
+ threshold: float = 0.5,
262
+ sampling_rate: int = 16000,
263
+ min_silence_duration_ms: int = 100,
264
+ speech_pad_ms: int = 30,
265
+ max_speech_duration_s: float = float('inf')):
266
+ # self.vad_iterator = VADIterator(threshold, sampling_rate, min_silence_duration_ms)
267
+ self.vad_iterator = VADIteratorOnnx(threshold, sampling_rate, min_silence_duration_ms, max_speech_duration_s)
268
+ self.speech_pad_samples = int(sampling_rate * speech_pad_ms / 1000)
269
+ self.sampling_rate = sampling_rate
270
+ self.audio_buffer = np.array([], dtype=np.float32)
271
+ self.start = 0
272
+ self.end = 0
273
+ self.offset = 0
274
+ assert speech_pad_ms <= min_silence_duration_ms, "speech_pad_ms should be less than min_silence_duration_ms"
275
+ self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
276
+
277
+ self.silence_chunk_size = 0
278
+ self.silence_chunk_threshold = 60 / (512 / self.sampling_rate)
279
+
280
+ def reset(self):
281
+ self.audio_buffer = np.array([], dtype=np.float32)
282
+ self.start = 0
283
+ self.end = 0
284
+ self.offset = 0
285
+ self.vad_iterator.reset_states()
286
+
287
+ def __call__(self, x: np.ndarray = None):
288
+ if x is None:
289
+ if self.start:
290
+ start = max(self.offset, self.start - self.speech_pad_samples)
291
+ end = self.offset + len(self.audio_buffer)
292
+ start_ts = round(start / self.sampling_rate, 1)
293
+ end_ts = round(end / self.sampling_rate, 1)
294
+ audio_data = self.audio_buffer[start - self.offset: end - self.offset]
295
+ result = {
296
+ "start": start_ts,
297
+ "end": end_ts,
298
+ "audio": audio_data,
299
+ }
300
+ else:
301
+ result = None
302
+ self.reset()
303
+ return result
304
+
305
+ self.audio_buffer = np.append(self.audio_buffer, deepcopy(x))
306
+
307
+ result = self.vad_iterator(x)
308
+ if result is not None:
309
+ # self.start = result.get('start', self.start)
310
+ # self.end = result.get('end', self.end)
311
+ self.silence_chunk_size = 0
312
+
313
+ if 'start' in result:
314
+ self.start = result['start']
315
+ if 'end' in result:
316
+ self.end = result['end']
317
+ else:
318
+ self.silence_chunk_size += 1
319
+
320
+ if self.start == 0 and len(self.audio_buffer) > self.speech_pad_samples:
321
+ self.offset += len(self.audio_buffer) - self.speech_pad_samples
322
+ self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]
323
+
324
+ if self.silence_chunk_size >= self.silence_chunk_threshold:
325
+ self.offset += len(self.audio_buffer) - self.speech_pad_samples
326
+ self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]
327
+ self.silence_chunk_size = 0
328
+
329
+ if self.end > self.start:
330
+ start = max(self.offset, self.start - self.speech_pad_samples)
331
+ end = self.end + self.speech_pad_samples
332
+ start_ts = round(start / self.sampling_rate, 1)
333
+ end_ts = round(end / self.sampling_rate, 1)
334
+ audio_data = self.audio_buffer[start - self.offset: end - self.offset]
335
+ self.audio_buffer = self.audio_buffer[self.end - self.offset:]
336
+ self.offset = self.end
337
+ self.start = self.end
338
+ # self.start = 0
339
+ self.end = 0
340
+ result = {
341
+ "start": start_ts,
342
+ "end": end_ts,
343
+ "audio": audio_data,
344
+ }
345
+
346
+ return result
347
+ return None
348
+
349
+
350
+ class SileroVADProcessor:
351
+ """
352
+ A class for processing audio files using Silero VAD to detect voice activity
353
+ and extract voice segments from audio files.
354
+ """
355
+
356
+ def __init__(self,
357
+ activate_threshold=0.5,
358
+ fusion_threshold=0.3,
359
+ min_speech_duration=0.25,
360
+ max_speech_duration=20,
361
+ min_silence_duration=250,
362
+ sample_rate=16000,
363
+ ort_providers=None):
364
+ """
365
+ Initialize the SileroVADProcessor.
366
+ Args:
367
+ activate_threshold (float): Threshold for voice activity detection
368
+ fusion_threshold (float): Threshold for merging close speech segments (seconds)
369
+ min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
370
+ max_speech_duration (float): Maximum duration of speech (seconds)
371
+ min_silence_duration (int): Minimum silence duration (ms)
372
+ sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
373
+ ort_providers (list): ONNX Runtime providers for acceleration
374
+ """
375
+ # VAD parameters
376
+ self.activate_threshold = activate_threshold
377
+ self.fusion_threshold = fusion_threshold
378
+ self.min_speech_duration = min_speech_duration
379
+ self.max_speech_duration = max_speech_duration
380
+ self.min_silence_duration = min_silence_duration
381
+ self.sample_rate = sample_rate
382
+ self.ort_providers = ort_providers if ort_providers else []
383
+
384
+ # Initialize logger
385
+ self.logger = logging.getLogger(__name__)
386
+
387
+ # Load Silero VAD model
388
+ self._init_onnx_session()
389
+ self.silero_vad = load_silero_vad(onnx=True)
390
+
391
+ def _init_onnx_session(self):
392
+ """Initialize ONNX Runtime session with appropriate settings."""
393
+ session_opts = onnxruntime.SessionOptions()
394
+ session_opts.log_severity_level = 3
395
+ session_opts.inter_op_num_threads = 0
396
+ session_opts.intra_op_num_threads = 0
397
+ session_opts.enable_cpu_mem_arena = True
398
+ session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
399
+ session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
400
+
401
+ session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
402
+ session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
403
+ session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
404
+
405
+ # Set the session_opts to be used by silero_vad
406
+ # onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
407
+
408
+ def load_audio(self, audio_path):
409
+ """
410
+ Load audio file and prepare it for VAD processing.
411
+ Args:
412
+ audio_path (str): Path to the audio file
413
+ Returns:
414
+ numpy.ndarray: Audio data as numpy array
415
+ """
416
+ self.logger.info(f"Loading audio from {audio_path}")
417
+ audio_segment = AudioSegment.from_file(audio_path)
418
+ audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
419
+
420
+ # Convert to numpy array and normalize
421
+ dtype = np.float16 if self.use_gpu_fp16 else np.float32
422
+ audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578 # 1/32768
423
+
424
+ self.audio_segment = audio_segment # Store for later use
425
+ return audio_array
426
+
427
+ @property
428
+ def model(self):
429
+ return self.silero_vad
430
+
431
+ def process_timestamps(self, timestamps):
432
+ """
433
+ Process VAD timestamps: filter short segments and merge close segments.
434
+ Args:
435
+ timestamps (list): List of (start, end) tuples
436
+ Returns:
437
+ list: Processed list of (start, end) tuples
438
+ """
439
+ # Filter out short durations
440
+ filtered_timestamps = [(start, end) for start, end in timestamps
441
+ if (end - start) >= self.min_speech_duration]
442
+
443
+ # Fuse timestamps in two passes for better merging
444
+ fused_timestamps_1st = []
445
+ for start, end in filtered_timestamps:
446
+ if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
447
+ fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
448
+ else:
449
+ fused_timestamps_1st.append((start, end))
450
+
451
+ fused_timestamps_2nd = []
452
+ for start, end in fused_timestamps_1st:
453
+ if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
454
+ fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
455
+ else:
456
+ fused_timestamps_2nd.append((start, end))
457
+
458
+ return fused_timestamps_2nd
459
+
460
+ def format_time(self, seconds):
461
+ """
462
+ Convert seconds to VTT time format 'hh:mm:ss.mmm'.
463
+ Args:
464
+ seconds (float): Time in seconds
465
+ Returns:
466
+ str: Formatted time string
467
+ """
468
+ td = timedelta(seconds=seconds)
469
+ td_sec = td.total_seconds()
470
+ total_seconds = int(td_sec)
471
+ milliseconds = int((td_sec - total_seconds) * 1000)
472
+ hours = total_seconds // 3600
473
+ minutes = (total_seconds % 3600) // 60
474
+ seconds = total_seconds % 60
475
+ return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
476
+
477
+ def detect_speech(self, audio:np.array):
478
+ """
479
+ Run VAD on the audio file to detect speech segments.
480
+ Args:
481
+ audio_path (str): Path to the audio file
482
+ Returns:
483
+ list: List of processed timestamps as (start, end) tuples
484
+ """
485
+ self.logger.info("Starting VAD process")
486
+ start_time = time.time()
487
+ # Get speech timestamps
488
+ raw_timestamps = get_speech_timestamps(
489
+ audio,
490
+ model=self.silero_vad,
491
+ threshold=self.activate_threshold,
492
+ max_speech_duration_s=self.max_speech_duration,
493
+ min_speech_duration_ms=int(self.min_speech_duration * 1000),
494
+ min_silence_duration_ms=self.min_silence_duration,
495
+ return_seconds=True
496
+ )
497
+
498
+ # Convert to simple format and process
499
+ timestamps = [(item['start'], item['end']) for item in raw_timestamps]
500
+ processed_timestamps = self.process_timestamps(timestamps)
501
+
502
+ # Clean up
503
+ del audio
504
+ gc.collect()
505
+
506
+ self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
507
+ return processed_timestamps
508
+
509
+ """
510
+ Save timestamps in both second and sample indices formats.
511
+ Args:
512
+ timestamps (list): List of (start, end) tuples
513
+ output_prefix (str): Prefix for output files
514
+ """
515
+ # Save timestamps in seconds (VTT format)
516
+ seconds_path = f"{output_prefix}_timestamps_second.txt"
517
+ with open(seconds_path, "w", encoding='UTF-8') as file:
518
+ self.logger.info("Saving timestamps in seconds format")
519
+ for start, end in timestamps:
520
+ s_time = self.format_time(start)
521
+ e_time = self.format_time(end)
522
+ line = f"{s_time} --> {e_time}\n"
523
+ file.write(line)
524
+
525
+ # Save timestamps in sample indices
526
+ indices_path = f"{output_prefix}_timestamps_indices.txt"
527
+ with open(indices_path, "w", encoding='UTF-8') as file:
528
+ self.logger.info("Saving timestamps in indices format")
529
+ for start, end in timestamps:
530
+ line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
531
+ file.write(line)
532
+
533
+ self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
534
+
535
+ def extract_speech_segments(self, audio_segment, timestamps):
536
+ """
537
+ Extract speech segments from the audio and combine them into a single audio file.
538
+ Args:
539
+ timestamps (list): List of (start, end) tuples indicating speech segments
540
+ Returns:
541
+ AudioSegment: The combined speech segments
542
+ """
543
+ audio_segment = audio_segment.numpy()
544
+ combined_speech = np.array([], dtype=np.float32)
545
+
546
+ # Extract and combine each speech segment
547
+ for i, (start, end) in enumerate(timestamps):
548
+ # Convert seconds to milliseconds for pydub
549
+ start_ms = int(start * 1000)
550
+ end_ms = int(end * 1000)
551
+
552
+ # Ensure the end time does not exceed the length of the audio segment
553
+ if end_ms > len(audio_segment):
554
+ end_ms = len(audio_segment)
555
+
556
+ # Extract the segment
557
+ segment = audio_segment[start_ms:end_ms]
558
+
559
+ # Add to combined audio
560
+ combined_speech = np.append(combined_speech, segment)
561
+
562
+ return combined_speech
563
+
564
+ def process_audio(self, audio_array:np.array):
565
+ """
566
+ Complete processing pipeline: detect speech, save timestamps, and optionally extract speech.
567
+ Returns:
568
+ tuple: (timestamps, output_speech_path if extract_speech else None)
569
+ """
570
+
571
+ # Run VAD to detect speech
572
+ timestamps = self.detect_speech(audio_array)
573
+
574
+ combined_speech = self.extract_speech_segments(audio_array, timestamps)
575
+
576
+ return timestamps, combined_speech
577
+
578
+
579
+
580
+ class VadProcessor:
581
+ def __init__(
582
+ self,
583
+ prob_threshold=0.5,
584
+ silence_s=0.2,
585
+ cache_s=0.15,
586
+ sr=16000
587
+ ):
588
+ self.prob_threshold = prob_threshold
589
+ self.cache_s = cache_s
590
+ self.sr = sr
591
+ self.silence_s = silence_s
592
+
593
+ self.vad = VadV2(self.prob_threshold, self.sr, self.silence_s * 1000, self.cache_s * 1000, max_speech_duration_s=15)
594
+
595
+
596
+ def process_audio(self, audio_buffer: np.ndarray):
597
+ audio = np.array([], np.float32)
598
+ for i in range(0, len(audio_buffer), 512):
599
+ chunk = audio_buffer[i:i+512]
600
+ ret = self.vad(chunk)
601
+ if ret:
602
+ audio = np.append(audio, ret['audio'])
603
+ return audio
python/pipelines/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+
2
+ from .base import MetaItem
3
+ from .pipe_vad import VadPipe
python/pipelines/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (247 Bytes). View file
 
python/pipelines/__pycache__/base.cpython-312.pyc ADDED
Binary file (4.04 kB). View file
 
python/pipelines/__pycache__/pipe_vad.cpython-312.pyc ADDED
Binary file (4.14 kB). View file
 
python/pipelines/base.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from dataclasses import dataclass, field
3
+ from multiprocessing import Process, Queue
4
+ from multiprocessing import Event
5
+ from logging import getLogger
6
+
7
+ logger = getLogger(__name__)
8
+
9
+
10
+ @dataclass
11
+ class Segment:
12
+ t0: int
13
+ t1: int
14
+ text: str
15
+
16
+ @dataclass
17
+ class MetaItem:
18
+ segments: list[Segment] = field(default_factory=list)
19
+ source_audio: bytes = b""
20
+ audio: bytes = b''
21
+ transcribe_content: str = ''
22
+ translate_content: str = ''
23
+ source_language: str = 'zh'
24
+ destination_language: str = 'en'
25
+ speech_status: str = 'END' # "END", "START"
26
+
27
+
28
+ class BasePipe(Process):
29
+ def __init__(self, in_queue=None, out_queue=None) -> None:
30
+ super().__init__() # Initialize the Process class
31
+ self._in_queue = in_queue if in_queue else Queue()
32
+ self._out_queue = out_queue if out_queue else Queue()
33
+ self._ready = Event()
34
+
35
+ def set_ready(self):
36
+ self._ready.set()
37
+
38
+ def is_ready(self):
39
+ return self._ready.is_set()
40
+
41
+ def wait(self):
42
+ self._ready.wait()
43
+
44
+ @property
45
+ def output_queue(self):
46
+ return self._out_queue
47
+
48
+ @property
49
+ def input_queue(self):
50
+ return self._in_queue
51
+
52
+ def process(self, in_data: MetaItem) -> MetaItem:
53
+ raise NotImplementedError("Subclasses should implement this method.")
54
+
55
+
56
+ @classmethod
57
+ def init(cls):
58
+ raise NotImplementedError
59
+
60
+ def run(self):
61
+ logger.info(f"start initial {self.__class__.__name__}")
62
+ self.init()
63
+ logger.info(f"finish initial {self.__class__.__name__}")
64
+ self.set_ready()
65
+ while True:
66
+ item = self.input_queue.get()
67
+ if item is None: # Check for termination signal
68
+ break
69
+ out_item = self.process(item)
70
+ if out_item:
71
+ self.output_queue.put(out_item)
python/pipelines/pipe_vad.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from .base import MetaItem, BasePipe
3
+ from ..helpers.vadprocessor import FixedVADIterator
4
+
5
+ import numpy as np
6
+ import logging
7
+
8
+ # import noisereduce as nr
9
+
10
+
11
+ class VadPipe(BasePipe):
12
+ vac = None
13
+ sample_rate = 16000
14
+
15
+ def __init__(self, in_queue=None, out_queue=None) -> None:
16
+ super().__init__(in_queue, out_queue)
17
+ self._offset = 0 # 处理的frame size offset
18
+ self._status = 'END'
19
+
20
+
21
+ def reset(self):
22
+ self._offset = 0
23
+ self._status = 'END'
24
+
25
+ self.vac.reset_states()
26
+
27
+ @classmethod
28
+ def init(cls):
29
+ if cls.vac is None:
30
+ cls.vac = FixedVADIterator(
31
+ threshold=0.6,
32
+ sampling_rate=cls.sample_rate,
33
+ # speech_pad_ms=10
34
+ min_silence_duration_ms = 100,
35
+ # speech_pad_ms = 30,
36
+ )
37
+ cls.vac.reset_states()
38
+
39
+
40
+ # def reduce_noise(self, data):
41
+ # return nr.reduce_noise(y=data, sr=self.sample_rate)
42
+
43
+ def _process_speech_chunk(self, source_audio:np.ndarray):
44
+ speech_dict = self.vac(source_audio, return_seconds=False)
45
+ # print(f"speech_dict : {speech_dict}")
46
+ if speech_dict:
47
+ relative_start_frame = None
48
+ relative_end_frame = None
49
+ start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
50
+ if start_frame:
51
+ relative_start_frame =start_frame - self._offset
52
+ if end_frame:
53
+ relative_end_frame = end_frame - self._offset
54
+ return relative_start_frame, relative_end_frame
55
+
56
+ def process(self, in_data: MetaItem) -> MetaItem:
57
+ if self._offset == 0:
58
+ self.vac.reset_states()
59
+
60
+ # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
61
+ source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
62
+ print(f"source_audio.shape = {source_audio.shape}")
63
+ speech_data = self._process_speech_chunk(source_audio)
64
+
65
+
66
+ if speech_data: # 表示有音频的变化点出现
67
+ rel_start_frame, rel_end_frame = speech_data
68
+
69
+ if rel_start_frame is not None and rel_end_frame is None:
70
+ self._status = "START" # 语音开始
71
+ target_audio = source_audio[max(rel_start_frame-100, 0):]
72
+ logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
73
+ elif rel_start_frame is None and rel_end_frame is not None:
74
+ self._status = "END" # 音频结束
75
+ target_audio = source_audio[:rel_end_frame]
76
+ logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
77
+ else:
78
+ self._status = 'END'
79
+ target_audio = source_audio[max(rel_start_frame-100, 0):rel_end_frame]
80
+ logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
81
+ # logging.debug("❌ No valid speech segment detected, setting status to END")
82
+ else:
83
+ if self._status == 'START':
84
+ target_audio = source_audio
85
+ # logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
86
+ else: # end
87
+ target_audio = np.array([],dtype=np.float32)
88
+ # self._status = 'END'
89
+ # logging.debug("❌ No speech detected, setting status to END")
90
+ print(f"strat: {rel_start_frame} end: {rel_end_frame}")
91
+ self._offset += len(source_audio)
92
+
93
+ in_data.audio = target_audio.tobytes()
94
+ in_data.source_audio = b''
95
+ in_data.speech_status = self._status
96
+ return in_data
python/processing.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ current_dir = os.path.dirname(os.path.abspath(__file__))
4
+ parent_dir = os.path.dirname(current_dir)
5
+ sys.path.append(parent_dir)
6
+ # sys.path.append("/Users/chenxiang/translator/Translator/llama-cpp-python/llama_cpp")
7
+
8
+ from .pipelines import MetaItem, VadPipe
9
+
10
+ class ProcessingPipes:
11
+ def __init__(self) -> None:
12
+
13
+ self._process = []
14
+ # vad
15
+ self._vad_pipe = self._launch_process(VadPipe())
16
+
17
+ def _launch_process(self, process_obj):
18
+ process_obj.daemon = True
19
+ process_obj.start()
20
+ self._process.append(process_obj)
21
+ return process_obj
22
+
23
+ def wait_ready(self):
24
+ for p in self._process:
25
+ p.wait()
26
+
27
+ def voice_detect(self, audio_buffer: bytes) -> MetaItem:
28
+ item = MetaItem(source_audio=audio_buffer)
29
+ self._vad_pipe.input_queue.put(item)
30
+ return self._vad_pipe.output_queue.get()
31
+
32
+
33
+ if __name__ == "__main__":
34
+ import soundfile
35
+ import numpy as np
36
+
37
+ wav_path1 = "/Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3"
38
+ wav_path2 = "/Users/chenxiang/translator/core/whisper_wrapper/bin/zh.wav"
39
+
40
+ tp = ProcessingPipes()
41
+ audio, sr, = soundfile.read(wav_path2)
42
+
43
+ # 确保是单声道
44
+ if len(audio.shape) > 1:
45
+ print("不是单声道")
46
+ audio = audio.mean(axis=1)
47
+
48
+ # 重采样到 16kHz(如果需要)
49
+ if sr != 16000:
50
+ print("采样率不是 16000, 重新采样到 16kHz(如果需要)")
51
+ import resampy
52
+ audio = resampy.resample(audio, sr, 16000)
53
+
54
+ # 转换为 float32
55
+ print(f"original audio data type = {audio.dtype}")
56
+ audio = audio.astype(np.float32)
57
+
58
+ print(f"original audio data size = {audio.shape}")
59
+
60
+ result = tp.voice_detect(audio)
61
+ # print(f"{result.speech_status} {result.segments} {result.segments}")
62
+ print("********** END *************")
reference/.DS_Store ADDED
Binary file (6.15 kB). View file
 
reference/cpp/onnx_wrapper.cpp ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <stdexcept>
2
+ #include <cmath>
3
+ #include <iostream>
4
+
5
+ #include "onnx_wrapper.h"
6
+
7
+ static void get_input_names(Ort::Session* session, std::vector<std::string> &input_names_str,
8
+ std::vector<const char *> &input_names_char) {
9
+ Ort::AllocatorWithDefaultOptions allocator;
10
+ size_t nodes_num = session->GetInputCount();
11
+ input_names_str.resize(nodes_num);
12
+ input_names_char.resize(nodes_num);
13
+ for (size_t i = 0; i != nodes_num; ++i) {
14
+ auto t = session->GetInputNameAllocated(i, allocator);
15
+ input_names_str[i] = t.get();
16
+ input_names_char[i] = input_names_str[i].c_str();
17
+ }
18
+ }
19
+
20
+ static void get_output_names(Ort::Session* session, std::vector<std::string> &output_names_,
21
+ std::vector<const char *> &vad_out_names_) {
22
+ Ort::AllocatorWithDefaultOptions allocator;
23
+ size_t nodes_num = session->GetOutputCount();
24
+ output_names_.resize(nodes_num);
25
+ vad_out_names_.resize(nodes_num);
26
+ for (size_t i = 0; i != nodes_num; ++i) {
27
+ auto t = session->GetOutputNameAllocated(i, allocator);
28
+ output_names_[i] = t.get();
29
+ vad_out_names_[i] = output_names_[i].c_str();
30
+ }
31
+ }
32
+
33
+ OnnxVadWrapper::OnnxVadWrapper(const std::string& model_path, bool force_cpu, int thread_num)
34
+ : sample_rates_{16000}, model_path_(model_path) {
35
+ Ort::SessionOptions session_options;
36
+ session_options.SetIntraOpNumThreads(thread_num);
37
+ session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
38
+ session_options.DisableCpuMemArena();
39
+
40
+ // if (force_cpu && supports_cpu()) {
41
+ // session_options.AppendExecutionProvider_CPU();
42
+ // }
43
+
44
+ // 初始化 ONNX Session
45
+ try {
46
+ env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "OnnxVadWrapper");
47
+ session_ = std::make_unique<Ort::Session>(env_, ORTCHAR(model_path.c_str()), session_options);
48
+ std::cout << "Successfully load model from " << model_path << std::endl;
49
+ } catch (std::exception const &e) {
50
+ std::cout << "Error when load vad onnx model: " << e.what() << std::endl;
51
+ exit(-1);
52
+ }
53
+
54
+ get_input_names(session_.get(), input_names_, vad_in_names_);
55
+ get_output_names(session_.get(), output_names_, vad_out_names_);
56
+
57
+ reset_states();
58
+ }
59
+
60
+ OnnxVadWrapper::~OnnxVadWrapper() = default;
61
+
62
+ void OnnxVadWrapper::reset_states(int batch_size) {
63
+ int total_size = 2 * batch_size * 128;
64
+ state_.resize(total_size); /////
65
+ state_.assign(state_.size(), 0.0f);
66
+ context_.clear();
67
+ last_sr_ = 0;
68
+ last_batch_size_ = 0;
69
+ }
70
+
71
+ std::pair<std::vector<float>, std::vector<float>> OnnxVadWrapper::operator()(const std::vector<float>& x, int sr) {
72
+ validate_input(x, sr);
73
+
74
+ int num_samples = (sr == 16000) ? 512 : 256;
75
+ int context_size = (sr == 16000) ? 64 : 32;
76
+
77
+ int batch_size = 1; // 假设单通道输入
78
+ if (x.size() != num_samples) {
79
+ throw std::invalid_argument("Input must be exactly " + std::to_string(num_samples) + " samples.");
80
+ }
81
+
82
+ if (!last_batch_size_) reset_states(batch_size);
83
+ if (last_sr_ != 0 && last_sr_ != sr) reset_states(batch_size);
84
+ if (last_batch_size_ != 0 && last_batch_size_ != batch_size) reset_states(batch_size);
85
+
86
+ if (context_.empty()) {
87
+ context_.resize(batch_size * context_size, 0.0f);
88
+ }
89
+
90
+ // 合并 context 和 input
91
+ std::vector<float> x_with_context(context_.begin(), context_.end());
92
+ x_with_context.insert(x_with_context.end(), x.begin(), x.end());
93
+
94
+ // Prepare inputs
95
+ std::vector<Ort::Value> inputs;
96
+ auto mem_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
97
+ std::array<int64_t, 3> input_shape = {1, 1, static_cast<int64_t>(x_with_context.size())};
98
+ Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
99
+ mem_info, const_cast<float*>(x_with_context.data()), x_with_context.size(),
100
+ input_shape.data(), input_shape.size());
101
+ inputs.emplace_back(std::move(input_tensor));
102
+
103
+ std::array<int64_t, 3> state_shape = {2, batch_size, 128};
104
+ Ort::Value state_tensor = Ort::Value::CreateTensor<float>(
105
+ mem_info, state_.data(), state_.size(), state_shape.data(), state_shape.size());
106
+ inputs.emplace_back(std::move(state_tensor));
107
+
108
+ std::array<int64_t, 1> sr_shape = {1};
109
+ float sr_f = static_cast<float>(sr);
110
+ Ort::Value sr_tensor = Ort::Value::CreateTensor<float>(
111
+ mem_info, &sr_f, 1, sr_shape.data(), sr_shape.size());
112
+ inputs.emplace_back(std::move(sr_tensor));
113
+
114
+ // const char* input_names[] = {"input", "state", "sr"};
115
+ // std::vector<Ort::Value> inputs = {std::move(input_tensor), std::move(state_tensor), std::move(sr_tensor)};
116
+
117
+ // Run inference
118
+ std::vector<Ort::Value> outputs;
119
+ try {
120
+ outputs = session_->Run(
121
+ Ort::RunOptions{nullptr}, vad_in_names_.data(), inputs.data(),
122
+ inputs.size(), vad_out_names_.data(), vad_out_names_.size());
123
+ } catch (std::exception const &e) {
124
+ std::cout << "Error when run vad onnx forword: " << e.what() << std::endl;
125
+ exit(-1);
126
+ }
127
+
128
+ // Get output
129
+ float* out_data = outputs[0].GetTensorMutableData<float>();
130
+ size_t out_len = outputs[0].GetTensorTypeAndShapeInfo().GetElementCount();
131
+ std::vector<float> out(out_data, out_data + out_len);
132
+
133
+ // Update state and context
134
+ float* new_state = outputs[1].GetTensorMutableData<float>();
135
+ std::copy(new_state, new_state + state_.size(), state_.begin());
136
+
137
+ context_.assign(x_with_context.end() - context_size, x_with_context.end());
138
+
139
+ last_sr_ = sr;
140
+ last_batch_size_ = batch_size;
141
+
142
+ return {out, {}};
143
+ }
144
+
145
+ std::vector<float> OnnxVadWrapper::audio_forward(const std::vector<float>& audio, int sr) {
146
+ std::vector<float> x = audio;
147
+ reset_states();
148
+
149
+ int num_samples = (sr == 16000) ? 512 : 256;
150
+ std::vector<float> result;
151
+
152
+ // Pad to multiple of num_samples
153
+ int pad_num = (num_samples - (x.size() % num_samples)) % num_samples;
154
+ x.resize(x.size() + pad_num, 0.0f);
155
+
156
+ for (size_t i = 0; i < x.size(); i += num_samples) {
157
+ std::vector<float> chunk(x.begin() + i, x.begin() + i + num_samples);
158
+ auto [out, _] = (*this)(chunk, sr);
159
+ result.insert(result.end(), out.begin(), out.end());
160
+ }
161
+
162
+ return result;
163
+ }
164
+
165
+ bool OnnxVadWrapper::supports_cpu() {
166
+ auto providers = Ort::GetAvailableProviders();
167
+
168
+ for (const std::string& provider : providers) {
169
+ if (provider == "CPUExecutionProvider") {
170
+ return true;
171
+ }
172
+ }
173
+
174
+ return false;
175
+ }
176
+
177
+ void OnnxVadWrapper::validate_input(const std::vector<float>& x, int sr) {
178
+ if (sr != 16000 && sr % 16000 != 0) {
179
+ throw std::invalid_argument("Unsupported sampling rate: " + std::to_string(sr));
180
+ }
181
+
182
+ if ((sr / x.size()) > 31.25) {
183
+ throw std::invalid_argument("Input audio chunk is too short");
184
+ }
185
+ }
reference/cpp/onnx_wrapper.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <string>
3
+
4
+ #if defined(__APPLE__)
5
+ #include <onnxruntime/onnxruntime_cxx_api.h>
6
+ #else
7
+ #include "onnxruntime_run_options_config_keys.h"
8
+ #include "onnxruntime_cxx_api.h"
9
+ #endif
10
+
11
+ #ifdef _WIN32
12
+
13
+ #define ORTSTRING(str) StrToWstr(str)
14
+ #define ORTCHAR(str) StrToWstr(str).c_str()
15
+
16
+ inline std::wstring String2wstring(const std::string& str, const std::string& locale)
17
+ {
18
+ typedef std::codecvt_byname<wchar_t, char, std::mbstate_t> F;
19
+ std::wstring_convert<F> strCnv(new F(locale));
20
+ return strCnv.from_bytes(str);
21
+ }
22
+
23
+ inline std::wstring StrToWstr(std::string str) {
24
+ if (str.length() == 0)
25
+ return L"";
26
+ return String2wstring(str, "zh-CN");
27
+
28
+ }
29
+
30
+ #else
31
+
32
+ #define ORTSTRING(str) str
33
+ #define ORTCHAR(str) str
34
+
35
+ #endif
36
+
37
+ class OnnxVadWrapper {
38
+ public:
39
+ explicit OnnxVadWrapper(const std::string& model_path, bool force_cpu = false, int thread_num = 1);
40
+ ~OnnxVadWrapper();
41
+
42
+ // 重载 operator(),使得对象可以像函数一样调用
43
+ std::pair<std::vector<float>, std::vector<float>> operator()(const std::vector<float>& x, int sr);
44
+
45
+ // 批量处理整个音频
46
+ std::vector<float> audio_forward(const std::vector<float>& audio, int sr);
47
+
48
+ // 重置 RNN 状态
49
+ void reset_states(int batch_size = 1);
50
+
51
+ private:
52
+ Ort::Env env_;
53
+
54
+ std::unique_ptr<Ort::Session> session_;
55
+ std::vector<std::string> input_names_, output_names_;
56
+ std::vector<const char *> vad_in_names_;
57
+ std::vector<const char *> vad_out_names_;
58
+
59
+ std::vector<int> sample_rates_;
60
+ std::string model_path_;
61
+
62
+ std::vector<float> state_; // RNN State
63
+ std::vector<float> context_; // Context buffer
64
+ int last_sr_ = 0;
65
+ int last_batch_size_ = 0;
66
+
67
+ void read_model();
68
+ bool supports_cpu();
69
+ void validate_input(const std::vector<float>& x, int sr);
70
+ };
reference/cpp/vad_iterator_onnx.cpp ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cmath>
2
+ #include <stdexcept>
3
+ #include <unordered_map>
4
+ #include <vector>
5
+
6
+ #include "vad_iterator_onnx.h"
7
+
8
+ VadIteratorOnnx::VadIteratorOnnx(float threshold,
9
+ int sampling_rate,
10
+ int min_silence_duration_ms,
11
+ float max_speech_duration_s,
12
+ int speech_pad_ms)
13
+ : threshold_(threshold),
14
+ sampling_rate_(sampling_rate),
15
+ min_silence_samples_(sampling_rate_ * min_silence_duration_ms / 1000.0),
16
+ speech_pad_samples_(sampling_rate_ * speech_pad_ms / 1000.0),
17
+ triggered_(false),
18
+ temp_end_(0),
19
+ current_sample_(0),
20
+ start_(0) {
21
+
22
+ if (sampling_rate_ != 8000 && sampling_rate_ != 16000) {
23
+ throw std::invalid_argument("Only support sampling rates of 8000 or 16000");
24
+ }
25
+
26
+ model_ = std::make_unique<OnnxVadWrapper>("path/to/vad.onnx", true); // 可配置路径
27
+ }
28
+
29
+ VadIteratorOnnx::~VadIteratorOnnx() = default;
30
+
31
+ void VadIteratorOnnx::reset_states() {
32
+ model_->reset_states();
33
+ triggered_ = false;
34
+ temp_end_ = 0;
35
+ current_sample_ = 0;
36
+ start_ = 0;
37
+ buffer_.clear();
38
+ }
39
+
40
+ std::unordered_map<std::string, double>
41
+ VadIteratorOnnx::operator()(const std::vector<float>& x, bool return_seconds) {
42
+ std::unordered_map<std::string, double> result;
43
+
44
+ int window_size_samples = (sampling_rate_ == 16000) ? 512 : 256;
45
+
46
+ // 将新音频追加到缓存中
47
+ buffer_.insert(buffer_.end(), x.begin(), x.end());
48
+ while (buffer_.size() > 0) {
49
+ std::unordered_map<std::string, double> tmp;
50
+ std::vector<float> chunk(buffer_.begin(), buffer_.begin() + std::min(static_cast<int>(x.size()), window_size_samples));
51
+ // 补零到固定长度
52
+ if (chunk.size() < static_cast<size_t>(window_size_samples)) {
53
+ chunk.resize(window_size_samples, 0.0f);
54
+ }
55
+
56
+ current_sample_ += window_size_samples;
57
+
58
+ // 推理得到语音概率
59
+ auto [output, _] = (*model_)(chunk, sampling_rate_);
60
+ float speech_prob = output[0];
61
+
62
+ if (speech_prob >= threshold_ && temp_end_ > 0) {
63
+ temp_end_ = 0;
64
+ }
65
+
66
+ if (speech_prob >= threshold_ && !triggered_) {
67
+ triggered_ = true;
68
+ start_ = std::max(0.0, current_sample_ - speech_pad_samples_ - window_size_samples);
69
+ tmp["start"] = return_seconds ? start_ / sampling_rate_ : start_;
70
+ }
71
+
72
+ if (speech_prob < (threshold_ - 0.15) && triggered_) {
73
+ if (temp_end_ == 0) {
74
+ temp_end_ = current_sample_;
75
+ }
76
+
77
+ if (current_sample_ - temp_end_ >= min_silence_samples_) {
78
+ double speech_end = temp_end_ + speech_pad_samples_ - window_size_samples;
79
+ tmp["end"] = return_seconds ? speech_end / sampling_rate_ : speech_end;
80
+ temp_end_ = 0;
81
+ triggered_ = false;
82
+ }
83
+ }
84
+
85
+ // 移除已处理的数据
86
+ std::vector<float>(buffer_.begin() + window_size_samples, buffer_.end()).swap(buffer_);
87
+
88
+ if (result.empty()) {
89
+ result = tmp;
90
+ } else if (!tmp.empty()) {
91
+ // 如果当前结果有 'end',更新最终 end
92
+ if (tmp.find("end") != tmp.end()) {
93
+ result["end"] = tmp["end"];
94
+ }
95
+
96
+ // 如果有新的 start,但前一个有 end,则合并成连续语音段
97
+ if (tmp.find("start") != tmp.end() && result.find("end") != result.end()) {
98
+ result.erase("end");
99
+ }
100
+ }
101
+ }
102
+
103
+ return result;
104
+ }
reference/cpp/vad_iterator_onnx.h ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <vector>
4
+ #include <string>
5
+ #include <unordered_map>
6
+
7
+ #include "onnx_wrapper.h"
8
+
9
+ class OnnxVadWrapper; // 前向声明
10
+
11
+ class VadIteratorOnnx {
12
+ public:
13
+ explicit VadIteratorOnnx(float threshold = 0.5,
14
+ int sampling_rate = 16000,
15
+ int min_silence_duration_ms = 100,
16
+ float max_speech_duration_s = INFINITY,
17
+ int speech_pad_ms = 30);
18
+
19
+ virtual ~VadIteratorOnnx();
20
+
21
+ // 重置内部状态
22
+ virtual void reset_states();
23
+
24
+ // 输入音频块,返回语音事件(start/end)
25
+ virtual std::unordered_map<std::string, double> operator()(const std::vector<float>& x, bool return_seconds = false);
26
+
27
+ private:
28
+ std::unique_ptr<OnnxVadWrapper> model_;
29
+ std::vector<float> buffer_; // 缓冲区用于保存未处理完的音频
30
+ float threshold_;
31
+ int sampling_rate_;
32
+ double min_silence_samples_;
33
+ double speech_pad_samples_;
34
+ bool triggered_;
35
+ double temp_end_;
36
+ double current_sample_;
37
+ double start_;
38
+ };
reference/python/__pycache__/audio_utils.cpython-312.pyc ADDED
Binary file (2.21 kB). View file
 
reference/python/audio_utils.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import soundfile as sf
3
+ import time
4
+
5
+ def audio_stream_generator(audio_file_path, chunk_size=4096, simulate_realtime=True):
6
+ """
7
+ 音频流生成器,从音频文件中读取数据并以流的方式输出
8
+
9
+ 参数:
10
+ audio_file_path: 音频文件路径
11
+ chunk_size: 每个数据块的大小(采样点数)
12
+ simulate_realtime: 是否模拟实时流处理的速度
13
+
14
+ 生成:
15
+ numpy.ndarray: 每次生成一个chunk_size大小的np.float32数据块
16
+ """
17
+ # 加载音频文件
18
+ audio_data, sample_rate = sf.read(audio_file_path)
19
+
20
+ # 确保音频数据是float32类型
21
+ if audio_data.dtype != np.float32:
22
+ audio_data = audio_data.astype(np.float32)
23
+
24
+ # 如果是立体声,转换为单声道
25
+ if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
26
+ audio_data = audio_data.mean(axis=1)
27
+
28
+ print(f"已加载音频文件: {audio_file_path}")
29
+ print(f"采样率: {sample_rate} Hz")
30
+ print(f"音频长度: {len(audio_data)/sample_rate:.2f} 秒")
31
+
32
+ # 计算每个块的时长(秒)
33
+ chunk_duration = chunk_size / sample_rate if simulate_realtime else 0
34
+
35
+ # 按块生成数据
36
+ audio_len = len(audio_data)
37
+ for pos in range(0, audio_len, chunk_size):
38
+ # 获取当前块
39
+ end_pos = min(pos + chunk_size, audio_len)
40
+ chunk = audio_data[pos:end_pos]
41
+
42
+ # 如果块大小不足,用0填充
43
+ if len(chunk) < chunk_size:
44
+ padded_chunk = np.zeros(chunk_size, dtype=np.float32)
45
+ padded_chunk[:len(chunk)] = chunk
46
+ chunk = padded_chunk
47
+
48
+ # 模拟实时处理的延迟
49
+ if simulate_realtime:
50
+ time.sleep(chunk_duration)
51
+
52
+ yield chunk
53
+
54
+ print("音频流处理完成")
reference/python/test_vad.ipynb ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from audio_utils import audio_stream_generator\n",
10
+ "import IPython.display as ipd\n",
11
+ "import sys\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 3,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "sys.path.append(\"/Users/chenxiang/translator/core/vad_cpp/\")\n",
21
+ "from python.helpers.vadprocessor import FixedVADIterator\n"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 4,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "vac = FixedVADIterator(\n",
31
+ " threshold=0.5,\n",
32
+ " sampling_rate=16000,\n",
33
+ " # speech_pad_ms=10\n",
34
+ " min_silence_duration_ms = 100,\n",
35
+ " # speech_pad_ms = 30,\n",
36
+ " max_speech_duration_s=5.0,\n",
37
+ " )\n"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 5,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "SAMPLE_FILE_PATH = \"/Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3\"\n",
47
+ "SAMPLING_RATE = 16000\n",
48
+ "\n",
49
+ "chunks_generator = audio_stream_generator(SAMPLE_FILE_PATH, chunk_size=4096)\n",
50
+ "vac.reset_states()"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "metadata": {},
57
+ "outputs": [
58
+ {
59
+ "name": "stdout",
60
+ "output_type": "stream",
61
+ "text": [
62
+ "已加载音频文件: /Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3\n",
63
+ "采样率: 44100 Hz\n",
64
+ "音频长度: 64.00 秒\n",
65
+ "None\n"
66
+ ]
67
+ }
68
+ ],
69
+ "source": [
70
+ "# speech_dict = vac(next(chunks_generator), return_seconds=False)\n",
71
+ "# print(speech_dict)\n"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 6,
77
+ "metadata": {},
78
+ "outputs": [
79
+ {
80
+ "name": "stdout",
81
+ "output_type": "stream",
82
+ "text": [
83
+ "已加载音频文件: /Users/chenxiang/translator/core/vad_cpp/bin/Chinese-liyongle-part1.mp3\n",
84
+ "采样率: 44100 Hz\n",
85
+ "音频长度: 64.00 秒\n",
86
+ "0 ->>> None\n",
87
+ "1 ->>> None\n",
88
+ "2 ->>> {'start': 10272}\n",
89
+ "3 ->>> None\n",
90
+ "4 ->>> None\n",
91
+ "5 ->>> None\n",
92
+ "6 ->>> None\n",
93
+ "7 ->>> None\n",
94
+ "8 ->>> None\n",
95
+ "9 ->>> None\n",
96
+ "10 ->>> None\n",
97
+ "11 ->>> None\n",
98
+ "12 ->>> None\n",
99
+ "13 ->>> {'end': 55264}\n",
100
+ "14 ->>> None\n",
101
+ "15 ->>> {'start': 60960}\n",
102
+ "16 ->>> None\n",
103
+ "17 ->>> None\n",
104
+ "18 ->>> None\n",
105
+ "19 ->>> None\n",
106
+ "20 ->>> {'end': 82912}\n",
107
+ "21 ->>> {'start': 89120}\n",
108
+ "22 ->>> None\n",
109
+ "23 ->>> None\n",
110
+ "24 ->>> None\n",
111
+ "25 ->>> None\n",
112
+ "26 ->>> None\n",
113
+ "27 ->>> None\n",
114
+ "28 ->>> None\n",
115
+ "29 ->>> None\n",
116
+ "30 ->>> None\n",
117
+ "31 ->>> None\n",
118
+ "32 ->>> None\n",
119
+ "33 ->>> None\n",
120
+ "34 ->>> None\n",
121
+ "35 ->>> None\n",
122
+ "36 ->>> None\n",
123
+ "37 ->>> None\n",
124
+ "38 ->>> None\n",
125
+ "39 ->>> None\n",
126
+ "40 ->>> None\n",
127
+ "41 ->>> None\n",
128
+ "42 ->>> None\n",
129
+ "43 ->>> None\n",
130
+ "44 ->>> None\n",
131
+ "45 ->>> None\n",
132
+ "46 ->>> None\n",
133
+ "47 ->>> None\n",
134
+ "48 ->>> None\n",
135
+ "49 ->>> None\n",
136
+ "50 ->>> {'end': 206816}\n",
137
+ "51 ->>> None\n",
138
+ "52 ->>> None\n",
139
+ "53 ->>> {'start': 219680}\n",
140
+ "54 ->>> None\n",
141
+ "55 ->>> None\n",
142
+ "56 ->>> None\n",
143
+ "57 ->>> None\n",
144
+ "58 ->>> None\n",
145
+ "59 ->>> None\n",
146
+ "60 ->>> None\n",
147
+ "61 ->>> None\n",
148
+ "62 ->>> None\n",
149
+ "63 ->>> None\n",
150
+ "64 ->>> None\n",
151
+ "65 ->>> None\n",
152
+ "66 ->>> None\n",
153
+ "67 ->>> None\n",
154
+ "68 ->>> None\n",
155
+ "69 ->>> None\n",
156
+ "70 ->>> None\n",
157
+ "71 ->>> None\n",
158
+ "72 ->>> None\n",
159
+ "73 ->>> None\n",
160
+ "74 ->>> None\n",
161
+ "75 ->>> None\n",
162
+ "76 ->>> None\n",
163
+ "77 ->>> None\n",
164
+ "78 ->>> None\n",
165
+ "79 ->>> None\n",
166
+ "80 ->>> None\n",
167
+ "81 ->>> None\n",
168
+ "82 ->>> None\n",
169
+ "83 ->>> None\n",
170
+ "84 ->>> None\n",
171
+ "85 ->>> None\n",
172
+ "86 ->>> None\n",
173
+ "87 ->>> None\n",
174
+ "88 ->>> None\n",
175
+ "89 ->>> None\n",
176
+ "90 ->>> None\n",
177
+ "91 ->>> None\n",
178
+ "92 ->>> None\n",
179
+ "93 ->>> None\n",
180
+ "94 ->>> None\n",
181
+ "95 ->>> None\n",
182
+ "96 ->>> {'end': 394720}\n",
183
+ "97 ->>> None\n",
184
+ "98 ->>> None\n",
185
+ "99 ->>> None\n",
186
+ "100 ->>> {'start': 410144}\n",
187
+ "101 ->>> None\n",
188
+ "102 ->>> None\n",
189
+ "103 ->>> None\n",
190
+ "104 ->>> None\n",
191
+ "105 ->>> None\n",
192
+ "106 ->>> None\n",
193
+ "107 ->>> None\n",
194
+ "108 ->>> None\n",
195
+ "109 ->>> None\n",
196
+ "110 ->>> None\n",
197
+ "111 ->>> None\n",
198
+ "112 ->>> None\n",
199
+ "113 ->>> None\n",
200
+ "114 ->>> None\n",
201
+ "115 ->>> None\n",
202
+ "116 ->>> None\n",
203
+ "117 ->>> None\n",
204
+ "118 ->>> None\n",
205
+ "119 ->>> None\n",
206
+ "120 ->>> None\n",
207
+ "121 ->>> None\n",
208
+ "122 ->>> {'end': 500192}\n",
209
+ "123 ->>> {'start': 503328}\n",
210
+ "124 ->>> {'end': 509920}\n",
211
+ "125 ->>> None\n",
212
+ "126 ->>> {'start': 519200}\n",
213
+ "127 ->>> None\n",
214
+ "128 ->>> None\n",
215
+ "129 ->>> None\n",
216
+ "130 ->>> None\n",
217
+ "131 ->>> None\n",
218
+ "132 ->>> None\n",
219
+ "133 ->>> None\n",
220
+ "134 ->>> None\n",
221
+ "135 ->>> {'end': 554976}\n",
222
+ "136 ->>> {'start': 556576}\n",
223
+ "137 ->>> None\n"
224
+ ]
225
+ },
226
+ {
227
+ "ename": "KeyboardInterrupt",
228
+ "evalue": "",
229
+ "output_type": "error",
230
+ "traceback": [
231
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
232
+ "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
233
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m i = \u001b[32m0\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunks_generator\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 3\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# vad_iterator.reset_states()\u001b[39;49;00m\n\u001b[32m 4\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# audio_buffer = np.append(audio_buffer, chunk)\u001b[39;49;00m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mspeech_dict\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mvac\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_seconds\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mprint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mi\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m ->>> \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mspeech_dict\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
234
+ "\u001b[36mFile \u001b[39m\u001b[32m/Users/chenxiang/translator/core/vad_cpp/reference/归档/audio_utils.py:50\u001b[39m, in \u001b[36maudio_stream_generator\u001b[39m\u001b[34m(audio_file_path, chunk_size, simulate_realtime)\u001b[39m\n\u001b[32m 48\u001b[39m \u001b[38;5;66;03m# 模拟实时处理的延迟\u001b[39;00m\n\u001b[32m 49\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m simulate_realtime:\n\u001b[32m---> \u001b[39m\u001b[32m50\u001b[39m \u001b[43mtime\u001b[49m\u001b[43m.\u001b[49m\u001b[43msleep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk_duration\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 52\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m chunk\n\u001b[32m 54\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m音频流处理完成\u001b[39m\u001b[33m\"\u001b[39m)\n",
235
+ "\u001b[31mKeyboardInterrupt\u001b[39m: "
236
+ ]
237
+ }
238
+ ],
239
+ "source": [
240
+ "i = 0\n",
241
+ "for chunk in chunks_generator:\n",
242
+ " # vad_iterator.reset_states()\n",
243
+ " # audio_buffer = np.append(audio_buffer, chunk)\n",
244
+ " \n",
245
+ " speech_dict = vac(chunk, return_seconds=False)\n",
246
+ " print(f\"{i} ->>> {speech_dict}\")\n",
247
+ " # if speech_dict:\n",
248
+ " # print(speech_dict)\n",
249
+ " i+=1"
250
+ ]
251
+ },
252
+ {
253
+ "cell_type": "code",
254
+ "execution_count": null,
255
+ "metadata": {},
256
+ "outputs": [],
257
+ "source": [
258
+ "audio_data, sample_rate = sf.read(audio_file_path)\n",
259
+ "\n",
260
+ "# 确保音频数据是float32类型\n",
261
+ "if audio_data.dtype != np.float32:\n",
262
+ " audio_data = audio_data.astype(np.float32)\n",
263
+ "\n",
264
+ "# 如果是立体声,转换为单声道\n",
265
+ "if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:\n",
266
+ " audio_data = audio_data.mean(axis=1)\n",
267
+ " \n",
268
+ "print(f\"已加载音频文件: {audio_file_path}\")\n",
269
+ "print(f\"采样率: {sample_rate} Hz\")\n",
270
+ "print(f\"音频长度: {len(audio_data)/sample_rate:.2f} 秒\")"
271
+ ]
272
+ }
273
+ ],
274
+ "metadata": {
275
+ "kernelspec": {
276
+ "display_name": "base",
277
+ "language": "python",
278
+ "name": "python3"
279
+ },
280
+ "language_info": {
281
+ "codemirror_mode": {
282
+ "name": "ipython",
283
+ "version": 3
284
+ },
285
+ "file_extension": ".py",
286
+ "mimetype": "text/x-python",
287
+ "name": "python",
288
+ "nbconvert_exporter": "python",
289
+ "pygments_lexer": "ipython3",
290
+ "version": "3.12.2"
291
+ }
292
+ },
293
+ "nbformat": 4,
294
+ "nbformat_minor": 2
295
+ }
silero_vad_onnx/CMakeLists.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cmake_minimum_required(VERSION 3.16)
2
+ project(VadOnnx)
3
+
4
+ set(CMAKE_CXX_STANDARD 17)
5
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
6
+ set(CMAKE_CXX_EXTENSIONS OFF)
7
+
8
+ # 添加 ONNX Runtime include 路径
9
+ include_directories(${ONNXRUNTIME_DIR}/include)
10
+
11
+ # 添加项目头文件目录
12
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
13
+
14
+ add_library(silero_vad_onnx SHARED ${CMAKE_CURRENT_SOURCE_DIR}/vad_iterator.cpp
15
+ ${CMAKE_CURRENT_SOURCE_DIR}/time_stamp.cpp)
16
+
17
+ # 设置库输出名称(跨平台兼容)
18
+ # set_target_properties(silero_vad_onnx PROPERTIES
19
+ # PREFIX ""
20
+ # SUFFIX ".so"
21
+ # LIBRARY_OUTPUT_NAME_DEBUG "silero_vad_onnx"
22
+ # LIBRARY_OUTPUT_NAME_RELEASE "silero_vad_onnx"
23
+ # )
24
+
25
+ # 链接 ONNX Runtime 库
26
+ if(APPLE)
27
+ # macOS 上链接 dylib
28
+ target_link_libraries(silero_vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.dylib)
29
+ elseif(UNIX)
30
+ # Linux 上链接 so
31
+ target_link_libraries(silero_vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.so)
32
+ elseif(WIN32)
33
+ # Windows 上链接 dll + lib
34
+ target_link_libraries(silero_vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/onnxruntime.lib)
35
+ set_target_properties(silero_vad_onnx PROPERTIES SUFFIX ".dll")
36
+ else()
37
+ message(WARNING "Unknown platform, no ONNX Runtime linking applied.")
38
+ endif()
39
+
silero_vad_onnx/time_stamp.cpp ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "time_stamp.h"
2
+ #include <cstdio>
3
+ #include <cstdarg>
4
+ #include <memory>
5
+ #include <string>
6
+
7
+ timestamp_t::timestamp_t(int s, int e)
8
+ : start(s), end(e) {}
9
+
10
+ timestamp_t& timestamp_t::operator=(const timestamp_t& a) {
11
+ if (this != &a) {
12
+ start = a.start;
13
+ end = a.end;
14
+ }
15
+ return *this;
16
+ }
17
+
18
+ bool timestamp_t::operator==(const timestamp_t& a) const {
19
+ return (start == a.start && end == a.end);
20
+ }
21
+
22
+ std::string timestamp_t::c_str() const {
23
+ return format("{start:%08d, end:%08d}", start, end);
24
+ }
25
+
26
+ std::string timestamp_t::format(const char* fmt, ...) const {
27
+ char buf[256];
28
+ va_list args;
29
+ va_start(args, fmt);
30
+ const auto r = std::vsnprintf(buf, sizeof(buf), fmt, args);
31
+ va_end(args);
32
+
33
+ if (r < 0)
34
+ return {};
35
+
36
+ const size_t len = r;
37
+ if (len < sizeof(buf))
38
+ return std::string(buf, len);
39
+
40
+ #if __cplusplus >= 201703L
41
+ std::string s(len + 1, '\0');
42
+ va_start(args, fmt);
43
+ std::vsnprintf(s.data(), len + 1, fmt, args);
44
+ va_end(args);
45
+ return s;
46
+ #else
47
+ std::unique_ptr<char[]> vbuf(new char[len + 1]);
48
+ va_start(args, fmt);
49
+ std::vsnprintf(vbuf.get(), len + 1, fmt, args);
50
+ va_end(args);
51
+ return std::string(vbuf.get(), len);
52
+ #endif
53
+ }
silero_vad_onnx/time_stamp.h ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef TIME_STAMP_H
2
+ #define TIME_STAMP_H
3
+
4
+ #include <string>
5
+
6
+ // timestamp_t class: stores the start and end (in samples) of a speech segment.
7
+ class timestamp_t {
8
+ public:
9
+ int start;
10
+ int end;
11
+
12
+ timestamp_t(int start = -1, int end = -1);
13
+
14
+ timestamp_t& operator=(const timestamp_t& a);
15
+
16
+ bool operator==(const timestamp_t& a) const;
17
+
18
+ // Returns a formatted string of the timestamp.
19
+ std::string c_str() const;
20
+
21
+ private:
22
+ // Helper function for formatting.
23
+ std::string format(const char* fmt, ...) const;
24
+ };
25
+
26
+ #endif // TIME_STAMP_H
silero_vad_onnx/vad_iterator.cpp ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "vad_iterator.h"
2
+ #include <cmath>
3
+ #include <cstdio>
4
+ #include <cstring>
5
+ #include <memory>
6
+
7
+
8
+ void VadIterator::init_onnx_model(const std::string& model_path) {
9
+ init_engine_threads(1, 1);
10
+ session = std::make_shared<Ort::Session>(env, model_path.c_str(), session_options);
11
+ }
12
+
13
+ void VadIterator::init_engine_threads(int inter_threads, int intra_threads) {
14
+ session_options.SetIntraOpNumThreads(intra_threads);
15
+ session_options.SetInterOpNumThreads(inter_threads);
16
+ session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
17
+ }
18
+
19
+ void VadIterator::reset_states() {
20
+ std::memset(_state.data(), 0, _state.size() * sizeof(float));
21
+ triggered = false;
22
+ temp_end = 0;
23
+ current_sample = 0;
24
+ prev_end = next_start = 0;
25
+ speeches.clear();
26
+ current_speech = timestamp_t();
27
+ std::fill(_context.begin(), _context.end(), 0.0f);
28
+ }
29
+
30
+ void VadIterator::predict(const std::vector<float>& data_chunk) {
31
+ std::vector<float> new_data(effective_window_size, 0.0f);
32
+ std::copy(_context.begin(), _context.end(), new_data.begin());
33
+ std::copy(data_chunk.begin(), data_chunk.end(), new_data.begin() + context_samples);
34
+ input = new_data;
35
+
36
+ Ort::Value input_ort = Ort::Value::CreateTensor<float>(
37
+ memory_info, input.data(), input.size(), input_node_dims, 2);
38
+ Ort::Value state_ort = Ort::Value::CreateTensor<float>(
39
+ memory_info, _state.data(), _state.size(), state_node_dims, 3);
40
+ Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
41
+ memory_info, sr.data(), sr.size(), sr_node_dims, 1);
42
+
43
+ ort_inputs.clear();
44
+ ort_inputs.emplace_back(std::move(input_ort));
45
+ ort_inputs.emplace_back(std::move(state_ort));
46
+ ort_inputs.emplace_back(std::move(sr_ort));
47
+
48
+ ort_outputs = session->Run(
49
+ Ort::RunOptions{nullptr},
50
+ input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
51
+ output_node_names.data(), output_node_names.size());
52
+
53
+ float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
54
+ float* stateN = ort_outputs[1].GetTensorMutableData<float>();
55
+ std::memcpy(_state.data(), stateN, size_state * sizeof(float));
56
+
57
+ current_sample += static_cast<unsigned int>(window_size_samples);
58
+
59
+ if (speech_prob >= threshold) {
60
+ if (temp_end != 0) {
61
+ temp_end = 0;
62
+ if (next_start < prev_end)
63
+ next_start = current_sample - window_size_samples;
64
+ }
65
+ if (!triggered) {
66
+ triggered = true;
67
+ current_speech.start = current_sample - window_size_samples;
68
+ }
69
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
70
+ return;
71
+ }
72
+
73
+ if (triggered && ((current_sample - current_speech.start) > max_speech_samples)) {
74
+ if (prev_end > 0) {
75
+ current_speech.end = prev_end;
76
+ speeches.push_back(current_speech);
77
+ current_speech = timestamp_t();
78
+ if (next_start < prev_end)
79
+ triggered = false;
80
+ else
81
+ current_speech.start = next_start;
82
+ prev_end = 0;
83
+ next_start = 0;
84
+ temp_end = 0;
85
+ } else {
86
+ current_speech.end = current_sample;
87
+ speeches.push_back(current_speech);
88
+ current_speech = timestamp_t();
89
+ prev_end = 0;
90
+ next_start = 0;
91
+ temp_end = 0;
92
+ triggered = false;
93
+ }
94
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
95
+ return;
96
+ }
97
+
98
+ if ((speech_prob >= (threshold - 0.15)) && (speech_prob < threshold)) {
99
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
100
+ return;
101
+ }
102
+
103
+ if (speech_prob < (threshold - 0.15)) {
104
+ if (triggered) {
105
+ if (temp_end == 0)
106
+ temp_end = current_sample;
107
+ if (current_sample - temp_end > min_silence_samples_at_max_speech)
108
+ prev_end = temp_end;
109
+ if ((current_sample - temp_end) >= min_silence_samples) {
110
+ current_speech.end = temp_end;
111
+ if (current_speech.end - current_speech.start > min_speech_samples) {
112
+ speeches.push_back(current_speech);
113
+ current_speech = timestamp_t();
114
+ prev_end = 0;
115
+ next_start = 0;
116
+ temp_end = 0;
117
+ triggered = false;
118
+ }
119
+ }
120
+ }
121
+ std::copy(new_data.end() - context_samples, new_data.end(), _context.begin());
122
+ return;
123
+ }
124
+ }
125
+
126
+ void VadIterator::process(const std::vector<float>& input_wav) {
127
+ reset_states();
128
+ audio_length_samples = static_cast<int>(input_wav.size());
129
+
130
+ for (size_t j = 0; j < static_cast<size_t>(audio_length_samples); j += static_cast<size_t>(window_size_samples)) {
131
+ if (j + static_cast<size_t>(window_size_samples) > static_cast<size_t>(audio_length_samples))
132
+ break;
133
+ std::vector<float> chunk(&input_wav[j], &input_wav[j] + window_size_samples);
134
+ predict(chunk);
135
+ }
136
+
137
+ if (current_speech.start >= 0) {
138
+ current_speech.end = audio_length_samples;
139
+ speeches.push_back(current_speech);
140
+ current_speech = timestamp_t();
141
+ prev_end = 0;
142
+ next_start = 0;
143
+ temp_end = 0;
144
+ triggered = false;
145
+ }
146
+ }
147
+
148
+ const std::vector<timestamp_t>& VadIterator::get_speech_timestamps() const {
149
+ return speeches;
150
+ }
151
+
152
+ void VadIterator::reset() {
153
+ reset_states();
154
+ }
155
+
156
+ // 构造函数实现
157
+ VadIterator::VadIterator(const std::string ModelPath,
158
+ int Sample_rate,
159
+ int windows_frame_size,
160
+ float Threshold,
161
+ int min_silence_duration_ms,
162
+ int speech_pad_ms,
163
+ int min_speech_duration_ms,
164
+ float max_speech_duration_s)
165
+ : sample_rate(Sample_rate),
166
+ threshold(Threshold),
167
+ speech_pad_samples(speech_pad_ms),
168
+ prev_end(0),
169
+ memory_info(Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtDeviceAllocator, OrtMemType::OrtMemTypeDefault))
170
+ {
171
+
172
+ sr_per_ms = sample_rate / 1000;
173
+ window_size_samples = windows_frame_size * sr_per_ms;
174
+ effective_window_size = window_size_samples + context_samples;
175
+
176
+ input_node_dims[0] = 1;
177
+ input_node_dims[1] = effective_window_size;
178
+
179
+ _state.resize(size_state);
180
+ sr.resize(1);
181
+ sr[0] = sample_rate;
182
+ _context.assign(context_samples, 0.0f);
183
+
184
+ min_speech_samples = sr_per_ms * min_speech_duration_ms;
185
+
186
+ if (max_speech_duration_s < 0) {
187
+ max_speech_samples = std::numeric_limits<float>::infinity();
188
+ } else {
189
+ max_speech_samples = (sample_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples);
190
+ }
191
+
192
+ min_silence_samples = sr_per_ms * min_silence_duration_ms;
193
+ min_silence_samples_at_max_speech = sr_per_ms * 98;
194
+
195
+ init_onnx_model(ModelPath);
196
+ }
silero_vad_onnx/vad_iterator.h ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef VAD_ITERATOR_H
2
+ #define VAD_ITERATOR_H
3
+
4
+ #include "time_stamp.h"
5
+ #include <vector>
6
+ #include <string>
7
+ #if defined(__APPLE__)
8
+ #include <onnxruntime/onnxruntime_cxx_api.h>
9
+ #else
10
+ #include "onnxruntime_run_options_config_keys.h"
11
+ #include "onnxruntime_cxx_api.h"
12
+ #endif
13
+ // 前向声明 timestamp_t
14
+ class timestamp_t;
15
+
16
+ class VadIterator {
17
+ public:
18
+ // 构造函数
19
+ VadIterator(const std::string ModelPath,
20
+ int Sample_rate = 16000,
21
+ int windows_frame_size = 32,
22
+ float Threshold = 0.5,
23
+ int min_silence_duration_ms = 100,
24
+ int speech_pad_ms = 30,
25
+ int min_speech_duration_ms = 250,
26
+ float max_speech_duration_s = -1); // -1 表示无穷大
27
+
28
+ // 公共方法
29
+ void process(const std::vector<float>& input_wav);
30
+ const std::vector<timestamp_t>& get_speech_timestamps() const;
31
+ void reset();
32
+
33
+ private:
34
+ // ONNX Runtime 资源
35
+ Ort::Env env;
36
+ Ort::SessionOptions session_options;
37
+ std::shared_ptr<Ort::Session> session = nullptr;
38
+ Ort::AllocatorWithDefaultOptions allocator;
39
+ Ort::MemoryInfo memory_info;
40
+
41
+ // Context 相关变量
42
+ const int context_samples = 64;
43
+ std::vector<float> _context;
44
+ int window_size_samples;
45
+ int effective_window_size;
46
+ int sr_per_ms;
47
+
48
+ // ONNX 输入输出相关
49
+ std::vector<Ort::Value> ort_inputs;
50
+ std::vector<const char*> input_node_names = {"input", "state", "sr"};
51
+ std::vector<float> input;
52
+ unsigned int size_state = 2 * 1 * 128;
53
+ std::vector<float> _state;
54
+ std::vector<int64_t> sr;
55
+ int64_t input_node_dims[2];
56
+ const int64_t state_node_dims[3] = {2, 1, 128};
57
+ const int64_t sr_node_dims[1] = {1};
58
+ std::vector<Ort::Value> ort_outputs;
59
+ std::vector<const char*> output_node_names = {"output", "stateN"};
60
+
61
+ // 模型参数
62
+ int sample_rate;
63
+ float threshold;
64
+ int min_silence_samples;
65
+ int min_silence_samples_at_max_speech;
66
+ int min_speech_samples;
67
+ float max_speech_samples;
68
+ int speech_pad_samples;
69
+ int audio_length_samples;
70
+
71
+ // 状态管理
72
+ bool triggered = false;
73
+ unsigned int temp_end = 0;
74
+ unsigned int current_sample = 0;
75
+ int prev_end;
76
+ int next_start = 0;
77
+ std::vector<timestamp_t> speeches;
78
+ timestamp_t current_speech;
79
+
80
+ // 私有方法
81
+ void init_onnx_model(const std::string& model_path);
82
+ void init_engine_threads(int inter_threads, int intra_threads);
83
+ void reset_states();
84
+ void predict(const std::vector<float>& data_chunk);
85
+ };
86
+
87
+ #endif // VAD_ITERATOR_H
silero_vad_onnx/wav.h ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) 2016 Personal (Binbin Zhang)
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #ifndef FRONTEND_WAV_H_
16
+ #define FRONTEND_WAV_H_
17
+
18
+
19
+ #include <assert.h>
20
+ #include <stdint.h>
21
+ #include <stdio.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+
25
+ #include <string>
26
+
27
+ #include <iostream>
28
+
29
+ // #include "utils/log.h"
30
+
31
+ namespace wav {
32
+
33
+ struct WavHeader {
34
+ char riff[4]; // "riff"
35
+ unsigned int size;
36
+ char wav[4]; // "WAVE"
37
+ char fmt[4]; // "fmt "
38
+ unsigned int fmt_size;
39
+ uint16_t format;
40
+ uint16_t channels;
41
+ unsigned int sample_rate;
42
+ unsigned int bytes_per_second;
43
+ uint16_t block_size;
44
+ uint16_t bit;
45
+ char data[4]; // "data"
46
+ unsigned int data_size;
47
+ };
48
+
49
+ class WavReader {
50
+ public:
51
+ WavReader() : data_(nullptr) {}
52
+ explicit WavReader(const std::string& filename) { Open(filename); }
53
+
54
+ bool Open(const std::string& filename) {
55
+ FILE* fp = fopen(filename.c_str(), "rb"); //文件读取
56
+ if (NULL == fp) {
57
+ std::cout << "Error in read " << filename;
58
+ return false;
59
+ }
60
+
61
+ WavHeader header;
62
+ fread(&header, 1, sizeof(header), fp);
63
+ if (header.fmt_size < 16) {
64
+ printf("WaveData: expect PCM format data "
65
+ "to have fmt chunk of at least size 16.\n");
66
+ return false;
67
+ } else if (header.fmt_size > 16) {
68
+ int offset = 44 - 8 + header.fmt_size - 16;
69
+ fseek(fp, offset, SEEK_SET);
70
+ fread(header.data, 8, sizeof(char), fp);
71
+ }
72
+ // check "riff" "WAVE" "fmt " "data"
73
+
74
+ // Skip any sub-chunks between "fmt" and "data". Usually there will
75
+ // be a single "fact" sub chunk, but on Windows there can also be a
76
+ // "list" sub chunk.
77
+ while (0 != strncmp(header.data, "data", 4)) {
78
+ // We will just ignore the data in these chunks.
79
+ fseek(fp, header.data_size, SEEK_CUR);
80
+ // read next sub chunk
81
+ fread(header.data, 8, sizeof(char), fp);
82
+ }
83
+
84
+ if (header.data_size == 0) {
85
+ int offset = ftell(fp);
86
+ fseek(fp, 0, SEEK_END);
87
+ header.data_size = ftell(fp) - offset;
88
+ fseek(fp, offset, SEEK_SET);
89
+ }
90
+
91
+ num_channel_ = header.channels;
92
+ sample_rate_ = header.sample_rate;
93
+ bits_per_sample_ = header.bit;
94
+ int num_data = header.data_size / (bits_per_sample_ / 8);
95
+ data_ = new float[num_data]; // Create 1-dim array
96
+ num_samples_ = num_data / num_channel_;
97
+
98
+ std::cout << "num_channel_ :" << num_channel_ << std::endl;
99
+ std::cout << "sample_rate_ :" << sample_rate_ << std::endl;
100
+ std::cout << "bits_per_sample_:" << bits_per_sample_ << std::endl;
101
+ std::cout << "num_samples :" << num_data << std::endl;
102
+ std::cout << "num_data_size :" << header.data_size << std::endl;
103
+
104
+ switch (bits_per_sample_) {
105
+ case 8: {
106
+ char sample;
107
+ for (int i = 0; i < num_data; ++i) {
108
+ fread(&sample, 1, sizeof(char), fp);
109
+ data_[i] = static_cast<float>(sample) / 32768;
110
+ }
111
+ break;
112
+ }
113
+ case 16: {
114
+ int16_t sample;
115
+ for (int i = 0; i < num_data; ++i) {
116
+ fread(&sample, 1, sizeof(int16_t), fp);
117
+ data_[i] = static_cast<float>(sample) / 32768;
118
+ }
119
+ break;
120
+ }
121
+ case 32:
122
+ {
123
+ if (header.format == 1) //S32
124
+ {
125
+ int sample;
126
+ for (int i = 0; i < num_data; ++i) {
127
+ fread(&sample, 1, sizeof(int), fp);
128
+ data_[i] = static_cast<float>(sample) / 32768;
129
+ }
130
+ }
131
+ else if (header.format == 3) // IEEE-float
132
+ {
133
+ float sample;
134
+ for (int i = 0; i < num_data; ++i) {
135
+ fread(&sample, 1, sizeof(float), fp);
136
+ data_[i] = static_cast<float>(sample);
137
+ }
138
+ }
139
+ else {
140
+ printf("unsupported quantization bits\n");
141
+ }
142
+ break;
143
+ }
144
+ default:
145
+ printf("unsupported quantization bits\n");
146
+ break;
147
+ }
148
+
149
+ fclose(fp);
150
+ return true;
151
+ }
152
+
153
+ int num_channel() const { return num_channel_; }
154
+ int sample_rate() const { return sample_rate_; }
155
+ int bits_per_sample() const { return bits_per_sample_; }
156
+ int num_samples() const { return num_samples_; }
157
+
158
+ ~WavReader() {
159
+ delete[] data_;
160
+ }
161
+
162
+ const float* data() const { return data_; }
163
+
164
+ private:
165
+ int num_channel_;
166
+ int sample_rate_;
167
+ int bits_per_sample_;
168
+ int num_samples_; // sample points per channel
169
+ float* data_;
170
+ };
171
+
172
+ class WavWriter {
173
+ public:
174
+ WavWriter(const float* data, int num_samples, int num_channel,
175
+ int sample_rate, int bits_per_sample)
176
+ : data_(data),
177
+ num_samples_(num_samples),
178
+ num_channel_(num_channel),
179
+ sample_rate_(sample_rate),
180
+ bits_per_sample_(bits_per_sample) {}
181
+
182
+ void Write(const std::string& filename) {
183
+ FILE* fp = fopen(filename.c_str(), "w");
184
+ // init char 'riff' 'WAVE' 'fmt ' 'data'
185
+ WavHeader header;
186
+ char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57,
187
+ 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00,
188
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
189
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
190
+ 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00};
191
+ memcpy(&header, wav_header, sizeof(header));
192
+ header.channels = num_channel_;
193
+ header.bit = bits_per_sample_;
194
+ header.sample_rate = sample_rate_;
195
+ header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8);
196
+ header.size = sizeof(header) - 8 + header.data_size;
197
+ header.bytes_per_second =
198
+ sample_rate_ * num_channel_ * (bits_per_sample_ / 8);
199
+ header.block_size = num_channel_ * (bits_per_sample_ / 8);
200
+
201
+ fwrite(&header, 1, sizeof(header), fp);
202
+
203
+ for (int i = 0; i < num_samples_; ++i) {
204
+ for (int j = 0; j < num_channel_; ++j) {
205
+ switch (bits_per_sample_) {
206
+ case 8: {
207
+ char sample = static_cast<char>(data_[i * num_channel_ + j]);
208
+ fwrite(&sample, 1, sizeof(sample), fp);
209
+ break;
210
+ }
211
+ case 16: {
212
+ int16_t sample = static_cast<int16_t>(data_[i * num_channel_ + j]);
213
+ fwrite(&sample, 1, sizeof(sample), fp);
214
+ break;
215
+ }
216
+ case 32: {
217
+ int sample = static_cast<int>(data_[i * num_channel_ + j]);
218
+ fwrite(&sample, 1, sizeof(sample), fp);
219
+ break;
220
+ }
221
+ }
222
+ }
223
+ }
224
+ fclose(fp);
225
+ }
226
+
227
+ private:
228
+ const float* data_;
229
+ int num_samples_; // total float points in data_
230
+ int num_channel_;
231
+ int sample_rate_;
232
+ int bits_per_sample_;
233
+ };
234
+
235
+ } // namespace wav
236
+
237
+ #endif // FRONTEND_WAV_H_
vad_onnx/CMakeLists.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cmake_minimum_required(VERSION 3.16)
2
+ project(VadOnnx)
3
+
4
+ set(CMAKE_CXX_STANDARD 17)
5
+ set(CMAKE_CXX_STANDARD_REQUIRED ON)
6
+ set(CMAKE_CXX_EXTENSIONS OFF)
7
+
8
+ # 添加 ONNX Runtime include 路径
9
+ include_directories(${ONNXRUNTIME_DIR}/include)
10
+
11
+ # 添加项目头文件目录
12
+ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
13
+
14
+ add_library(vad_onnx SHARED ${CMAKE_CURRENT_SOURCE_DIR}/vad_onnx.cpp)
15
+
16
+ # 设置库输出名称(跨平台兼容)
17
+ # set_target_properties(vad_onnx PROPERTIES
18
+ # PREFIX ""
19
+ # SUFFIX ".so"
20
+ # LIBRARY_OUTPUT_NAME_DEBUG "vad_onnx"
21
+ # LIBRARY_OUTPUT_NAME_RELEASE "vad_onnx"
22
+ # )
23
+
24
+ # 链接 ONNX Runtime 库
25
+ if(APPLE)
26
+ # macOS 上链接 dylib
27
+ target_link_libraries(vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.dylib)
28
+ elseif(UNIX)
29
+ # Linux 上链接 so
30
+ target_link_libraries(vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/libonnxruntime.so)
31
+ elseif(WIN32)
32
+ # Windows 上链接 dll + lib
33
+ target_link_libraries(vad_onnx PRIVATE ${ONNXRUNTIME_DIR}/lib/onnxruntime.lib)
34
+ set_target_properties(vad_onnx PROPERTIES SUFFIX ".dll")
35
+ else()
36
+ message(WARNING "Unknown platform, no ONNX Runtime linking applied.")
37
+ endif()
38
+
39
+
40
+
vad_onnx/vad_onnx.cpp ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <stdexcept>
2
+ #include <cmath>
3
+ #include <iostream>
4
+
5
+ #include "vad_onnx.h"
6
+
7
+
8
+ static void get_input_names(Ort::Session* session, std::vector<std::string> &input_names_str,
9
+ std::vector<const char *> &input_names_char) {
10
+ Ort::AllocatorWithDefaultOptions allocator;
11
+ size_t nodes_num = session->GetInputCount();
12
+ input_names_str.resize(nodes_num);
13
+ input_names_char.resize(nodes_num);
14
+
15
+ for (size_t i = 0; i != nodes_num; ++i) {
16
+ auto t = session->GetInputNameAllocated(i, allocator);
17
+ input_names_str[i] = t.get();
18
+ input_names_char[i] = input_names_str[i].c_str();
19
+ }
20
+ }
21
+
22
+ static void get_output_names(Ort::Session* session, std::vector<std::string> &output_names_,
23
+ std::vector<const char *> &vad_out_names_) {
24
+ Ort::AllocatorWithDefaultOptions allocator;
25
+ size_t nodes_num = session->GetOutputCount();
26
+ output_names_.resize(nodes_num);
27
+ vad_out_names_.resize(nodes_num);
28
+ for (size_t i = 0; i != nodes_num; ++i) {
29
+ auto t = session->GetOutputNameAllocated(i, allocator);
30
+ output_names_[i] = t.get();
31
+ vad_out_names_[i] = output_names_[i].c_str();
32
+ }
33
+ }
34
+
35
+ VadOnnx::VadOnnx(const std::string& model_path,
36
+ int batch_size,
37
+ int thread_num,
38
+ float threshold,
39
+ int sampling_rate,
40
+ int min_silence_duration_ms,
41
+ float max_speech_duration_s,
42
+ int speech_pad_ms)
43
+ : batch_size_(batch_size),
44
+ thread_num_(thread_num),
45
+ threshold_(threshold),
46
+ sample_rates_(sampling_rate),
47
+ min_silence_samples_(sampling_rate * min_silence_duration_ms / 1000.0),
48
+ speech_pad_samples_(sampling_rate * speech_pad_ms / 1000.0),
49
+ triggered_(false),
50
+ temp_end_(0),
51
+ current_sample_(0),
52
+ start_(0),
53
+ memory_info(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU))
54
+ {
55
+
56
+ init_onnx_model(model_path);
57
+
58
+ get_input_names(session.get(), input_names_, vad_in_names_);
59
+ get_output_names(session.get(), output_names_, vad_out_names_);
60
+
61
+ sr.resize(1);
62
+ sr[0] = sample_rates_;
63
+
64
+ if (batch_size_ != 1) {
65
+ state_shape = {2, batch_size_, 128};
66
+ state_size = 2 * batch_size_ * 128;
67
+ }
68
+ state_.resize(state_size);
69
+
70
+ context_size = (sample_rates_ == 16000) ? 64 : 32;
71
+ context_.resize(context_size);
72
+
73
+ effective_window_size = window_size_samples + context_size;
74
+ input_node_shape[0] = 1;
75
+ input_node_shape[1] = effective_window_size;
76
+
77
+ reset_states();
78
+ }
79
+
80
+ VadOnnx::~VadOnnx() = default;
81
+
82
+ void VadOnnx::reset_states() {
83
+ std::memset(state_.data(), 0, state_.size() * sizeof(float));
84
+ std::fill(context_.begin(), context_.end(), 0.0f);
85
+ triggered_ = false;
86
+ temp_end_ = 0;
87
+ current_sample_ = 0;
88
+ start_ = 0;
89
+ last_sr_ = 0;
90
+ last_batch_size_ = 0;
91
+ }
92
+
93
+ float VadOnnx::forward_infer(std::vector<float>& data_chunk) {
94
+ // 合并 context 和 input
95
+ std::vector<float> x_with_context(effective_window_size, 0.0f);
96
+ std::copy(context_.begin(), context_.end(), x_with_context.begin());
97
+ std::copy(data_chunk.begin(), data_chunk.end(), x_with_context.begin() + context_size);
98
+ input = x_with_context;
99
+
100
+ // Prepare inputs
101
+ Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
102
+ memory_info, input.data(), input.size(), input_node_shape.data(), 2);
103
+ Ort::Value state_tensor = Ort::Value::CreateTensor<float>(
104
+ memory_info, state_.data(), state_.size(), state_shape.data(), 3);
105
+ Ort::Value sr_tensor = Ort::Value::CreateTensor<int64_t>(
106
+ memory_info, sr.data(), 1, sr_shape.data(), 1);
107
+
108
+ ort_inputs.clear();
109
+ ort_inputs.emplace_back(std::move(input_tensor));
110
+ ort_inputs.emplace_back(std::move(state_tensor));
111
+ ort_inputs.emplace_back(std::move(sr_tensor));
112
+
113
+ // Run inference
114
+ ort_outputs = session->Run(
115
+ Ort::RunOptions{nullptr}, vad_in_names_.data(), ort_inputs.data(),
116
+ ort_inputs.size(), vad_out_names_.data(), vad_out_names_.size());
117
+
118
+
119
+ // Get output
120
+ float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
121
+
122
+ // Update state
123
+ float* stateN = ort_outputs[1].GetTensorMutableData<float>();
124
+ std::memcpy(state_.data(), stateN, state_size * sizeof(float));
125
+
126
+ // Update context
127
+ std::copy(x_with_context.end() - context_size, x_with_context.end(), context_.begin());
128
+
129
+ return speech_prob;
130
+ }
131
+
132
+ std::vector<float> VadOnnx::vad_dectect(std::vector<float>& audio) {
133
+ std::vector<float> result;
134
+
135
+ // Pad to multiple of num_samples
136
+ int pad_num = (window_size_samples - (audio.size() % window_size_samples)) % window_size_samples;
137
+ audio.insert(audio.end(), pad_num, 0.0f);
138
+
139
+ for (size_t i = 0; i < audio.size(); i += window_size_samples) {
140
+ std::vector<float> chunk(audio.begin() + i, audio.begin() + i + window_size_samples);
141
+ auto prob = forward_infer(chunk);
142
+ result.emplace_back(prob);
143
+ }
144
+
145
+ return result;
146
+ }
147
+
148
+ std::map<std::string, double> VadOnnx::vad_dectect(std::vector<float>& audio, bool return_seconds) {
149
+ std::map<std::string, double> result;
150
+
151
+ // 将新音频追加到缓存中
152
+ buffer_.insert(buffer_.end(), audio.begin(), audio.end());
153
+
154
+ while (buffer_.size() > 0) {
155
+ std::map<std::string, double> tmp;
156
+ std::vector<float> chunk(buffer_.begin(), buffer_.begin() + std::min(static_cast<int>(buffer_.size()), window_size_samples));
157
+ // 补零到固定长度
158
+ if (chunk.size() < static_cast<size_t>(window_size_samples)) {
159
+ chunk.resize(window_size_samples, 0.0f);
160
+ }
161
+
162
+ current_sample_ += window_size_samples;
163
+
164
+ // 推理得到语音概率
165
+ float speech_prob = forward_infer(chunk);
166
+
167
+ if (speech_prob >= threshold_ && temp_end_ > 0) {
168
+ temp_end_ = 0;
169
+ }
170
+
171
+ if (speech_prob >= threshold_ && !triggered_) {
172
+ triggered_ = true;
173
+ start_ = std::max(0.0, current_sample_ - window_size_samples);
174
+ tmp["start"] = return_seconds ? start_ / sample_rates_ : start_;
175
+ }
176
+
177
+ if (speech_prob < (threshold_ - 0.15) && triggered_) {
178
+ if (temp_end_ == 0) {
179
+ temp_end_ = current_sample_;
180
+ }
181
+
182
+ if (current_sample_ - temp_end_ >= min_silence_samples_) {
183
+ double speech_end = temp_end_;
184
+ tmp["end"] = return_seconds ? speech_end / sample_rates_ : speech_end;
185
+ temp_end_ = 0;
186
+ triggered_ = false;
187
+ }
188
+ }
189
+
190
+ // 移除已处理的数据
191
+ if (window_size_samples >= buffer_.size()) {
192
+ buffer_.clear(); // 全部丢弃
193
+ } else {
194
+ std::copy(buffer_.begin() + window_size_samples, buffer_.end(), buffer_.begin());
195
+ buffer_.resize(buffer_.size() - window_size_samples);
196
+ }
197
+
198
+ // 合并检测结果
199
+ if (result.empty()) {
200
+ result = tmp;
201
+ } else if (!tmp.empty()) {
202
+ // 如果当前结果有 'end',更新最终 end
203
+ if (tmp.find("end") != tmp.end()) {
204
+ result["end"] = tmp["end"];
205
+ }
206
+
207
+ // 如果有新的 start,但前一个有 end,则合并成连续语音段
208
+ if (tmp.find("start") != tmp.end() && result.find("end") != result.end()) {
209
+ result.erase("end");
210
+ }
211
+ }
212
+ }
213
+
214
+ return result;
215
+ }
216
+
217
+ void VadOnnx::init_onnx_model(const std::string& model_path) {
218
+ init_engine_threads(1, 1);
219
+ init_exec_provider();
220
+
221
+ // 初始化 ONNX Session
222
+ env_ = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "VadOnnx");
223
+ session = std::make_unique<Ort::Session>(env_, ORTCHAR(model_path.c_str()), session_options);
224
+ }
225
+
226
+ void VadOnnx::init_engine_threads(int inter_threads, int intra_threads) {
227
+ session_options.SetInterOpNumThreads(inter_threads);
228
+ session_options.SetIntraOpNumThreads(intra_threads);
229
+ session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
230
+ }
231
+
232
+ void VadOnnx::init_exec_provider() {
233
+ // 获取所有可用的 Execution Providers
234
+ std::vector<std::string> providers = Ort::GetAvailableProviders();
235
+ // 根据支持情况添加 Execution Provider
236
+ if (std::find(providers.begin(), providers.end(), "CUDAExecutionProvider") != providers.end()) {
237
+ OrtCUDAProviderOptions cuda_options{};
238
+ session_options.AppendExecutionProvider_CUDA(cuda_options);
239
+ }
240
+ // #if defined(__APPLE__)
241
+ // if (std::find(providers.begin(), providers.end(), "CoreMLExecutionProvider") != providers.end()) {
242
+ // session_options.AppendExecutionProvider_CoreML();
243
+ // }
244
+ // #endif
245
+ }
vad_onnx/vad_onnx.h ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <vector>
2
+ #include <string>
3
+ #include <map>
4
+
5
+ #if defined(__APPLE__)
6
+ #include <onnxruntime/onnxruntime_cxx_api.h>
7
+ #else
8
+ #include "onnxruntime_run_options_config_keys.h"
9
+ #include "onnxruntime_cxx_api.h"
10
+ #endif
11
+
12
+ #ifdef _WIN32
13
+
14
+ #define ORTSTRING(str) StrToWstr(str)
15
+ #define ORTCHAR(str) StrToWstr(str).c_str()
16
+
17
+ inline std::wstring String2wstring(const std::string& str, const std::string& locale)
18
+ {
19
+ typedef std::codecvt_byname<wchar_t, char, std::mbstate_t> F;
20
+ std::wstring_convert<F> strCnv(new F(locale));
21
+ return strCnv.from_bytes(str);
22
+ }
23
+
24
+ inline std::wstring StrToWstr(std::string str) {
25
+ if (str.length() == 0)
26
+ return L"";
27
+ return String2wstring(str, "zh-CN");
28
+
29
+ }
30
+
31
+ #else
32
+
33
+ #define ORTSTRING(str) str
34
+ #define ORTCHAR(str) str
35
+
36
+ #endif
37
+
38
+ class VadOnnx {
39
+
40
+ public:
41
+ explicit VadOnnx(const std::string& model_path,
42
+ int batch_size = 1,
43
+ int thread_num = 1,
44
+ float threshold = 0.5,
45
+ int sampling_rate = 16000,
46
+ int min_silence_duration_ms = 100,
47
+ float max_speech_duration_s = INFINITY,
48
+ int speech_pad_ms = 30);
49
+ ~VadOnnx();
50
+
51
+ // 处理固定长度音频(16000 -> 512 , 8000 -> 256)
52
+ float forward_infer(std::vector<float>& data_chunk);
53
+
54
+ // 处理整个长音频,返回概率
55
+ std::vector<float> vad_dectect(std::vector<float>& audio);
56
+
57
+ // 处理整个长音频,返回有效音频区间
58
+ std::map<std::string, double> vad_dectect(std::vector<float>& audio, bool return_seconds);
59
+
60
+ // 重置 RNN 状态
61
+ void reset_states();
62
+
63
+ private:
64
+ // onnx资源参数
65
+ Ort::Env env_;
66
+ Ort::SessionOptions session_options;
67
+ std::unique_ptr<Ort::Session> session = nullptr;
68
+ Ort::AllocatorWithDefaultOptions allocator;
69
+ Ort::MemoryInfo memory_info;
70
+ int thread_num_;
71
+
72
+ // onnx输入输出相关
73
+ std::vector<Ort::Value> ort_inputs, ort_outputs;
74
+ std::vector<std::string> input_names_, output_names_;
75
+ std::vector<const char *> vad_in_names_;
76
+ std::vector<const char *> vad_out_names_;
77
+
78
+ int window_size_samples = 512;
79
+ int effective_window_size;
80
+ std::array<int64_t, 2> input_node_shape;
81
+ std::vector<float> input;
82
+ std::array<int64_t, 3> state_shape = {2, 1, 128};
83
+ int state_size = 2 * 1 * 128;
84
+ std::vector<float> state_; // RNN State
85
+ int context_size;
86
+ std::vector<float> context_; // Context buffer
87
+ std::array<int64_t, 1> sr_shape = {1};
88
+ std::vector<int64_t> sr;
89
+
90
+ // vad推理参数
91
+ std::vector<float> buffer_; // 缓冲区用于保存未处理完的音频
92
+ double min_silence_samples_;
93
+ double speech_pad_samples_;
94
+ double temp_end_;
95
+ double current_sample_;
96
+ double start_;
97
+ float threshold_;
98
+ bool triggered_;
99
+ int batch_size_;
100
+ int sample_rates_;
101
+ int last_sr_ = 0;
102
+ int last_batch_size_ = 0;
103
+
104
+ void init_onnx_model(const std::string& model_path);
105
+ void init_engine_threads(int inter_threads, int intra_threads);
106
+ void init_exec_provider();
107
+ };