Spaces:
Running
Running
File size: 2,659 Bytes
165d1b6 5deec0c d20c2c8 165d1b6 5deec0c 5d7199c 897c5df 5deec0c 9ff8c1e 8e74df8 5deec0c 9e67bc2 5deec0c 5a74d6f 165d1b6 5deec0c 6a85bff 8e74df8 5deec0c 165d1b6 6a85bff 165d1b6 b06194e e8560e7 165d1b6 b06194e 165d1b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
package main
import (
"flag"
"fmt"
"log"
"os"
"github.com/streamer45/silero-vad-go/speech"
"github.com/go-audio/wav"
)
// secondsToMs converts a non-negative duration in seconds to whole
// milliseconds, rounding to the nearest millisecond. Plain truncation would
// drop a millisecond whenever the floating-point product lands just below an
// integer (for example 1.001 s becoming 1000 ms).
func secondsToMs(seconds float64) int {
	return int(seconds*1e3 + 0.5)
}

// run creates a speech detector from cfg, decodes the WAV file at filename,
// and prints the start (and, when set, the end) of every detected speech
// segment to stdout.
//
// It returns an error instead of exiting so that the deferred cleanup (closing
// the file, destroying the detector) runs on every path; log.Fatal would skip
// deferred calls because it exits the process immediately.
func run(cfg speech.DetectorConfig, filename string) (err error) {
	// Fail fast before loading the model when no input was supplied.
	if filename == "" {
		return fmt.Errorf("missing input file: pass -filename")
	}

	sd, err := speech.NewDetector(cfg)
	if err != nil {
		return fmt.Errorf("failed to create speech detector: %w", err)
	}
	defer func() {
		// Report a Destroy failure only when nothing else went wrong, so the
		// first (more relevant) error is not masked.
		if derr := sd.Destroy(); derr != nil && err == nil {
			err = fmt.Errorf("failed to destroy detector: %w", derr)
		}
	}()

	f, err := os.Open(filename)
	if err != nil {
		return fmt.Errorf("failed to open sample audio file: %w", err)
	}
	defer f.Close()

	dec := wav.NewDecoder(f)
	if !dec.IsValidFile() {
		return fmt.Errorf("invalid WAV file")
	}
	// The detector is configured from the -sample_rate flag, not from the
	// file; warn when the two disagree since timestamps would then be off.
	if int(dec.SampleRate) != cfg.SampleRate {
		log.Printf("warning: WAV sample rate %d differs from configured sample rate %d", dec.SampleRate, cfg.SampleRate)
	}

	buf, err := dec.FullPCMBuffer()
	if err != nil {
		return fmt.Errorf("failed to get PCM buffer: %w", err)
	}

	segments, err := sd.Detect(buf.AsFloat32Buffer().Data)
	if err != nil {
		return fmt.Errorf("Detect failed: %w", err)
	}

	for _, s := range segments {
		fmt.Printf("speech starts at %0.2fs\n", s.SpeechStartAt)
		// A non-positive end is skipped; presumably it marks a segment that
		// was still open when the audio ended — confirm against the library.
		if s.SpeechEndAt > 0 {
			fmt.Printf("speech ends at %0.2fs\n", s.SpeechEndAt)
		}
	}
	return nil
}

// main parses the command-line flags, echoes the effective settings, and runs
// voice-activity detection over the WAV file named by -filename.
func main() {
	modelPath := flag.String("model_path", "./pretrained_models/silero_vad/silero_vad.onnx", "silero vad onnx model")
	filename := flag.String("filename", "", "input wav audio file")
	silenceTime := flag.Float64("silence_time", 0.1, "in the end of each speech chunk wait for min_silence_duration_ms before separating it")
	speechPadTime := flag.Float64("speech_pad_time", 0.03, "final speech chunks are padded by speech_pad_ms each side")
	sampleRate := flag.Uint64("sample_rate", 8000, "sample rate")
	threshold := flag.Float64("threshold", 0.5, "Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but lazy 0.5 is pretty good for most datasets.")
	flag.Parse()

	// The flags are given in seconds; the detector config takes milliseconds.
	cfg := speech.DetectorConfig{
		ModelPath:            *modelPath,
		SampleRate:           int(*sampleRate),
		Threshold:            float32(*threshold),
		MinSilenceDurationMs: secondsToMs(*silenceTime),
		SpeechPadMs:          secondsToMs(*speechPadTime),
	}

	fmt.Println(*filename)
	fmt.Printf("silenceTimeMs: %d\n", cfg.MinSilenceDurationMs)
	fmt.Printf("speechPadTimeMs: %d\n", cfg.SpeechPadMs)
	fmt.Printf("sampleRate: %d\n", cfg.SampleRate)
	fmt.Printf("threshold: %0.2f\n", cfg.Threshold)

	if err := run(cfg, *filename); err != nil {
		log.Fatal(err)
	}
}
|