// Command-line tool that runs the Silero voice-activity detector over a WAV
// file and prints the start and end time of every detected speech segment.
package main

import (
	"errors"
	"flag"
	"fmt"
	"log"
	"math"
	"os"

	"github.com/go-audio/wav"
	"github.com/streamer45/silero-vad-go/speech"
)

// main delegates to run so that deferred cleanup executes before the process
// exits; log.Fatal calls os.Exit, which skips any pending defers.
func main() {
	if err := run(); err != nil {
		log.Fatal(err)
	}
}

// run parses the command-line flags, builds the speech detector, feeds it the
// decoded WAV samples and prints the detected segments to stdout.
//
// It returns the first error encountered. The detector is destroyed on every
// path once it has been created; a Destroy failure is reported only when no
// earlier error occurred.
func run() (err error) {
	modelPathPtr := flag.String("model_path", "./pretrained_models/silero_vad/silero_vad.onnx", "silero vad onnx model")
	filenamePtr := flag.String("filename", "", "input wav audio file")
	silenceTimePtr := flag.Float64("silence_time", 0.1, "in the end of each speech chunk wait for min_silence_duration_ms before separating it")
	speechPadTimePtr := flag.Float64("speech_pad_time", 0.03, "final speech chunks are padded by speech_pad_ms each side")
	sampleRatePtr := flag.Uint64("sample_rate", 8000, "sample rate")
	thresholdPtr := flag.Float64("threshold", 0.5, "Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but lazy 0.5 is pretty good for most datasets.")
	flag.Parse()

	filename := *filenamePtr
	if filename == "" {
		// Fail before loading the model: nothing useful can happen without input.
		return errors.New("missing required -filename flag")
	}

	sampleRate := int(*sampleRatePtr)
	threshold := float32(*thresholdPtr)
	silenceTimeMs := secondsToMs(*silenceTimePtr)
	speechPadTimeMs := secondsToMs(*speechPadTimePtr)

	fmt.Println(filename)
	fmt.Printf("silenceTimeMs: %d\n", silenceTimeMs)
	fmt.Printf("speechPadTimeMs: %d\n", speechPadTimeMs)
	fmt.Printf("sampleRate: %d\n", sampleRate)
	fmt.Printf("threshold: %0.2f\n", threshold)

	sd, err := speech.NewDetector(speech.DetectorConfig{
		ModelPath:            *modelPathPtr,
		SampleRate:           sampleRate,
		Threshold:            threshold,
		MinSilenceDurationMs: silenceTimeMs,
		SpeechPadMs:          speechPadTimeMs,
	})
	if err != nil {
		return fmt.Errorf("failed to create speech detector: %w", err)
	}
	defer func() {
		// Keep the first error; only surface the Destroy failure on an
		// otherwise successful run.
		if derr := sd.Destroy(); derr != nil && err == nil {
			err = fmt.Errorf("failed to destroy detector: %w", derr)
		}
	}()

	samples, err := readPCM(filename, sampleRate)
	if err != nil {
		return err
	}

	segments, err := sd.Detect(samples)
	if err != nil {
		return fmt.Errorf("Detect failed: %w", err)
	}

	for _, s := range segments {
		fmt.Printf("speech starts at %0.2fs\n", s.SpeechStartAt)
		// An end time that is not positive is not printed.
		if s.SpeechEndAt > 0 {
			fmt.Printf("speech ends at %0.2fs\n", s.SpeechEndAt)
		}
	}
	return nil
}

// secondsToMs converts a duration in seconds to whole milliseconds, rounding
// to the nearest integer.
//
// The arithmetic stays in float64 and rounds instead of truncating: going
// through float32 and truncating could lose a millisecond (for example
// 0.251 s became 250 ms).
func secondsToMs(seconds float64) int {
	return int(math.Round(seconds * 1e3))
}

// readPCM decodes the entire WAV file at path and returns its samples as
// float32 values.
//
// sampleRate is the rate the detector was configured with. It is used only to
// warn when the file header disagrees; the samples are returned unchanged
// either way.
func readPCM(path string, sampleRate int) ([]float32, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("failed to open sample audio file: %w", err)
	}
	defer f.Close()

	dec := wav.NewDecoder(f)
	if !dec.IsValidFile() {
		return nil, errors.New("invalid WAV file")
	}

	buf, err := dec.FullPCMBuffer()
	if err != nil {
		return nil, fmt.Errorf("failed to get PCM buffer: %w", err)
	}

	// The samples are handed to the detector as one flat slice with no
	// resampling or down-mixing, so a header that disagrees with the detector
	// configuration presumably yields wrong timestamps. Warn rather than
	// abort, so existing invocations keep working.
	if format := buf.Format; format != nil {
		if format.SampleRate != sampleRate {
			log.Printf("warning: WAV sample rate is %d Hz but -sample_rate is %d; segment times may be wrong", format.SampleRate, sampleRate)
		}
		if format.NumChannels != 1 {
			log.Printf("warning: WAV file has %d channels; samples are passed to the detector without down-mixing", format.NumChannels)
		}
	}

	return buf.AsFloat32Buffer().Data, nil
}