File size: 2,659 Bytes
165d1b6
 
 
5deec0c
d20c2c8
165d1b6
 
 
 
 
 
 
5deec0c
 
 
 
5d7199c
897c5df
5deec0c
 
 
 
 
 
9ff8c1e
8e74df8
5deec0c
9e67bc2
 
5deec0c
5a74d6f
 
 
 
 
 
165d1b6
5deec0c
6a85bff
8e74df8
5deec0c
 
165d1b6
 
 
 
 
6a85bff
165d1b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b06194e
e8560e7
165d1b6
b06194e
165d1b6
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"github.com/streamer45/silero-vad-go/speech"
	"github.com/go-audio/wav"
)

// main is the command-line entry point. It parses the flags, echoes the
// effective settings to stdout, and runs voice-activity detection on the WAV
// file named by -filename, printing the start (and, when known, end) time of
// every detected speech segment. Any failure is logged and the process exits
// with a non-zero status.
func main() {
	modelPath := flag.String("model_path", "./pretrained_models/silero_vad/silero_vad.onnx", "silero vad onnx model")
	filename := flag.String("filename", "", "input wav audio file")
	silenceTime := flag.Float64("silence_time", 0.1, "in the end of each speech chunk wait for min_silence_duration_ms before separating it")
	speechPadTime := flag.Float64("speech_pad_time", 0.03, "final speech chunks are padded by speech_pad_ms each side")
	sampleRate := flag.Uint64("sample_rate", 8000, "sample rate")
	threshold := flag.Float64("threshold", 0.5, "Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but lazy 0.5 is pretty good for most datasets.")
	flag.Parse()

	// -filename has no usable default; fail early with a clear message
	// instead of loading the model and then failing on os.Open("").
	if *filename == "" {
		log.Fatal("missing required -filename flag")
	}
	if *silenceTime < 0 || *speechPadTime < 0 {
		log.Fatal("-silence_time and -speech_pad_time must not be negative")
	}

	cfg := speech.DetectorConfig{
		ModelPath:            *modelPath,
		SampleRate:           int(*sampleRate),
		Threshold:            float32(*threshold),
		MinSilenceDurationMs: secondsToMs(*silenceTime),
		SpeechPadMs:          secondsToMs(*speechPadTime),
	}

	fmt.Println(*filename)
	fmt.Printf("silenceTimeMs: %d\n", cfg.MinSilenceDurationMs)
	fmt.Printf("speechPadTimeMs: %d\n", cfg.SpeechPadMs)
	fmt.Printf("sampleRate: %d\n", cfg.SampleRate)
	fmt.Printf("threshold: %0.2f\n", cfg.Threshold)

	if err := run(cfg, *filename); err != nil {
		log.Fatal(err)
	}
}

// secondsToMs converts a non-negative duration in seconds to whole
// milliseconds, rounding to the nearest millisecond. Plain truncation can
// land one millisecond low when the product is not exactly representable in
// floating point.
func secondsToMs(sec float64) int {
	return int(sec*1e3 + 0.5)
}

// run creates a detector from cfg, decodes the WAV file at filename, and
// prints the detected speech segments to stdout. It returns the first error
// encountered. The detector is destroyed on every return path; a Destroy
// failure is reported only when no earlier error occurred.
func run(cfg speech.DetectorConfig, filename string) (err error) {
	sd, err := speech.NewDetector(cfg)
	if err != nil {
		return fmt.Errorf("failed to create speech detector: %w", err)
	}
	// Deferred so the detector is destroyed even when a later step fails;
	// log.Fatalf at the failure site would skip this cleanup.
	defer func() {
		if derr := sd.Destroy(); derr != nil && err == nil {
			err = fmt.Errorf("failed to destroy detector: %w", derr)
		}
	}()

	f, err := os.Open(filename)
	if err != nil {
		return fmt.Errorf("failed to open sample audio file: %w", err)
	}
	defer f.Close()

	dec := wav.NewDecoder(f)
	if !dec.IsValidFile() {
		return fmt.Errorf("invalid WAV file")
	}

	// The samples are handed to the detector as one flat stream, so the file
	// must match the configured rate and be mono; otherwise the reported
	// timestamps would not correspond to positions in the audio.
	if int(dec.SampleRate) != cfg.SampleRate {
		return fmt.Errorf("WAV sample rate %d does not match -sample_rate %d", dec.SampleRate, cfg.SampleRate)
	}
	if dec.NumChans != 1 {
		return fmt.Errorf("WAV file has %d channels, only mono audio is supported", dec.NumChans)
	}

	buf, err := dec.FullPCMBuffer()
	if err != nil {
		return fmt.Errorf("failed to get PCM buffer: %w", err)
	}

	segments, err := sd.Detect(buf.AsFloat32Buffer().Data)
	if err != nil {
		return fmt.Errorf("Detect failed: %w", err)
	}

	for _, s := range segments {
		fmt.Printf("speech starts at %0.2fs\n", s.SpeechStartAt)

		// An end time is printed only when it is positive.
		if s.SpeechEndAt > 0 {
			fmt.Printf("speech ends at %0.2fs\n", s.SpeechEndAt)
		}
	}

	return nil
}