File size: 2,097 Bytes
165d1b6
 
 
5deec0c
d20c2c8
165d1b6
 
 
 
 
 
 
5deec0c
 
 
 
6a85bff
5deec0c
 
 
 
 
 
6a85bff
5deec0c
 
9e67bc2
 
5deec0c
165d1b6
5deec0c
6a85bff
165d1b6
5deec0c
 
165d1b6
 
 
 
 
6a85bff
165d1b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b06194e
e8560e7
165d1b6
b06194e
165d1b6
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"github.com/streamer45/silero-vad-go/speech"
	"github.com/go-audio/wav"
)

func main() {
    modelPathPtr := flag.String("model_path", "./pretrained_models/silero_vad/silero_vad.onnx", "silero vad onnx model")
    filenamePtr := flag.String("filename", "", "input wav audio file")
    silenceTimePtr := flag.Float64("silence_time", 0.1, "in the end of each speech chunk wait for min_silence_duration_ms before separating it")
	speechPadTimePtr := flag.Float64("speech_pad_time", 0.03, "final speech chunks are padded by speech_pad_ms each side")
    sampleRatePtr := flag.Int("sample_rate", 8000, "sample rate")
	flag.Parse()

    var modelPath string = *modelPathPtr
    var filename string = *filenamePtr
    var silenceTime float32 = float32(*silenceTimePtr)
    var speechPadTime float32 = float32(*speechPadTimePtr)
    var sampleRate int = float32(*sampleRatePtr)
	fmt.Println(filename)

    var silenceTimeMs int = int(silenceTime * 1e3)
    var speechPadTimeMs int = int(speechPadTime * 1e3)

	sd, err := speech.NewDetector(speech.DetectorConfig{
		ModelPath:            modelPath,
		SampleRate:           sampleRate,
		Threshold:            0.5,
		MinSilenceDurationMs: silenceTimeMs,
		SpeechPadMs:          speechPadTimeMs,
	})
	if err != nil {
		log.Fatalf("failed to create speech detector: %s", err)
	}

	f, err := os.Open(filename)
	if err != nil {
		log.Fatalf("failed to open sample audio file: %s", err)
	}
	defer f.Close()

	dec := wav.NewDecoder(f)

	if ok := dec.IsValidFile(); !ok {
		log.Fatalf("invalid WAV file")
	}

	buf, err := dec.FullPCMBuffer()
	if err != nil {
		log.Fatalf("failed to get PCM buffer")
	}

	pcmBuf := buf.AsFloat32Buffer()

	segments, err := sd.Detect(pcmBuf.Data)
	if err != nil {
		log.Fatalf("Detect failed: %s", err)
	}

	for _, s := range segments {
		fmt.Printf("speech starts at %0.2fs\n", s.SpeechStartAt)

		if s.SpeechEndAt > 0 {
			fmt.Printf("speech ends at %0.2fs\n", s.SpeechEndAt)
		}
	}

	err = sd.Destroy()
	if err != nil {
		log.Fatalf("failed to destroy detector: %s", err)
	}
}