rockdrigo committed
Commit 9556d07 · 1 Parent(s): 64c445f
mic_test_whisper_simple.py ADDED
@@ -0,0 +1,80 @@
+ from microphone_stream import MicrophoneStream
+ from voice_activity_controller import VoiceActivityController
+ from whisper_online import *
+ import numpy as np
+ import librosa
+ import io
+ import soundfile
+
+
+ class SimpleASRProcessor:
+
+     def __init__(self, asr, sampling_rate=16000):
+         """Run this when starting or restarting processing."""
+         self.audio_buffer = np.array([], dtype=np.float32)
+         self.asr = asr
+         self.sampling_rate = sampling_rate
+         self.init_prompt = ''
+
+     def ts_words(self, segments):
+         """Concatenate the words of all segments, skipping segments that are probably not speech."""
+         result = ""
+         for segment in segments:
+             if segment.no_speech_prob > 0.9:
+                 continue
+             for word in segment.words:
+                 result += word.word
+         return result
+
+     def stream_process(self, vad_result):
+         """Consume (chunk, is_final) pairs from the VAD and yield (is_final, text) pairs."""
+         iter_in_phrase = 0
+         for chunk, is_final in vad_result:
+             iter_in_phrase += 1
+
+             if chunk is not None:
+                 # the VAD yields raw little-endian 16-bit PCM; decode it to float32
+                 sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1, endian="LITTLE",
+                                          samplerate=self.sampling_rate, subtype="PCM_16", format="RAW")
+                 audio, _ = librosa.load(sf, sr=self.sampling_rate)
+                 self.audio_buffer = np.append(self.audio_buffer, audio)
+
+             if is_final and len(self.audio_buffer) > 0:
+                 # end of phrase: transcribe the whole buffer and reset it
+                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
+                 tsw = self.ts_words(res)
+                 # keep the last 100 characters as the prompt for the next phrase
+                 self.init_prompt = (self.init_prompt + tsw)[-100:]
+                 self.audio_buffer = np.array([], dtype=np.float32)
+                 iter_in_phrase = 0
+                 yield True, tsw
+             # show progress every 20 chunks
+             elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
+                 res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
+                 tsw = self.ts_words(res)
+                 yield False, tsw
+
+
+ SAMPLING_RATE = 16000
+
+ model = "large-v2"
+ src_lan = "en"  # source language
+ tgt_lan = "en"  # target language -- same as source for ASR, "en" if the translate task is used
+ use_vad_result = True
+ min_sample_length = 1 * SAMPLING_RATE
+
+ vad = VoiceActivityController(use_vad_result=use_vad_result)
+ asr = FasterWhisperASR(src_lan, model)  # loads and wraps the Whisper model
+
+ tokenizer = create_tokenizer(tgt_lan)
+ online = SimpleASRProcessor(asr)
+
+ stream = MicrophoneStream()
+ stream = vad.detect_user_speech(stream, audio_in_int16=False)
+ stream = online.stream_process(stream)
+
+ for is_final, text in stream:
+     if is_final:
+         print(text, end="\r\n")
+     else:
+         print(text, end="\r")
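
Since the three stages chain as plain generators, the microphone can be swapped for any iterable of raw int16 byte blocks. A minimal sketch for driving the same pipeline from a 16 kHz mono wav file instead of the microphone (file_stream and speech.wav are hypothetical names for this illustration; SimpleASRProcessor is the class defined above):

    from voice_activity_controller import VoiceActivityController
    from whisper_online import FasterWhisperASR
    import soundfile

    def file_stream(path, block_samples=1600):
        # yield ~100 ms blocks of raw int16 bytes, mimicking MicrophoneStream
        audio, _ = soundfile.read(path, dtype='int16')
        for i in range(0, len(audio), block_samples):
            yield audio[i:i + block_samples].tobytes()

    vad = VoiceActivityController(use_vad_result=True)
    online = SimpleASRProcessor(FasterWhisperASR("en", "large-v2"))
    for is_final, text in online.stream_process(vad.detect_user_speech(file_stream('speech.wav'))):
        print(text)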
mic_test_whisper_streaming.py ADDED
@@ -0,0 +1,58 @@
+ from microphone_stream import MicrophoneStream
+ from voice_activity_controller import VoiceActivityController
+ from whisper_online import *
+ import numpy as np
+ import librosa
+ import io
+ import soundfile
+
+
+ SAMPLING_RATE = 16000
+ model = "large-v2"
+ src_lan = "en"  # source language
+ tgt_lan = "en"  # target language -- same as source for ASR, "en" if the translate task is used
+ use_vad_result = True
+ min_sample_length = 1 * SAMPLING_RATE
+
+
+ asr = FasterWhisperASR(src_lan, model)  # loads and wraps the Whisper model
+ tokenizer = create_tokenizer(tgt_lan)  # sentence segmenter for the target language
+ online = OnlineASRProcessor(asr, tokenizer)  # create the processing object
+
+ microphone_stream = MicrophoneStream()
+ vad = VoiceActivityController(use_vad_result=use_vad_result)
+
+ complete_text = ''
+ out = []
+ out_len = 0
+ for raw_bytes, is_final in vad.detect_user_speech(microphone_stream):  # processing loop
+     if raw_bytes:
+         # decode the raw little-endian 16-bit PCM the VAD yields
+         sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1, endian="LITTLE",
+                                  samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
+         audio, _ = librosa.load(sf, sr=SAMPLING_RATE)
+         out.append(audio)
+         out_len += len(audio)
+
+     if (is_final or out_len >= min_sample_length) and out_len > 0:
+         a = np.concatenate(out)
+         online.insert_audio_chunk(a)
+
+         if out_len > min_sample_length:
+             o = online.process_iter()
+             print('-----' * 10)
+             complete_text = complete_text + o[2]
+             print('PARTIAL - ' + complete_text)  # do something with the current partial output
+             print('-----' * 10)
+             out = []
+             out_len = 0
+
+         if is_final:
+             o = online.finish()
+             online.init()
+             print('-----' * 10)
+             complete_text = complete_text + o[2]
+             print('FINAL - ' + complete_text)  # do something with the final output
+             print('-----' * 10)
+             out = []
+             out_len = 0
microphone_stream.py ADDED
@@ -0,0 +1,57 @@
+ ### mic stream
+
+
+ class MicrophoneStream:
+     def __init__(
+         self,
+         sample_rate: int = 16000,
+     ):
+         """
+         Creates a stream of audio from the microphone.
+
+         Args:
+             sample_rate: The sample rate to record audio at.
+         """
+         try:
+             import pyaudio
+         except ImportError:
+             raise ImportError('pyaudio is not installed')
+
+         self._pyaudio = pyaudio.PyAudio()
+         self.sample_rate = sample_rate
+
+         # read roughly 100 ms of audio per chunk
+         self._chunk_size = int(self.sample_rate * 0.1)
+         self._stream = self._pyaudio.open(
+             format=pyaudio.paInt16,
+             channels=1,
+             rate=sample_rate,
+             input=True,
+             frames_per_buffer=self._chunk_size,
+         )
+
+         self._open = True
+
+     def __iter__(self):
+         """Returns the iterator object."""
+         return self
+
+     def __next__(self):
+         """Reads a chunk of raw int16 audio from the microphone."""
+         if not self._open:
+             raise StopIteration
+
+         try:
+             return self._stream.read(self._chunk_size)
+         except KeyboardInterrupt:
+             raise StopIteration
+
+     def close(self):
+         """Closes the stream and releases the audio device."""
+         self._open = False
+
+         if self._stream.is_active():
+             self._stream.stop_stream()
+
+         self._stream.close()
+         self._pyaudio.terminate()
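
For reference, a minimal sketch of using MicrophoneStream on its own (assuming pyaudio can open the default input device); each iteration yields roughly 100 ms of raw int16 bytes until close() releases the device:

    from microphone_stream import MicrophoneStream

    stream = MicrophoneStream(sample_rate=16000)
    try:
        for chunk in stream:   # chunk is raw little-endian int16 PCM bytes
            pass               # hand the chunk to a consumer here
    except KeyboardInterrupt:
        pass
    finally:
        stream.close()         # stops and releases the audio device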
voice_activity_controller.py ADDED
@@ -0,0 +1,101 @@
+ import torch
+ import numpy as np
+
+
+ class VoiceActivityController:
+     def __init__(
+         self,
+         sampling_rate=16000,
+         second_ofSilence=0.5,
+         second_ofSpeech=0.25,
+         second_ofMinRecording=10,
+         use_vad_result=True,
+         activity_detected_callback=None,
+     ):
+         self.activity_detected_callback = activity_detected_callback
+         self.model, self.utils = torch.hub.load(
+             repo_or_dir='snakers4/silero-vad',
+             model='silero_vad'
+         )
+         (self.get_speech_timestamps,
+          save_audio,
+          read_audio,
+          VADIterator,
+          collect_chunks) = self.utils
+
+         self.sampling_rate = sampling_rate
+         self.silence_limit = second_ofSilence * self.sampling_rate
+         self.speech_limit = second_ofSpeech * self.sampling_rate
+         self.MIN_RECORDING_LENGTH = second_ofMinRecording * self.sampling_rate
+
+         self.use_vad_result = use_vad_result
+         self.vad_iterator = VADIterator(
+             model=self.model,
+             threshold=0.3,
+             sampling_rate=self.sampling_rate,
+             min_silence_duration_ms=500,
+             speech_pad_ms=400
+         )
+         self.last_marked_chunk = None
+
+     def int2float(self, sound):
+         """Convert int16 samples to float32 in [-1, 1]."""
+         abs_max = np.abs(sound).max()
+         sound = sound.astype('float32')
+         if abs_max > 0:
+             sound *= 1 / 32768
+         sound = sound.squeeze()  # depends on the use case
+         return sound
+
+     def apply_vad(self, audio):
+         """Run one VAD step. Returns (voice_audio, speech_samples, trailing_silence_samples)."""
+         audio_float32 = self.int2float(audio)
+         chunk = self.vad_iterator(audio_float32, return_seconds=False)
+
+         if chunk is not None:
+             if "start" in chunk:
+                 start = chunk["start"]
+                 self.last_marked_chunk = chunk
+                 return audio[start:] if self.use_vad_result else audio, len(audio) - start, 0
+
+             if "end" in chunk:
+                 # TODO: get the padding from the next chunk
+                 end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
+                 self.last_marked_chunk = chunk
+                 return audio[:end] if self.use_vad_result else audio, end, len(audio) - end
+
+         if self.last_marked_chunk is not None:
+             if "start" in self.last_marked_chunk:
+                 # still inside a speech segment
+                 return audio, len(audio), 0
+
+             if "end" in self.last_marked_chunk:
+                 # still inside a silence segment
+                 return np.array([], dtype=audio.dtype) if self.use_vad_result else audio, 0, len(audio)
+
+         return np.array([], dtype=audio.dtype) if self.use_vad_result else audio, 0, 0
+
+     def detect_user_speech(self, audio_stream, audio_in_int16=False):
+         """Yield (voice_audio_bytes, is_final) pairs for each block of the input stream."""
+         silence_len = 0
+         speech_len = 0
+
+         for data in audio_stream:
+             wav = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
+
+             is_final = False
+             voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)
+
+             if speech_in_wav > 0:
+                 silence_len = 0
+                 speech_len += speech_in_wav
+                 if self.activity_detected_callback is not None:
+                     self.activity_detected_callback()
+
+             silence_len = silence_len + last_silent_duration_in_wav
+             if silence_len >= self.silence_limit and speech_len >= self.speech_limit:
+                 is_final = True
+                 silence_len = 0
+                 speech_len = 0
+
+             yield voice_audio.tobytes(), is_final
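
Note that VoiceActivityController fetches the silero-vad model through torch.hub on first use, so the first run needs network access. A minimal sketch of using the controller standalone (the print calls are placeholders for application logic):

    from voice_activity_controller import VoiceActivityController
    from microphone_stream import MicrophoneStream

    vad = VoiceActivityController(
        second_ofSilence=0.5,     # this much trailing silence closes a phrase...
        second_ofSpeech=0.25,     # ...once at least this much speech was heard
        use_vad_result=True,      # yield only the voiced samples
        activity_detected_callback=lambda: print('speech detected'),
    )
    for voice_bytes, is_final in vad.detect_user_speech(MicrophoneStream()):
        if is_final:
            print('phrase complete')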