File size: 2,417 Bytes
98ec0ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import json
import os
from collections import deque

import ray
from vosk import SetLogLevel, Model, KaldiRecognizer
# Mute Vosk's log output; this runs at import time, before any model is loaded.
SetLogLevel(-1)

@ray.remote
class SpeechToTextVoskActor:
    """Ray actor that transcribes streamed audio with a Vosk recognizer.

    Audio chunks are fed in through ``add_speech_bytes``; each recognition
    result is buffered and later drained with ``get_text``.
    ``process_speech`` is an unbuffered alternative that returns the result
    for a single chunk directly.
    """

    def __init__(self, model='small', audio_bit_rate=16000) -> None:
        """Load the Vosk model and create the recognizer.

        Args:
            model: Name of the sub-directory of ``models/vosk`` (located next
                to this file) that holds the Vosk model to load.
            audio_bit_rate: Value handed to ``KaldiRecognizer`` as its second
                argument and reported back by ``get_audio_bit_rate``.
        """
        self.model = model
        # NOTE(review): KaldiRecognizer's second argument is documented as a
        # sample rate in Hz; confirm callers pass that despite the
        # "bit rate" name.
        self.audio_bit_rate = audio_bit_rate

        # The model directory is resolved relative to this file so the actor
        # does not depend on the worker's current working directory.
        current_file_path = os.path.abspath(__file__)
        current_directory = os.path.dirname(current_file_path)
        _path = os.path.join(current_directory, 'models', 'vosk', self.model)
        self.model_voice = Model(_path)
        self.vosk = KaldiRecognizer(self.model_voice, self.audio_bit_rate)

        # FIFO of (text, speaker_finished) pairs. Keeping the flag next to the
        # text it belongs to guarantees get_text never attaches a "finished"
        # marker to the wrong chunk.
        self.text_queue = deque()

    def process_speech(self, data: bytearray) -> tuple[str, bool]:
        """Transcribe one audio chunk and return the result immediately.

        Unlike ``add_speech_bytes`` nothing is buffered; the recognizer state
        is still advanced by the chunk.

        Args:
            data: Raw audio bytes to feed to the recognizer.

        Returns:
            ``(text, speaker_finished)`` as described in ``_process_speech``.
        """
        return self._process_speech(data)

    def add_speech_bytes(self, data: bytearray) -> None:
        """Transcribe one audio chunk and buffer the result for ``get_text``.

        Args:
            data: Raw audio bytes to feed to the recognizer.
        """
        self.text_queue.append(self._process_speech(data))

    def _process_speech(self, data: bytearray) -> tuple[str, bool]:
        """Feed ``data`` to the recognizer and read back its current result.

        Args:
            data: Raw audio bytes to feed to the recognizer.

        Returns:
            A ``(text, speaker_finished)`` tuple. When ``AcceptWaveform``
            returns a truthy value the text is the ``'text'`` field of
            ``Result()`` and the flag is ``True``; otherwise the text is the
            ``'partial'`` field of ``PartialResult()`` and the flag is
            ``False``.
        """
        if self.vosk.AcceptWaveform(data):
            return json.loads(self.vosk.Result())['text'], True
        return json.loads(self.vosk.PartialResult())['partial'], False

    def get_text(self) -> tuple[str, bool]:
        """Drain buffered results up to and including the next final one.

        Buffered texts are concatenated in arrival order. Draining stops right
        after the first chunk whose ``speaker_finished`` flag is set, so later
        chunks stay queued for the next call.

        Returns:
            ``(text, speaker_finished)``; ``('', False)`` when nothing is
            buffered. ``speaker_finished`` is ``True`` only if the returned
            text ends with a final recognizer result.
        """
        # NOTE(review): Vosk partial results appear to be cumulative within an
        # utterance, so concatenating several queued partials may repeat
        # words. Confirm whether callers would rather get only the latest.
        pieces = []
        speaker_finished = False
        while self.text_queue and not speaker_finished:
            text, speaker_finished = self.text_queue.popleft()
            pieces.append(text)
        return ''.join(pieces), speaker_finished

    def get_audio_bit_rate(self):
        """Return the ``audio_bit_rate`` value the actor was created with."""
        return self.audio_bit_rate