Spaces:
Sleeping
Sleeping
import json | |
import os | |
from vosk import SetLogLevel, Model, KaldiRecognizer | |
import ray | |
SetLogLevel(-1)  # mutes vosk verbosity
class SpeechToTextVoskActor:
    """Incremental speech-to-text wrapper around a Vosk ``KaldiRecognizer``.

    Audio chunks are pushed in with :meth:`add_speech_bytes`; the recognised
    text is buffered and later drained with :meth:`get_text`.
    :meth:`process_speech` offers an unbuffered, one-shot alternative that
    also exposes the raw recognizer payload.
    """

    def __init__(self, model='small', audio_bit_rate=16000) -> None:
        """Load the Vosk model and create the recognizer.

        Args:
            model: Name of the model directory located under ``models/vosk``
                next to this source file.
            audio_bit_rate: Value passed to ``KaldiRecognizer`` as its second
                argument (Vosk documents that argument as the audio sample
                rate -- TODO confirm callers pass Hz).
        """
        self.model = model
        self.audio_bit_rate = audio_bit_rate

        # Resolve the model relative to this file, not the working directory.
        current_directory = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(
            current_directory, 'models', 'vosk', self.model
        )
        self.model_voice = Model(model_path)
        self.vosk = KaldiRecognizer(self.model_voice, self.audio_bit_rate)

        # Parallel queues: finished_queue[i] states whether text_queue[i] was
        # a final result (True) or only a partial one (False).
        self.text_queue: list[str] = []
        self.finished_queue: list[bool] = []

    def _recognize(self, data: bytearray) -> tuple[str, bool, dict]:
        """Feed one audio chunk to the recognizer and decode its JSON reply.

        Returns:
            ``(text, speaker_finished, result_json)``. When ``AcceptWaveform``
            reports a truthy value the final ``Result()`` is read and
            ``speaker_finished`` is True; otherwise the ``PartialResult()``
            is read and ``speaker_finished`` is False.
        """
        if self.vosk.AcceptWaveform(data):
            result_json = json.loads(self.vosk.Result())
            return result_json['text'], True, result_json
        result_json = json.loads(self.vosk.PartialResult())
        return result_json['partial'], False, result_json

    def process_speech(self, data: bytearray) -> tuple[str, bool, dict]:
        """Recognise one audio chunk without touching the internal queues.

        Args:
            data: Raw audio bytes handed straight to the recognizer.

        Returns:
            ``(text, speaker_finished, result_json)`` where ``result_json``
            is the decoded recognizer payload the text was taken from.
        """
        return self._recognize(data)

    def add_speech_bytes(self, data: bytearray):
        """Recognise one audio chunk and buffer the outcome for ``get_text``.

        A flag is recorded for every buffered text (not only for final
        results) so that ``get_text`` can tell exactly which entry closes an
        utterance.
        """
        text, speaker_finished = self._process_speech(data)
        self.text_queue.append(text)
        self.finished_queue.append(speaker_finished)

    def _process_speech(self, data: bytearray) -> tuple[str, bool]:
        """Recognise one audio chunk, returning ``(text, speaker_finished)``."""
        text, speaker_finished, _ = self._recognize(data)
        return text, speaker_finished

    def get_text(self):
        """Drain buffered text up to and including the first final result.

        Returns:
            ``(text, speaker_finished)``. ``text`` is the concatenation of
            the drained entries in arrival order. ``speaker_finished`` is
            True when the last drained entry was a final result; entries
            buffered after it stay queued for the next call. With nothing
            buffered the result is ``('', False)``.
        """
        speaker_finished = True in self.finished_queue
        if speaker_finished:
            # Stop right after the entry that closed the utterance.
            count = self.finished_queue.index(True) + 1
        else:
            count = len(self.text_queue)

        text = ''.join(self.text_queue[:count])
        # Remove the drained prefix from both queues to keep them aligned.
        del self.text_queue[:count]
        del self.finished_queue[:count]
        return text, speaker_finished

    def get_audio_bit_rate(self):
        """Return the ``audio_bit_rate`` value given at construction time."""
        return self.audio_bit_rate