|
""" |
|
Speech-to-text module based on Vosk and Whisper for SillyTavern Extras |
|
- Vosk website: https://alphacephei.com/vosk/ |
|
- Vosk api: https://github.com/alphacep/vosk-api |
|
- Whisper github: https://github.com/openai/whisper |
|
|
|
Authors: |
|
- Tony Ribeiro (https://github.com/Tony-sama) |
|
|
|
Models are saved into user cache folder, example: C:/Users/toto/.cache/whisper and C:/Users/toto/.cache/vosk |
|
|
|
References: |
|
- Code adapted from: |
|
- whisper github: https://github.com/openai/whisper |
|
- oobabooga text-generation-webui github: https://github.com/oobabooga/text-generation-webui |
|
- vosk github: https://github.com/alphacep/vosk-api/blob/master/python/example/test_microphone.py |
|
""" |
|
from flask import jsonify, abort |
|
|
|
import queue |
|
import sys |
|
import sounddevice as sd |
|
import soundfile as sf |
|
import io |
|
import numpy as np |
|
from scipy.io.wavfile import write |
|
|
|
import vosk |
|
import whisper |
|
|
|
# Prefix added to every console log line emitted by this module.
DEBUG_PREFIX = "<stt streaming module>"

# File the full microphone capture is written to before the whisper pass.
RECORDING_FILE_PATH = "stt_test.wav"


# Module-level state. Both models stay None until initialized elsewhere —
# presumably a caller assigns the tuple returned by load_model(); confirm
# against the module that wires up this extension.
whisper_model = None

vosk_model = None

# sounddevice input device id; None selects the system default input device.
device = None
|
|
|
def load_model(file_path=None):
    """
    Load the whisper and vosk speech-to-text models.

    Whisper loads the "base.en" model by default, or the model named/located
    by file_path when one is given; vosk always loads its "en-us" model.
    Models are downloaded into the user cache folder on first use, example:
    C:/Users/toto/.cache/whisper and C:/Users/toto/.cache/vosk

    Returns:
        A (whisper_model, vosk_model) tuple.
    """
    whisper_model_name = "base.en" if file_path is None else file_path
    return (whisper.load_model(whisper_model_name), vosk.Model(lang="en-us"))
|
|
|
def convert_bytearray_to_wav_ndarray(input_bytearray: bytes, sampling_rate=16000):
    """
    Convert raw int16 PCM bytes into a decoded audio ndarray.

    The samples are wrapped in an in-memory WAV container via scipy (which
    rewinds file-like targets to position 0 after writing, so the immediate
    read() returns the whole container) and decoded back with soundfile.
    Used to produce a file for quality-check debugging.

    Args:
        input_bytearray: raw little-endian int16 PCM audio bytes.
        sampling_rate: sample rate in Hz to stamp into the WAV header.

    Returns:
        The soundfile-decoded audio data as an ndarray.
    """
    pcm_samples = np.frombuffer(input_bytearray, dtype=np.int16)
    wav_buffer = io.BytesIO(bytes())
    write(wav_buffer, sampling_rate, pcm_samples)
    wav_bytes = wav_buffer.read()
    decoded, _samplerate = sf.read(io.BytesIO(wav_bytes))
    return decoded
|
|
|
def record_and_transcript():
    """
    Continuously record from the microphone and transcribe voice.

    Vosk runs streaming recognition on the live audio; once it reports a
    completed, non-empty utterance the recording stops. The full capture is
    then saved to RECORDING_FILE_PATH and re-transcribed with whisper, whose
    result is the one returned.

    Returns:
        A flask JSON response {"transcript": str}, or "" when the whisper
        model has not been initialized yet.

    Aborts the request with HTTP 500 if any recording/transcription error occurs.
    """
    if whisper_model is None:
        print(DEBUG_PREFIX,"Whisper model not initialized yet.")
        return ""

    q = queue.Queue()
    stream_errors = list()

    def callback(indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
        if status:
            print(status, file=sys.stderr)
            stream_errors.append(status)
        q.put(bytes(indata))

    try:
        device_info = sd.query_devices(device, "input")

        # soundfile expects an int, sounddevice provides a float
        samplerate = int(device_info["default_samplerate"])

        print(DEBUG_PREFIX, "Start recording from:", device_info["name"], "with samplerate", samplerate)

        with sd.RawInputStream(samplerate=samplerate, blocksize=8000, device=device, dtype="int16", channels=1, callback=callback):

            rec = vosk.KaldiRecognizer(vosk_model, samplerate)
            full_recording = bytearray()
            while True:
                data = q.get()
                # Surface errors reported by the audio callback thread.
                if len(stream_errors) > 0:
                    raise Exception(DEBUG_PREFIX+" Stream errors: "+str(stream_errors))

                full_recording.extend(data)

                if rec.AcceptWaveform(data):
                    # Result() is a JSON string like '{\n  "text" : "..."\n}';
                    # slicing [14:-3] extracts the raw text value.
                    transcript = rec.Result()[14:-3]
                    print(DEBUG_PREFIX, "Transcripted from microphone stream (vosk):", transcript)
                    # BUGFIX: stop recording once vosk detected a complete
                    # utterance. Without this break the loop never exits and
                    # the whisper pass below is unreachable.
                    if transcript != "":
                        break

        # Save the whole capture so whisper can re-transcribe it (and for
        # quality-check debugging).
        output_file = convert_bytearray_to_wav_ndarray(input_bytearray=full_recording, sampling_rate=samplerate)
        sf.write(file=RECORDING_FILE_PATH, data=output_file, samplerate=samplerate)
        print(DEBUG_PREFIX, "Recorded message saved to", RECORDING_FILE_PATH)

        # Second pass with whisper: higher quality than the streaming vosk result.
        result = whisper_model.transcribe(RECORDING_FILE_PATH)
        transcript = result["text"]
        print(DEBUG_PREFIX, "Transcripted from audio file (whisper):", transcript)

        return jsonify({"transcript": transcript})

    except Exception as e:
        print(e)
        abort(500, DEBUG_PREFIX+" Exception occurs while recording")