import numpy as np
from src.utils import get_device
def get_transcriber(model="openai/whisper-base.en", use_gpu=True, gpu_id='auto'):
    if gpu_id == 'auto':
        gpu_id = 0
    device = get_device()
    if device == 'cpu' or not use_gpu:
        device_map = 'auto'  # resolves to CPU here; {"": 'cpu'} would also work
    else:
        # pin the whole model to the chosen GPU, else let CUDA decide
        device_map = {"": gpu_id} if gpu_id >= 0 else {"": 'cuda'}
    # lazy import, so transformers is only loaded when a transcriber is requested
    from transformers import pipeline
    transcriber = pipeline("automatic-speech-recognition", model=model, device_map=device_map)
    return transcriber
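
# Example (a sketch, not part of the module): build a CPU transcriber and run it
# once on a local file. `soundfile` and "sample.wav" are assumptions here for
# illustration; the pipeline accepts a {"sampling_rate": sr, "raw": float32 array} dict.
#
#   transcriber = get_transcriber(use_gpu=False)
#   import soundfile as sf
#   y, sr = sf.read("sample.wav", dtype="float32")
#   print(transcriber({"sampling_rate": sr, "raw": y})["text"])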
def transcribe(audio_state1, new_chunk, transcriber=None, max_chunks=None, sst_floor=100.0, reject_no_new_text=True,
               debug=False):
    # audio_state1 layout (as used below):
    #   [0] text accepted so far, [1] text returned to caller, [2] list of raw audio chunks, [3] 'on'/'off' stream state
    if audio_state1[0] is None:
        audio_state1[0] = ''
    if audio_state1[2] is None:
        audio_state1[2] = []
    if max_chunks is not None and len(audio_state1[2]) > max_chunks:
        # refuse to accumulate any further
        return audio_state1, audio_state1[1]
    if audio_state1[3] == 'off':
        if debug:
            print("Already ended", flush=True)
        return audio_state1, audio_state1[1]
    # assume the sampling rate is the same for every chunk
    # keep raw chunks so normalization happens over the whole stream rather than per chunk,
    # which would saturate noise-only periods up to the level of speech
    sr, y = new_chunk
    if y.shape[0] == 0:
        avg = 0.0
    else:
        # stereo to mono if needed; numpy takes axis=, not torch's dim=
        # (assumes channels on the last axis, i.e. (samples, channels) chunks)
        if len(y.shape) > 1:
            y = np.mean(y, axis=-1)
        avg = np.average(np.abs(y))
    if not np.isfinite(avg):
        avg = 0.0
    if avg > sst_floor:
        if debug:
            print("Got possible chunk: %s" % avg, flush=True)
        chunks_new = audio_state1[2] + [y]
    else:
        chunks_new = audio_state1[2]
        if debug:
            print("Rejected quiet chunk: %s" % avg, flush=True)
    if chunks_new:
        stream = np.concatenate(chunks_new).astype(np.float32)
        stream /= np.max(np.abs(stream)) + 1E-7  # normalize; epsilon avoids division by zero
        text = transcriber({"sampling_rate": sr, "raw": stream})["text"]
        if audio_state1[2]:
            # transcribe the prior chunks alone, to detect whether the new chunk added any text
            stream0 = np.concatenate(audio_state1[2]).astype(np.float32)
            stream0 /= np.max(np.abs(stream0)) + 1E-7
            text_y = transcriber({"sampling_rate": sr, "raw": stream0})["text"]
        else:
            text_y = None
        if debug:
            print("y.shape: %s stream.shape: %s text0=%s text=%s text_y=%s" % (
                str(y.shape), str(stream.shape), audio_state1[0], text, text_y))
        if reject_no_new_text and (text == text_y):
            if debug:
                print("Rejected non-textual chunk: %s" % avg, flush=True)
            # the chunk generated no new text, so reject it, e.g. keyboard noise that is
            # loud enough to pass the amplitude floor but contains no words
        else:
            audio_state1[2] = chunks_new
    else:
        text = ''
# print("H9: %s %s" % (audio_state1[0], text), flush=True)
# work-around race
if audio_state1[0] == text:
# print("H10: %s %s" % (audio_state1[0], text), flush=True)
text = ''
if audio_state1[0] is not None:
# For race, when action hits done while streaming occurs, to know now to use updated result
audio_state1[1] = audio_state1[0] + text
return audio_state1, audio_state1[1]
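
# Example (a sketch): drive transcribe() with streaming chunks, as a gradio
# Audio(streaming=True) callback would. `chunk_source` is a hypothetical generator
# of (sample_rate, np.ndarray) pairs; the state layout matches the indexing above.
#
#   transcriber = get_transcriber(use_gpu=False)
#   audio_state = [None, '', None, 'on']
#   for sr, y in chunk_source():
#       audio_state, text = transcribe(audio_state, (sr, y), transcriber=transcriber)
#       print(text)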