File size: 1,897 Bytes
4ee5676 3edbfec e18fb9d a677076 54f1c88 b12e8e8 4ee5676 13a15d8 2ba318d a677076 5b8ef5f a677076 03d44ec e18fb9d 03d44ec 4ee5676 847a572 4ee5676 6b2a0ad 13a15d8 4ee5676 cbd5d1a 4ee5676 13a15d8 cbd5d1a 4ee5676 847a572 a677076 95bbb32 13a15d8 4ee5676 cbd5d1a 95bbb32 ab121f4 6d73c34 03d44ec 6d73c34 ab121f4 95bbb32 a677076 ab121f4 2ba318d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import librosa
import numpy as np
from .init import pipe
# Task name forwarded to the pipeline through generate_kwargs["task"].
TASK: str = "transcribe"

# Value handed to the pipeline's batch_size argument.
BATCH_SIZE: int = 8

# Multiplied by SAMPLING_RATE to cap how many samples are kept per request
# (presumably a duration in seconds -- confirm with the author).
LIMIT: int = 60

# Sample rate used when computing that cap (presumably Hz).
SAMPLING_RATE: int = 16_000
class A2T:
    """Transcribe a recorded audio clip to text with the module-level ``pipe``.

    ``mic`` must expose ``get_array_of_samples()`` and ``frame_rate``; an
    optional ``channels`` attribute is honoured when present.
    NOTE(review): this looks like a pydub ``AudioSegment`` -- confirm.
    """

    def __init__(self, mic):
        """Store the audio source; ``None`` is accepted and reported by ``predict``."""
        self.mic = mic

    def __transcribe(self, inputs, task: "str | None" = None):
        """Run the speech-recognition pipeline and return its ``"text"`` field.

        Args:
            inputs: Pipeline input (here a dict with ``"array"`` and
                ``"sampling_rate"`` keys).
            task: Value forwarded as ``generate_kwargs["task"]``.

        Raises:
            ValueError: If ``inputs`` is ``None``.
        """
        if inputs is None:
            # Fail fast instead of logging and then calling the pipeline
            # with None anyway.
            raise ValueError("Inputs None")
        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]

    # NOTE: the misspelled name is kept on purpose so any existing reference
    # to the (name-mangled) method keeps working.
    def __preprocces(self, raw: np.ndarray, sampling_rate: int):
        """Convert raw integer samples to mono float32 audio at SAMPLING_RATE.

        Args:
            raw: Integer samples, either 1-D or ``(frames, channels)``.
                Scaling by 32768 assumes 16-bit PCM -- TODO confirm the
                source never delivers another sample width.
            sampling_rate: Sample rate of ``raw``. A falsy value skips
                resampling and the data is treated as already being at
                ``SAMPLING_RATE``.

        Returns:
            1-D float32 array holding at most ``SAMPLING_RATE * LIMIT`` samples.
        """
        chunk = raw.astype(np.float32, order="C") / 32768.0
        if chunk.ndim > 1:
            # librosa expects (channels, frames), hence the transpose.
            chunk = librosa.to_mono(chunk.T)
        if sampling_rate and sampling_rate != SAMPLING_RATE:
            # Trim at the source rate first so we never resample audio that
            # would be discarded afterwards.
            chunk = chunk[: int(sampling_rate) * LIMIT]
            chunk = librosa.resample(
                chunk, orig_sr=sampling_rate, target_sr=SAMPLING_RATE
            )
        return chunk[: SAMPLING_RATE * LIMIT]

    def predict(self):
        """Transcribe the stored audio.

        Returns:
            The transcribed text, or a string starting with
            ``"Oops some kinda error : "`` describing what went wrong.
            Errors are reported in-band rather than raised.
        """
        try:
            if self.mic is None:
                raise ValueError("please provide audio")

            samples = np.array(self.mic.get_array_of_samples(), dtype=np.int16)
            if samples.size == 0:
                raise ValueError("audio is empty")
            sampling_rate = self.mic.frame_rate

            # NOTE(review): assumes multi-channel sources deliver interleaved
            # samples (as pydub does) -- confirm. Without a usable `channels`
            # attribute the data is treated as mono, as before.
            channels = getattr(self.mic, "channels", 1)
            if (
                isinstance(channels, int)
                and channels > 1
                and samples.ndim == 1
                and samples.size % channels == 0
            ):
                samples = samples.reshape(-1, channels)

            audio = self.__preprocces(raw=samples, sampling_rate=sampling_rate)
            if not isinstance(audio, np.ndarray):
                raise TypeError("Audio is not np array")
            print(
                f"audio shape : {audio.shape} \n"
                f" sampling rate : {sampling_rate} -> {SAMPLING_RATE}"
            )

            # Pass the preprocessed audio (the original referenced an unbound
            # name here) and declare the rate the array really has.
            inputs = {"array": audio, "sampling_rate": SAMPLING_RATE}
            return self.__transcribe(inputs=inputs, task=TASK)
        except Exception as e:
            return f"Oops some kinda error : {e}"
|