Spaces:
Sleeping
Sleeping
import ffmpeg | |
import numpy as np | |
# import whisper | |
# model = whisper.load_model("base") | |
def load_audio(file: "str | bytes", sr: int = 16000) -> "np.ndarray":
    """
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str or bytes
        The audio file to open (passed to ffmpeg as its input), or the raw
        bytes of an encoded audio file
    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A one-dimensional NumPy array containing the audio waveform, in float32
    dtype, scaled from 16-bit PCM into the range [-1.0, 1.0).

    Raises
    ------
    RuntimeError
        If ffmpeg fails; the message carries ffmpeg's stderr output.
    """
    if isinstance(file, bytes):
        # In-memory audio is fed to ffmpeg through stdin instead of a path.
        inp = file
        file = 'pipe:'
    else:
        inp = None

    try:
        # Ask ffmpeg for raw signed 16-bit little-endian PCM, single channel,
        # at `sr` Hz, written to stdout ("-") so it can be captured here.
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
            .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=inp)
        )
    except ffmpeg.Error as e:
        # ffmpeg's stderr is not guaranteed to be valid UTF-8 (e.g. file names
        # in another encoding); never let a decode failure hide the real error.
        stderr = e.stderr.decode(errors="replace") if e.stderr else ""
        raise RuntimeError(f"Failed to load audio: {stderr}") from e

    # frombuffer already yields a 1-D array; astype() makes the writable
    # float32 copy, and dividing by 2**15 normalises the int16 samples.
    return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
def stt_client(audio_data):
    """
    Speech-to-text entry point; currently a stub.

    Parameters
    ----------
    audio_data
        The audio to transcribe. It is accepted but not used while the
        whisper-based implementation below stays disabled.

    Returns
    -------
    An empty string, always.
    """
    # Disabled whisper implementation, kept for reference:
    #   audio = whisper.pad_or_trim(load_audio(audio_data))
    #   mel = whisper.log_mel_spectrogram(audio).to(model.device)
    #   options = whisper.DecodingOptions(fp16=False)
    #   result = whisper.decode(model, mel, options)
    #   return result.text
    transcript = ""
    return transcript