from typing import Dict

import numpy as np
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    AutoTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
)


class PreTrainedModel:
    def __init__(self, path: str):
        """
        Loads the model, tokenizer, and feature extractor from a local
        directory and assembles them into an ASR pipeline.
        """
        model = Wav2Vec2ForCTC.from_pretrained(path)
        tokenizer = AutoTokenizer.from_pretrained(path)
        extractor = Wav2Vec2FeatureExtractor.from_pretrained(path)

        self.model = AutomaticSpeechRecognitionPipeline(
            model=model, feature_extractor=extractor, tokenizer=tokenizer
        )

    def __call__(self, inputs: np.ndarray) -> Dict[str, str]:
        """
        Args:
            inputs (:obj:`np.ndarray`):
                The raw waveform of the received audio, sampled at 16 kHz by default.
        Return:
            A :obj:`dict` of the form {"text": "XXX"} containing the text
            detected in the input audio.
        """
        return self.model(inputs)


""" |
|
# Just an example using this. |
|
import subprocess |
|
from datasets import load_dataset |
|
|
|
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array: |
|
ar = f"{sampling_rate}" |
|
ac = "1" |
|
format_for_conversion = "f32le" |
|
ffmpeg_command = [ |
|
"ffmpeg", |
|
"-i", |
|
"pipe:0", |
|
"-ac", |
|
ac, |
|
"-ar", |
|
ar, |
|
"-f", |
|
format_for_conversion, |
|
"-hide_banner", |
|
"-loglevel", |
|
"quiet", |
|
"pipe:1", |
|
] |
|
|
|
ffmpeg_process = subprocess.Popen( |
|
ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE |
|
) |
|
output_stream = ffmpeg_process.communicate(bpayload) |
|
out_bytes = output_stream[0] |
|
|
|
audio = np.frombuffer(out_bytes, np.float32).copy() |
|
if audio.shape[0] == 0: |
|
raise ValueError("Malformed soundfile") |
|
return audio |
|
|
|
model = PreTrainedModel() |
|
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") |
|
filename = ds[0]["file"] |
|
with open(filename, "rb") as f: |
|
data = ffmpeg_read(f.read(), 16000) |
|
print(model(data)) |
|
""" |