|  | """ | 
					
						
						|  | This is just an example of what people would submit for | 
					
						
						|  | inference. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | from s3prl.downstream.runner import Runner | 
					
						
						|  | from typing import Dict | 
					
						
						|  | import torch | 
					
						
						|  | import os | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | class PreTrainedModel(Runner): | 
					
						
						|  | def __init__(self, path=""): | 
					
						
						|  | """ | 
					
						
						|  | Initialize downstream model. | 
					
						
						|  | """ | 
					
						
						|  | ckp_file = os.path.join(path, "hubert_asr.ckpt") | 
					
						
						|  | ckp = torch.load(ckp_file, map_location='cpu') | 
					
						
						|  | ckp["Args"].init_ckpt = ckp_file | 
					
						
						|  | ckp["Args"].mode = "inference" | 
					
						
						|  | ckp["Args"].device = "cpu" | 
					
						
						|  | ckp["Config"]["downstream_expert"]["datarc"]["dict_path"]=os.path.join(path,'char.dict') | 
					
						
						|  |  | 
					
						
						|  | Runner.__init__(self, ckp["Args"], ckp["Config"]) | 
					
						
						|  |  | 
					
						
						|  | def __call__(self, inputs)-> Dict[str, str]: | 
					
						
						|  | """ | 
					
						
						|  | Args: | 
					
						
						|  | inputs (:obj:`np.array`): | 
					
						
						|  | The raw waveform of audio received. By default at 16KHz. | 
					
						
						|  | Return: | 
					
						
						|  | A :obj:`dict`:. The object return should be liked {"text": "XXX"} containing | 
					
						
						|  | the detected text from the input audio. | 
					
						
						|  | """ | 
					
						
						|  | for entry in self.all_entries: | 
					
						
						|  | entry.model.eval() | 
					
						
						|  |  | 
					
						
						|  | inputs = [torch.FloatTensor(inputs)] | 
					
						
						|  |  | 
					
						
						|  | with torch.no_grad(): | 
					
						
						|  | features = self.upstream.model(inputs) | 
					
						
						|  | features = self.featurizer.model(inputs, features) | 
					
						
						|  | preds = self.downstream.model.inference(features, []) | 
					
						
						|  | return {"text": preds[0]} | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | """ | 
					
						
						|  | import subprocess | 
					
						
						|  | import numpy as np | 
					
						
						|  | from datasets import load_dataset | 
					
						
						|  | # This is already done in the Inference API | 
					
						
						|  | def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array: | 
					
						
						|  | ar = f"{sampling_rate}" | 
					
						
						|  | ac = "1" | 
					
						
						|  | format_for_conversion = "f32le" | 
					
						
						|  | ffmpeg_command = [ | 
					
						
						|  | "ffmpeg", | 
					
						
						|  | "-i", | 
					
						
						|  | "pipe:0", | 
					
						
						|  | "-ac", | 
					
						
						|  | ac, | 
					
						
						|  | "-ar", | 
					
						
						|  | ar, | 
					
						
						|  | "-f", | 
					
						
						|  | format_for_conversion, | 
					
						
						|  | "-hide_banner", | 
					
						
						|  | "-loglevel", | 
					
						
						|  | "quiet", | 
					
						
						|  | "pipe:1", | 
					
						
						|  | ] | 
					
						
						|  |  | 
					
						
						|  | ffmpeg_process = subprocess.Popen( | 
					
						
						|  | ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE | 
					
						
						|  | ) | 
					
						
						|  | output_stream = ffmpeg_process.communicate(bpayload) | 
					
						
						|  | out_bytes = output_stream[0] | 
					
						
						|  |  | 
					
						
						|  | audio = np.frombuffer(out_bytes, np.float32).copy() | 
					
						
						|  | if audio.shape[0] == 0: | 
					
						
						|  | raise ValueError("Malformed soundfile") | 
					
						
						|  | return audio | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | model = PreTrainedModel() | 
					
						
						|  | ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") | 
					
						
						|  | filename = ds[0]["file"] | 
					
						
						|  | with open(filename, "rb") as f: | 
					
						
						|  | data = ffmpeg_read(f.read(), 16000) | 
					
						
						|  | print(model(data)) | 
					
						
						|  | """ |