osanseviero
/

hubert_s3prl

Automatic Speech Recognition superb benchmark:superb

Model card Files Files and versions Community

hubert_s3prl / model.py

osanseviero's picture

osanseviero HF staff

Add model

e4f0af6 almost 3 years ago

raw history blame

No virus

2.49 kB

	"""
	This is just an example of what people would submit for
	inference.
	"""

	from s3prl.downstream.runner import Runner
	from typing import Dict
	import torch
	from datasets import load_dataset


	class PreTrainedModel(Runner):
	def __init__(self):
	"""
	Loads model and tokenizer from local directory
	"""
	ckp_file = "hubert_asr.ckpt"
	ckp = torch.load(ckp_file, map_location='cpu')
	ckp["Args"].init_ckpt = ckp_file
	ckp["Args"].mode = "inference"
	ckp["Args"].device = "cpu" # Just to try in my computer
	ckp["Config"]["downstream_expert"]["datarc"]["dict_path"]='char.dict'

	Runner.__init__(self, ckp["Args"], ckp["Config"])

	def __call__(self, inputs)-> Dict[str, str]:
	"""
	Args:
	inputs (:obj:`np.array`):
	The raw waveform of audio received. By default at 16KHz.
	Return:
	A :obj:`dict`:. The object return should be liked {"text": "XXX"} containing
	the detected text from the input audio.
	"""
	for entry in self.all_entries:
	entry.model.eval()

	inputs = [torch.FloatTensor(inputs)]

	with torch.no_grad():
	features = self.upstream.model(inputs)
	features = self.featurizer.model(inputs, features)
	preds = self.downstream.model.inference(features, [])
	return preds[0]


	"""
	import subprocess
	import numpy as np
	# This is already done in the Inference API
	def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
	ar = f"{sampling_rate}"
	ac = "1"
	format_for_conversion = "f32le"
	ffmpeg_command = [
	"ffmpeg",
	"-i",
	"pipe:0",
	"-ac",
	ac,
	"-ar",
	ar,
	"-f",
	format_for_conversion,
	"-hide_banner",
	"-loglevel",
	"quiet",
	"pipe:1",
	]

	ffmpeg_process = subprocess.Popen(
	ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
	)
	output_stream = ffmpeg_process.communicate(bpayload)
	out_bytes = output_stream[0]

	audio = np.frombuffer(out_bytes, np.float32).copy()
	if audio.shape[0] == 0:
	raise ValueError("Malformed soundfile")
	return audio


	model = PreTrainedModel()
	ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
	filename = ds[0]["file"]
	with open(filename, "rb") as f:
	data = ffmpeg_read(f.read(), 16000)
	print(model(data))
	"""