Blaxzter
/

whisper-large-v2-inference-endpoint

Automatic Speech Recognition

hf-asr-leaderboard

Inference Endpoints

Model card Files Files and versions Community

whisper-large-v2-inference-endpoint / handler.py

Blaxzter's picture

Upload handler.py

a8eab90 12 months ago

2.71 kB

	import base64
	import json
	import os
	from io import StringIO
	from typing import Dict, Any

	from transformers import pipeline


	class EndpointHandler:
	    """Callable request handler that transcribes audio with an ASR pipeline.

	    The pipeline is built once in ``__init__`` and reused for every request
	    handled through ``__call__``.
	    """

	    def __init__(self, asr_model_path: str = "./whisper-large-v2"):
	        """Create the automatic-speech-recognition pipeline.

	        Args:
	            asr_model_path: Value handed to ``pipeline`` as ``model``.
	        """
	        self.asr_pipeline = pipeline(
	            "automatic-speech-recognition",
	            model=asr_model_path,
	        )

	    @staticmethod
	    def _language_token(language: str) -> str:
	        """Return *language* as a ``<|xx|>`` token, wrapping bare codes."""
	        if language.startswith("<|") and language.endswith("|>"):
	            return language
	        return f"<|{language}|>"

	    def __call__(self, data: Dict[str, Any]) -> str:
	        """Transcribe the audio contained in a request.

	        Args:
	            data: The request payload. Either an already parsed mapping or a
	                JSON document (``str``/``bytes``) that decodes to one. It must
	                contain ``"audio_data"`` (raw audio, or a base64 string of
	                it) and may contain ``"language"`` (a bare code such as
	                ``"de"`` or a full ``"<|de|>"`` token). When no language is
	                given, none is forwarded to the pipeline.

	        Returns:
	            The pipeline result serialized as a JSON string.

	        Raises:
	            ValueError: If the payload has no top-level ``"audio_data"`` key.
	        """
	        # Accept both a parsed mapping (what the annotation promises) and a
	        # raw JSON document, so either calling convention works.
	        if isinstance(data, (str, bytes, bytearray)):
	            request = json.loads(data)
	        else:
	            request = data
	        if not isinstance(request, dict) or "audio_data" not in request:
	            raise ValueError("Request must contain a top-level key named 'audio_data'")

	        # Decode the audio if it was transported as a base64 string.
	        audio_data = request["audio_data"]
	        if isinstance(audio_data, str):
	            audio_data = base64.b64decode(audio_data)

	        # Forward the requested language instead of a fixed placeholder; the
	        # key is optional so a request without it no longer fails.
	        generate_kwargs = {"task": "transcribe"}
	        language = request.get("language")
	        if language:
	            generate_kwargs["language"] = self._language_token(language)

	        # NOTE(review): max_length and max_new_tokens are both passed through
	        # unchanged; confirm the installed transformers version accepts both.
	        transcription = self.asr_pipeline(
	            audio_data,
	            return_timestamps=False,
	            chunk_length_s=30,
	            batch_size=8,
	            max_length=10000,
	            max_new_tokens=10000,
	            generate_kwargs=generate_kwargs,
	        )

	        return json.dumps(transcription)

	def init():
	    """Build the speech-recognition pipeline and publish it as a module global.

	    The model location is read from the ``AZUREML_MODEL_DIR`` environment
	    variable, falling back to ``./whisper-large-v2`` when it is unset. The
	    resulting pipeline is stored in the module-level ``asr_pipeline`` name.
	    """
	    global asr_pipeline

	    # Resolve where the model lives before constructing the pipeline.
	    model_dir = os.environ.get("AZUREML_MODEL_DIR", "./whisper-large-v2")
	    asr_pipeline = pipeline("automatic-speech-recognition", model=model_dir)


	def run(raw_data):
	    """Transcribe the audio carried by a JSON request.

	    Uses the module-level ``asr_pipeline`` global, so ``init()`` must have
	    been called beforehand.

	    Args:
	        raw_data: JSON document that decodes to a mapping. It must contain
	            ``"audio_data"`` (raw audio, or a base64 string of it) and may
	            contain ``"language"`` (a bare code such as ``"de"`` or a full
	            ``"<|de|>"`` token). German is used when no language is given.

	    Returns:
	        The pipeline result serialized as a JSON string.

	    Raises:
	        ValueError: If the request has no top-level ``"audio_data"`` key.
	    """
	    json_data = json.loads(raw_data)
	    if not isinstance(json_data, dict) or "audio_data" not in json_data:
	        raise ValueError("Request must contain a top level key named 'audio_data'")

	    # Decode the audio if it was transported as a base64 string.
	    audio_data = json_data["audio_data"]
	    if isinstance(audio_data, str):
	        audio_data = base64.b64decode(audio_data)

	    # Build a well-formed ``<|xx|>`` language token; German stays the default
	    # so existing callers that send no language see the same language.
	    language = json_data.get("language") or "de"
	    if not (language.startswith("<|") and language.endswith("|>")):
	        language = f"<|{language}|>"

	    transcription = asr_pipeline(
	        audio_data,
	        return_timestamps=False,
	        chunk_length_s=30,
	        batch_size=8,
	        max_new_tokens=1000,
	        generate_kwargs={"task": "transcribe", "language": language},
	    )

	    return json.dumps(transcription)