Spaces:

jts-ai-team
/

homeshopping

Paused

App Files Files Community

homeshopping / backend /asr.py

jts-ai-team

Upload 7 files

abb09c3 verified about 1 month ago

raw

history blame contribute delete

3.53 kB

	"""Speech-to-text utilities with graceful fallbacks."""

	from __future__ import annotations

	import numpy as np

	from backend.utils import device

	# Optional heavy dependencies: every import below is guarded so this module
	# can still be imported, and fall back to another backend, when one is missing.
	try:
	    import nemo.collections.asr as nemo_asr
	except ImportError:  # NeMo is only used by the Typhoon model initializer
	    nemo_asr = None  # type: ignore

	try:
	    import torch
	    from transformers import pipeline
	except ModuleNotFoundError:  # PyTorch or transformers not available on Python 3.13 wheels
	    torch = None  # type: ignore
	    pipeline = None  # type: ignore

	try:
	    from google.cloud import speech
	except ModuleNotFoundError:
	    speech = None  # type: ignore


	# Callable invoked by _transcribe_with_pipeline().
	# NOTE(review): nothing in the visible source assigns a non-None value here,
	# so that helper cannot succeed unless another module sets it — confirm.
	_ASR_PIPELINE = None


	def _huggingface_device() -> int | str | None:
	    """Map the project-wide ``device`` setting to a device identifier.

	    Presumably meant for the ``device=`` argument of a Hugging Face
	    pipeline; it is not called anywhere in the visible source.

	    Returns:
	        ``0`` when ``device`` equals ``"cuda"``, ``"mps"`` when it equals
	        ``"mps"``, and ``None`` for any other value.
	    """
	    # Checked in order: CUDA first, then Apple MPS.
	    for backend_name, hf_device in (("cuda", 0), ("mps", "mps")):
	        if device == backend_name:
	            return hf_device
	    return None


	def _initialize_typhoon_pipeline():
	    """Load the Typhoon real-time ASR model through NeMo.

	    Returns:
	        The model returned by ``ASRModel.from_pretrained``, or ``None`` when
	        PyTorch, transformers or NeMo could not be imported.
	    """
	    if torch is None or pipeline is None or nemo_asr is None:
	        return None
	    # Pick an accelerator that is actually present. The previous code fell
	    # back to "mps" whenever CUDA was missing, which fails on CPU-only hosts.
	    # A distinct local name also avoids shadowing the imported ``device``.
	    mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch
	    if torch.cuda.is_available():
	        target_device = "cuda"
	    elif mps_backend is not None and mps_backend.is_available():
	        target_device = "mps"
	    else:
	        target_device = "cpu"
	    print(f"Using device: {target_device}")
	    print("Initializing Typhoon ASR pipeline...")
	    asr_model = nemo_asr.models.ASRModel.from_pretrained(
	        model_name="scb10x/typhoon-asr-realtime",
	        map_location=target_device,
	    )
	    print("Typhoon ASR pipeline initialized.")
	    return asr_model

	def _initialize_whisper_pipeline():
	    """Build the Pathumma Whisper speech-recognition pipeline for Thai.

	    Returns:
	        A ``transformers`` automatic-speech-recognition pipeline whose decoder
	        prompt is pinned to Thai transcription, or ``None`` when PyTorch or
	        transformers could not be imported.
	    """
	    # This function runs at module import time. Without this guard a missing
	    # optional dependency would crash the import instead of letting
	    # transcribe_audio() fall through to the Google backend.
	    if torch is None or pipeline is None:
	        return None
	    pipe = pipeline(
	        task="automatic-speech-recognition",
	        model="nectec/Pathumma-whisper-th-medium",
	        chunk_length_s=30,
	        device=device,
	        model_kwargs={"torch_dtype": torch.bfloat16},
	    )
	    # Pin the decoder prompt to language "th" and task "transcribe".
	    pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
	        language="th",
	        task="transcribe",
	    )
	    return pipe
	# Module-level model handles, set once when this module is imported.
	# The Typhoon model is currently disabled: its handle stays None and the
	# initializer call below is left commented out.
	_ASR_TYPHOON = None
	# _ASR_TYPHOON = _initialize_typhoon_pipeline()
	# The Whisper pipeline is built here, as a side effect of importing this module.
	_ASR_WHISPER = _initialize_whisper_pipeline()


	def _transcribe_with_pipeline(audio_array: np.ndarray) -> str:
	    """Transcribe audio with the module-level ``_ASR_PIPELINE`` callable.

	    Args:
	        audio_array: Audio samples, passed to the pipeline unchanged.

	    Returns:
	        The recognized text with every "ทางลัด" replaced by "ทางรัฐ"
	        (presumably a recurring mis-recognition — confirm with the authors).

	    Raises:
	        RuntimeError: If ``_ASR_PIPELINE`` has not been set.
	    """
	    if _ASR_PIPELINE is None:
	        # Fail with a clear message instead of the opaque
	        # "'NoneType' object is not callable" TypeError.
	        raise RuntimeError("ASR pipeline is not initialized")
	    output = _ASR_PIPELINE(audio_array)
	    # A dict result carries the text under "text"; anything else is stringified.
	    text = output.get("text", "") if isinstance(output, dict) else str(output)
	    return text.replace("ทางลัด", "ทางรัฐ")


	def _transcribe_with_google(audio_array: np.ndarray) -> str:
	    """Transcribe audio with Google Cloud Speech-to-Text.

	    Args:
	        audio_array: Samples to recognize. NOTE(review): assumed to be mono
	            floats in [-1.0, 1.0] sampled at 16 kHz, matching the LINEAR16 /
	            16000 Hz request config below — confirm against callers.

	    Returns:
	        The first alternative of every result, joined with single spaces
	        (``""`` when the service returns no results).

	    Raises:
	        RuntimeError: If ``google-cloud-speech`` is not installed.
	    """
	    if speech is None:
	        raise RuntimeError("google-cloud-speech is not available")

	    # Clip before scaling: a sample outside [-1, 1] would otherwise overflow
	    # the int16 range and wrap around.
	    int16_audio = (np.clip(audio_array, -1.0, 1.0) * 32767.0).astype(np.int16)
	    audio_bytes = int16_audio.tobytes()

	    client = speech.SpeechClient()
	    audio_config = speech.RecognitionConfig(
	        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
	        sample_rate_hertz=16000,
	        language_code="th-TH",
	        alternative_language_codes=["en-US"],
	        model="telephony",
	    )
	    audio_data = speech.RecognitionAudio(content=audio_bytes)
	    response = client.recognize(config=audio_config, audio=audio_data)
	    # Skip results that carry no alternatives instead of raising IndexError.
	    return " ".join(
	        result.alternatives[0].transcript
	        for result in response.results
	        if result.alternatives
	    )


	def transcribe_audio(audio_array: np.ndarray) -> str:
	    """Transcribe user audio with the best available backend.

	    The Whisper pipeline is tried first when it has been loaded. If it is
	    unavailable or raises, Google Cloud Speech is used instead. Backend
	    failures are printed and never propagated to the caller.

	    Args:
	        audio_array: Audio samples. ``None``, empty, or all-zero input is
	            treated as silence.

	    Returns:
	        The transcription, or ``""`` for silent input or when every backend
	        fails.
	    """
	    if audio_array is None or not np.any(audio_array):
	        return ""
	    # The Typhoon backend is disabled at module level, so it is not tried here.
	    if _ASR_WHISPER is not None:
	        try:
	            return _ASR_WHISPER(audio_array)["text"]
	        except Exception as exc:  # best effort: fall through to the next backend
	            print(f"Whisper ASR pipeline failed: {exc}")
	    try:
	        return _transcribe_with_google(audio_array)
	    except Exception as exc:  # last backend: report and return silence
	        print(f"ASR fallback failed: {exc}")
	        return ""