# streaming-speech-translation / src/asr/cache_aware_modules.py
# Commit 0c397a9 by pltobing: "Formatting black, isort, flake8"
#!/usr/bin/env python3
# License: CC-BY-NC-ND-4.0
# Created by: Patrick Lumbantobing, Vertox-AI
# Copyright (c) 2026 Vertox-AI. All rights reserved.
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-NoDerivatives 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-nd/4.0/
"""
Cache-aware streaming audio and feature buffers for Nemotron ASR.
Adapted from: https://github.com/NVIDIA-NeMo/NeMo/tree/main
Implements:
- :class:`CacheAwareStreamingAudioBuffer` for audio → feature chunks
compatible with NeMo cache-aware encoders.
- :class:`CacheAwareStreamingASR` for encoder/decoder state management,
hypothesis accumulation, and timestamped text output.
"""
from __future__ import annotations
import re
from collections.abc import Iterable
from typing import Generator, List, Optional
import numpy as np
import numpy.typing as npt
from src.asr.cache_aware_modules_config import (CacheAwareStreamingConfig,
TimestampedResult)
from src.asr.utils import log_softmax
# Epsilon added before np.log(...) when building all-silence feature padding,
# so padded frames map to a finite constant (log(2**-24) ~= -16.64) instead
# of -inf. Matches the log-zero guard used by log-mel frontends.
LOG_ZERO_GUARD_VALUE = float(2**-24)
class CacheAwareStreamingAudioBuffer:
    """
    Streaming audio and feature buffer for cache-aware ASR.

    Handles:
    - Chunking raw audio into overlapping frames for the preprocessor.
    - Dropping padded STFT frames after the first chunk.
    - Maintaining a feature buffer with the pre-encode cache prepended.

    Both ``process_*`` generators yield ready chunks and, when called with
    ``last=True``, always terminate with a ``None`` sentinel after resetting
    the corresponding buffer.
    """

    def __init__(self, preprocessor, streaming_cfg: CacheAwareStreamingConfig) -> None:
        """
        Parameters
        ----------
        preprocessor :
            Callable that maps ``(waveforms, lengths)`` to
            ``(features, feature_lengths)``.
        streaming_cfg :
            Cache-aware streaming configuration.
        """
        self._preprocessor = preprocessor
        self._streaming_cfg = streaming_cfg
        # Raw audio of shape (1, samples); None while empty.
        self.audio_buffer: Optional[npt.NDArray[np.float32]] = None
        # Number of samples already consumed from the start of the stream.
        self.audio_step: int = 0
        # Features of shape (1, input_features, frames); None while empty.
        self.features_buffer: Optional[npt.NDArray[np.float32]] = None
        # Samples needed for one preprocessor chunk; kept as a 1-element
        # int64 array because it is passed straight through as the
        # preprocessor length input.
        self._audio_chunks_lens = np.array(
            [self._streaming_cfg.audio_chunk_frames * self._streaming_cfg.audio_frame_size],
            dtype=np.int64,
        )
        # Hop in samples: dropped from the buffer front after each chunk.
        self._audio_frames_drops_lens = (
            self._streaming_cfg.audio_chunk_frames_drop * self._streaming_cfg.audio_frame_size
        )
        # Keep one frame fewer than the chunk produces: the final STFT frame
        # of each chunk is computed over padding and is discarded.
        self._features_frames_takes_lens = self._streaming_cfg.audio_chunk_frames - 1
        self._chunk_size = self._streaming_cfg.chunk_size[1]
        self._shift_size = self._streaming_cfg.shift_size[1]
        self._pre_encode_cache_size = self._streaming_cfg.pre_encode_cache_size[1]
        # Encoder input width = pre-encode cache frames + fresh chunk frames.
        self._cache_chunk_size = self._pre_encode_cache_size + self._chunk_size
        # NOTE: the unused leftovers `_features_chunk_lengths` and
        # `_current_text` were removed; both belong to CacheAwareStreamingASR.
        # Initial pre-encode cache: the log of the zero-guard epsilon, i.e.
        # what a log-mel frontend emits for pure silence/padding.
        self._first_cache_pre_encode = np.log(
            np.zeros(
                (1, self._streaming_cfg.input_features, self._pre_encode_cache_size),
                dtype=np.float32,
            )
            + LOG_ZERO_GUARD_VALUE
        )

    def len_audio_buffer(self) -> int:
        """Return current audio buffer length (samples)."""
        return int(self.audio_buffer.shape[-1]) if self.audio_buffer is not None else 0

    def len_features_buffer(self) -> int:
        """Return current feature buffer length (frames)."""
        return int(self.features_buffer.shape[-1]) if self.features_buffer is not None else 0

    def reset_buffers(self) -> None:
        """Reset both audio and feature buffers."""
        self.reset_audio_buffer()
        self.reset_features_buffer()

    def reset_audio_buffer(self) -> None:
        """Reset audio buffer and step counter."""
        self.audio_buffer = None
        self.audio_step = 0

    def reset_features_buffer(self) -> None:
        """Reset feature buffer."""
        self.features_buffer = None

    def append_audio_buffer(self, audio_signal: npt.NDArray[np.float32]) -> None:
        """Append new audio samples (shape ``(1, samples)``) to the buffer."""
        if self.audio_buffer is None:
            # Enforce float32 on the first append too, so the preprocessor
            # always sees the declared dtype (no copy if already float32).
            self.audio_buffer = np.asarray(audio_signal, dtype=np.float32)
        else:
            self.audio_buffer = np.concatenate((self.audio_buffer, audio_signal), axis=-1).astype(np.float32)

    def process_audio_buffer(
        self,
        last: bool = False,
    ) -> Generator[Optional[npt.NDArray[np.float32]], None, None]:
        """
        Convert buffered audio into feature chunks.

        Parameters
        ----------
        last :
            When True, flush the remaining tail (zero-padded to a full
            chunk) and terminate with a ``None`` sentinel.

        Yields
        ------
        np.ndarray or None
            Feature chunks of shape ``(1, feats, frames)`` or ``None`` when
            no more chunks are available.
        """
        if self.audio_buffer is None:
            if last:
                yield None
            return
        # Emit full chunks while enough samples are buffered.
        while self._audio_chunks_lens[0] <= self.audio_buffer.shape[-1]:
            audio_chunks = self.audio_buffer[:, : self._audio_chunks_lens[0]]
            audio_features, _ = self._preprocessor(audio_chunks, self._audio_chunks_lens)
            # Advance by the hop size; consecutive chunks overlap.
            self.audio_buffer = self.audio_buffer[:, self._audio_frames_drops_lens :]
            if self.audio_step > 0:
                # Later chunks: drop frames already covered by the previous
                # overlapping chunk, plus the final padded frame.
                audio_features = audio_features[
                    :,
                    :,
                    self._streaming_cfg.audio_chunk_frames_drop : self._features_frames_takes_lens,
                ]
            else:
                # First chunk: keep everything except the final padded frame.
                audio_features = audio_features[:, :, : self._features_frames_takes_lens]
            self.audio_step += self._audio_frames_drops_lens
            yield audio_features
        if last:
            # Fix: the sentinel/reset now runs for every last=True call,
            # even when the tail happens to be empty.
            if self.audio_buffer is not None and self.audio_buffer.shape[-1] > 0:
                # Zero-pad the tail to a full chunk and process it once.
                n_pad = self._audio_chunks_lens[0] - self.audio_buffer.shape[-1]
                zeros_pad = np.zeros((1, n_pad), dtype=np.float32)
                self.audio_buffer = np.concatenate((self.audio_buffer, zeros_pad), axis=-1).astype(np.float32)
                audio_chunks = self.audio_buffer[:, : self._audio_chunks_lens[0]]
                audio_features, _ = self._preprocessor(audio_chunks, self._audio_chunks_lens)
                if self.audio_step > 0:
                    yield audio_features[:, :, self._streaming_cfg.audio_chunk_frames_drop :]
                else:
                    yield audio_features
            self.reset_audio_buffer()
            yield None

    def append_audio_buffer_to_process_for_features(
        self,
        audio_signal: npt.NDArray[np.float32],
        last: bool = False,
    ) -> Generator[Optional[npt.NDArray[np.float32]], None, None]:
        """Append audio and immediately yield any ready feature chunks."""
        self.append_audio_buffer(audio_signal)
        return self.process_audio_buffer(last=last)

    def append_features_buffer(self, audio_features: npt.NDArray[np.float32]) -> None:
        """Append new feature frames, prepending initial pre-encode cache if needed."""
        if self.features_buffer is None:
            self.features_buffer = np.concatenate((self._first_cache_pre_encode, audio_features), axis=-1).astype(
                np.float32
            )
        else:
            self.features_buffer = np.concatenate((self.features_buffer, audio_features), axis=-1).astype(np.float32)

    def process_features_buffer(
        self,
        last: bool = False,
    ) -> Generator[Optional[npt.NDArray[np.float32]], None, None]:
        """
        Convert feature buffer into encoder-ready feature chunks.

        Parameters
        ----------
        last :
            When True, flush the remaining tail (padded with log-zero-guard
            frames) and terminate with a ``None`` sentinel.

        Yields
        ------
        np.ndarray or None
            Feature chunks of shape ``(1, feats, cache_chunk_size)`` or
            ``None`` when no more chunks are available.
        """
        if self.features_buffer is None:
            if last:
                yield None
            return
        # Emit full encoder chunks while enough frames are buffered.
        while self._cache_chunk_size <= self.features_buffer.shape[-1]:
            features_chunk = self.features_buffer[:, :, : self._cache_chunk_size]
            self.features_buffer = self.features_buffer[:, :, self._shift_size :]
            yield features_chunk
        if last:
            # Fix: the sentinel/reset now runs for every last=True call,
            # even when the tail happens to be empty.
            if self.features_buffer is not None and self.features_buffer.shape[-1] > 0:
                # Pad the tail with log-of-epsilon "silence" frames.
                n_pad = self._cache_chunk_size - self.features_buffer.shape[-1]
                zeros_pad = np.log(
                    np.zeros(
                        (1, self.features_buffer.shape[1], n_pad),
                        dtype=np.float32,
                    )
                    + LOG_ZERO_GUARD_VALUE
                )
                features_chunk = np.concatenate((self.features_buffer, zeros_pad), axis=-1).astype(np.float32)
                yield features_chunk
            self.reset_features_buffer()
            yield None

    def append_features_buffer_to_process_for_features_chunk(
        self,
        audio_features: npt.NDArray[np.float32],
        last: bool = False,
    ) -> Generator[Optional[npt.NDArray[np.float32]], None, None]:
        """Append features and immediately yield any ready feature chunks."""
        self.append_features_buffer(audio_features)
        return self.process_features_buffer(last=last)
class CacheAwareStreamingASR:
    """
    Cache-aware streaming ASR wrapper around encoder/decoder ONNX models.

    Maintains encoder caches, decoder recurrent state, and an evolving
    hypothesis (tokens, timestamps, logprobs), producing incremental
    :class:`TimestampedResult` objects from feature chunks.
    """

    def __init__(
        self,
        asr_encoder,
        asr_decoder,
        vocab: List[str],
        blank_idx: int,
        streaming_cfg: CacheAwareStreamingConfig,
    ) -> None:
        """
        Parameters
        ----------
        asr_encoder :
            ONNX Runtime session for the cache-aware encoder.
        asr_decoder :
            ONNX Runtime session for the decoder/joint network.
        vocab :
            Mapping from token IDs to text pieces (list indexed by ID;
            entries are joined as strings in ``_decode_tokens``).
        blank_idx :
            Index of the blank label in the vocabulary.
        streaming_cfg :
            Cache-aware streaming configuration.
        """
        self._asr_encoder = asr_encoder
        self._asr_decoder = asr_decoder
        self._vocab = vocab
        self._vocab_size = len(self._vocab)
        self._blank_idx = blank_idx
        self._streaming_cfg = streaming_cfg
        # encoder cache: "last_channel" (presumably the attention cache),
        # "last_time" (presumably the conv cache, given conv_context_size
        # below), and the valid length of the channel cache
        self._cache_last_channel: npt.NDArray[np.float32] | None = None
        self._cache_last_time: npt.NDArray[np.float32] | None = None
        self._cache_last_channel_len: npt.NDArray[np.int64] | None = None
        self.set_init_encoder_cache()
        # encoder lengths: every encoder call receives pre-encode cache
        # frames plus one fresh chunk
        self._chunk_size = self._streaming_cfg.chunk_size[1]
        self._pre_encode_cache_size = self._streaming_cfg.pre_encode_cache_size[1]
        self._cache_chunk_size = self._pre_encode_cache_size + self._chunk_size
        self._features_chunk_lengths = np.array([self._cache_chunk_size], dtype=np.int64)
        self._encoder_out_lengths = np.array(
            [self._streaming_cfg.valid_encoder_out_len],
            dtype=np.int64,
        )
        # decoder state: recurrent hidden states plus the running hypothesis
        # (token IDs, per-token frame indices, per-token logprobs)
        self._prev_state: tuple[npt.NDArray[np.float32], npt.NDArray[np.float32]] | None = None
        self._tokens: List[int] | None = None
        self._timestamps: List[int] | None = None
        self._logprobs: List[float] | None = None
        self._t_index: int | None = None
        self.set_init_decoder_state()
        self.set_init_decoder_vars()
        self._current_text: str = ""
        # Whitespace normalisation for joined token pieces: strips leading
        # and trailing whitespace, collapses internal runs; only whitespace
        # directly followed by a word character (group 1) is kept as a
        # single space.
        self._DECODE_SPACE_PATTERN = re.compile(r"\A\s|\s\B|(\s)\b")

    def set_init_encoder_cache(self) -> None:
        """Initialise encoder caches to zeros."""
        # Caches are allocated layer-major then transposed to batch-major:
        # (1, layers, cache_len, d_model) and (1, layers, d_model, conv_ctx).
        self._cache_last_channel = np.zeros(
            (
                self._streaming_cfg.len_layers,
                1,
                self._streaming_cfg.last_channel_cache_size,
                self._streaming_cfg.d_model,
            ),
            dtype=np.float32,
        ).transpose(1, 0, 2, 3)
        self._cache_last_time = np.zeros(
            (
                self._streaming_cfg.len_layers,
                1,
                self._streaming_cfg.d_model,
                self._streaming_cfg.conv_context_size[0],
            ),
            dtype=np.float32,
        ).transpose(1, 0, 2, 3)
        # No cached frames are valid at stream start.
        self._cache_last_channel_len = np.zeros(1, dtype=np.int64)

    def set_init_decoder_state(self) -> None:
        """Initialise decoder hidden states to zeros based on input shapes."""
        # Read state shapes from the ONNX input metadata and pin the batch
        # dimension (axis 1) to 1 — presumably (layers, batch, hidden);
        # TODO confirm against the exported decoder graph.
        shapes = {x.name: x.shape for x in self._asr_decoder.get_inputs()}
        self._prev_state = (
            np.zeros(
                shape=(shapes["input_states_1"][0], 1, shapes["input_states_1"][2]),
                dtype=np.float32,
            ),
            np.zeros(
                shape=(shapes["input_states_2"][0], 1, shapes["input_states_2"][2]),
                dtype=np.float32,
            ),
        )

    def set_init_decoder_vars(self) -> None:
        """Reset token, timestamp, logprob lists and time index."""
        self._tokens = []
        self._timestamps = []
        self._logprobs = []
        self._t_index = 0

    def reset_states(self) -> None:
        """Reset encoder cache, decoder state, and current text."""
        self.set_init_encoder_cache()
        self.set_init_decoder_state()
        self.set_init_decoder_vars()
        self._current_text = ""

    def process_encoder_step(
        self,
        features_chunk: npt.NDArray[np.float32],
    ) -> npt.NDArray[np.float32]:
        """
        Run one encoder step with cache-aware inputs.

        Parameters
        ----------
        features_chunk :
            Feature chunk of shape ``(1, feats, cache_chunk_size)``.

        Returns
        -------
        encoder_out: ``(batch, time, dimension)``
        """
        # NOTE(review): `assert` is stripped under `python -O`; raise an
        # explicit error instead if this check must survive in production.
        assert self._features_chunk_lengths[0] == features_chunk.shape[-1]
        (
            encoder_out,
            encoder_out_lens,
            cache_last_channel_next,
            cache_last_time_next,
            cache_last_channel_next_len,
        ) = self._asr_encoder.run(
            [
                "outputs",
                "encoded_lengths",
                "cache_last_channel_next",
                "cache_last_time_next",
                "cache_last_channel_next_len",
            ],
            {
                "audio_signal": features_chunk,
                "length": self._features_chunk_lengths,
                "cache_last_channel": self._cache_last_channel,
                "cache_last_time": self._cache_last_time,
                "cache_last_channel_len": self._cache_last_channel_len,
            },
        )
        # Carry the updated caches into the next encoder call.
        self._cache_last_channel = cache_last_channel_next
        self._cache_last_time = cache_last_time_next
        self._cache_last_channel_len = cache_last_channel_next_len
        # The ONNX encoder emits (batch, dim, time); process_decoder_step
        # indexes time on axis 0 of the batch slice, hence the transpose.
        return encoder_out.transpose(0, 2, 1)

    def _decode_tokens(
        self, ids: Iterable[int], indices: Iterable[int] | None, logprobs: Iterable[float] | None
    ) -> TimestampedResult:
        """
        Decode token ids including timestamps, running text, and text delta.

        Parameters
        ----------
        ids :
            All token IDs emitted since the start of the stream.
        indices :
            Encoder-frame index for each token, or ``None``.
        logprobs :
            Log-probability for each token, or ``None``.

        Returns
        -------
        TimestampedResult:
            contains running text, timestamps, all tokens, all logprobs, and text delta
        """
        tokens = [self._vocab[i] for i in ids]
        # Join the pieces and normalise whitespace (see _DECODE_SPACE_PATTERN).
        text = re.sub(self._DECODE_SPACE_PATTERN, lambda x: " " if x.group(1) else "", "".join(tokens))
        # Delta relative to the previously reported running text.
        # NOTE(review): assumes the running text only ever grows at the end;
        # if normalisation ever shortens it, the delta degrades to "".
        n_added_chars = len(text) - len(self._current_text)
        added_text = text[-n_added_chars:] if n_added_chars > 0 else ""
        # Scale encoder-frame indices by window_step * subsampling_factor —
        # presumably converting to preprocessor-frame (or time) units;
        # confirm the units against CacheAwareStreamingConfig.
        timestamps = (
            None
            if indices is None
            else (
                self._streaming_cfg.window_step * self._streaming_cfg.subsampling_factor * np.asarray(indices)
            ).tolist()
        )
        return TimestampedResult(
            text, timestamps, tokens, None if logprobs is None else np.asarray(logprobs).tolist(), added_text
        )

    def process_decoder_step(
        self, encoder_out: npt.NDArray[np.float32]
    ) -> tuple[Optional[str], Optional[str]]:
        """
        Run decoder steps with chunked encoder output.

        Parameters
        ----------
        encoder_out :
            Encoder output of shape ``(batch, time, dim)``; only batch 0
            is decoded.

        Returns
        -------
        text: string
            full transcript from the start (``None`` if nothing emitted yet)
        added_text: string
            text delta (``None`` if nothing emitted yet)
        """
        encodings = encoder_out[0]  # (time, dim) slice for batch 0
        encodings_len = self._encoder_out_lengths[0]
        # NOTE(review): `assert` is stripped under `python -O`.
        assert encodings_len == encodings.shape[0]
        step = 0
        emitted_tokens = 0
        # Greedy transducer-style loop: at each encoder frame, keep emitting
        # tokens until blank or the per-frame token cap is hit.
        while step < encodings_len:
            outputs, state1, state2 = self._asr_decoder.run(
                ["outputs", "output_states_1", "output_states_2"],
                {
                    # current frame as (1, dim, 1)
                    "encoder_outputs": encodings[step : step + 1, :, None],
                    # previous emitted token, or blank at stream start
                    "targets": [[self._tokens[-1] if self._tokens else self._blank_idx]],
                    "target_length": [1],
                    "input_states_1": self._prev_state[0],
                    "input_states_2": self._prev_state[1],
                },
            )
            logits = outputs.squeeze()
            state = (state1, state2)
            assert logits.shape[-1] <= self._vocab_size
            token = logits.argmax()
            if token != self._blank_idx:
                # Advance decoder state only on a real emission; on blank the
                # state produced by this run is discarded.
                self._prev_state = state
                self._tokens.append(int(token))
                self._timestamps.append(self._t_index)
                emitted_tokens += 1
                self._logprobs.append(log_softmax(logits)[token])
            if token == self._blank_idx or emitted_tokens == self._streaming_cfg.max_tokens_per_step:
                # Move to the next encoder frame on blank or when the
                # per-frame emission cap is reached.
                self._t_index += 1
                emitted_tokens = 0
                step += 1
        if len(self._tokens) > 0:
            res = self._decode_tokens(self._tokens, self._timestamps, self._logprobs)
            self._current_text = res.text
            return res.text, res.added_text
        else:
            return None, None