# Source: streaming-speech-translation / src / asr / cache_aware_modules_config.py
# Author: pltobing — "Enhanced docstrings, type hints, and comments" (commit da63a34)
#!/usr/bin/env python3
# License: CC-BY-NC-ND-4.0
# Created by: Patrick Lumbantobing, Vertox-AI
# Copyright (c) 2026 Vertox-AI. All rights reserved.
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-NoDerivatives 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-nd/4.0/
"""
Configs for cache-aware streaming audio and feature buffers.
Adapted from: https://github.com/NVIDIA-NeMo/NeMo/tree/main
Defines dataclasses used by the Nemotron cache-aware streaming ASR demo
to control chunking, cache sizes, and frame-level buffering.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List
@dataclass
class TimestampedResult:
    """
    Timestamped recognition result from the streaming decoder.

    Attributes
    ----------
    text :
        Full recognized text so far.
    timestamps :
        Optional per-token timestamps.
    tokens :
        Optional list of token strings.
    logprobs :
        Optional per-token log-probabilities.
    added_text :
        Incremental text added in the latest step (if any).
    """

    # ``text`` is the only required field; everything else defaults to None
    # so a minimal result can be built from just the decoded string.
    text: str
    # Builtin generics (``list[...]``, PEP 585) replace ``typing.List`` for
    # consistency with the PEP 604 ``X | None`` unions already used here;
    # both rely on the module-level ``from __future__ import annotations``.
    timestamps: list[float] | None = None
    tokens: list[str] | None = None
    logprobs: list[float] | None = None
    added_text: str | None = None
@dataclass
class CacheAwareStreamingConfig:
    """
    Configuration for cache-aware streaming audio/feature buffering.

    Parameters
    ----------
    chunk_size :
        Chunk size (in frames) per step. Can be a two-element list to
        specify different sizes for the first and subsequent steps.
    shift_size :
        Shift size (in frames) per step; same two-element semantics as
        ``chunk_size``.
    cache_drop_size :
        Number of steps to drop from the cache periodically.
    last_channel_cache_size :
        Cache size needed for the last channel layers.
    valid_encoder_out_len :
        Number of steps in the final output that are guaranteed to match
        offline encoder output.
    pre_encode_cache_size :
        Cache size for pre-encoding layers to avoid internal caching.
    drop_extra_pre_encoded :
        Number of extra pre-encoded steps to drop.
    last_channel_num, last_time_num :
        Number of channel/time layers that require cache maintenance.
    audio_chunk_frames, audio_chunk_frames_drop, audio_frame_size :
        Audio framing parameters for streaming input.
    input_features :
        Input feature dimension (e.g., mel-spectrogram size).
    conv_context_size, len_layers, d_model :
        Model architecture parameters (convolution context, layers, hidden dim).
    max_tokens_per_step, window_step, subsampling_factor :
        Decoder step and alignment parameters.
    """

    # NOTE: builtin generics (``list[int]``, PEP 585) replace ``typing.List``
    # for consistency with the PEP 604 unions used elsewhere in this module;
    # mutable defaults go through ``field(default_factory=...)`` so each
    # instance owns its own list.

    # -- Chunking: [first step, subsequent steps] frame counts.
    chunk_size: list[int] = field(default_factory=lambda: [49, 56])
    shift_size: list[int] = field(default_factory=lambda: [49, 56])

    # -- Encoder cache maintenance.
    cache_drop_size: int = 0
    last_channel_cache_size: int = 70
    valid_encoder_out_len: int = 7
    pre_encode_cache_size: list[int] = field(default_factory=lambda: [0, 9])
    drop_extra_pre_encoded: int = 2
    last_channel_num: int = 0
    last_time_num: int = 0

    # -- Raw-audio framing for the streaming front end.
    audio_chunk_frames: int = 5
    audio_chunk_frames_drop: int = 2
    audio_frame_size: int = 160
    input_features: int = 128

    # -- Model architecture.
    conv_context_size: list[int] = field(default_factory=lambda: [8, 0])
    len_layers: int = 24
    d_model: int = 1024

    # -- Decoder stepping / time alignment.
    max_tokens_per_step: int = 10
    window_step: float = 0.01
    subsampling_factor: int = 10