# Source: streaming-speech-translation / src / asr / cache_aware_modules_config.py
# Author: pltobing — "Enhanced docstrings, type hints, and comments" (commit da63a34)
#!/usr/bin/env python3
# License: CC-BY-NC-ND-4.0
# Created by: Patrick Lumbantobing, Vertox-AI
# Copyright (c) 2026 Vertox-AI. All rights reserved.
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-NoDerivatives 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-nd/4.0/
"""
Configs for cache-aware streaming audio and feature buffers.
Adapted from: https://github.com/NVIDIA-NeMo/NeMo/tree/main
Defines dataclasses used by the Nemotron cache-aware streaming ASR demo
to control chunking, cache sizes, and frame-level buffering.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List
@dataclass
class TimestampedResult:
    """
    Timestamped recognition result from the streaming decoder.

    Attributes
    ----------
    text :
        Full recognized text so far.
    timestamps :
        Optional per-token timestamps.
    tokens :
        Optional list of token strings.
    logprobs :
        Optional per-token log-probabilities.
    added_text :
        Incremental text added in the latest step (if any).
    """

    # ``text`` is the only required field; everything else defaults to None
    # so a minimal result can be built from just the decoded string.
    text: str
    # Builtin generics (``list[...]``, PEP 585) replace ``typing.List`` for
    # consistency with the PEP 604 ``X | None`` unions already used here;
    # both rely on the module-level ``from __future__ import annotations``.
    timestamps: list[float] | None = None
    tokens: list[str] | None = None
    logprobs: list[float] | None = None
    added_text: str | None = None
@dataclass
class CacheAwareStreamingConfig:
    """
    Configuration for cache-aware streaming audio/feature buffering.

    Parameters
    ----------
    chunk_size :
        Chunk size (in frames) per step. Can be a two-element list to
        specify different sizes for the first and subsequent steps.
    shift_size :
        Shift size (in frames) per step; same two-element semantics as
        ``chunk_size``.
    cache_drop_size :
        Number of steps to drop from the cache periodically.
    last_channel_cache_size :
        Cache size needed for the last channel layers.
    valid_encoder_out_len :
        Number of steps in the final output that are guaranteed to match
        offline encoder output.
    pre_encode_cache_size :
        Cache size for pre-encoding layers to avoid internal caching.
    drop_extra_pre_encoded :
        Number of extra pre-encoded steps to drop.
    last_channel_num, last_time_num :
        Number of channel/time layers that require cache maintenance.
    audio_chunk_frames, audio_chunk_frames_drop, audio_frame_size :
        Audio framing parameters for streaming input.
    input_features :
        Input feature dimension (e.g., mel-spectrogram size).
    conv_context_size, len_layers, d_model :
        Model architecture parameters (convolution context, layers, hidden dim).
    max_tokens_per_step, window_step, subsampling_factor :
        Decoder step and alignment parameters.
    """

    # NOTE: builtin generics (``list[int]``, PEP 585) replace ``typing.List``
    # for consistency with the PEP 604 unions used elsewhere in this module;
    # mutable defaults go through ``field(default_factory=...)`` so each
    # instance owns its own list.

    # -- Chunking: [first step, subsequent steps] frame counts.
    chunk_size: list[int] = field(default_factory=lambda: [49, 56])
    shift_size: list[int] = field(default_factory=lambda: [49, 56])

    # -- Encoder cache maintenance.
    cache_drop_size: int = 0
    last_channel_cache_size: int = 70
    valid_encoder_out_len: int = 7
    pre_encode_cache_size: list[int] = field(default_factory=lambda: [0, 9])
    drop_extra_pre_encoded: int = 2
    last_channel_num: int = 0
    last_time_num: int = 0

    # -- Raw-audio framing for the streaming front end.
    audio_chunk_frames: int = 5
    audio_chunk_frames_drop: int = 2
    audio_frame_size: int = 160
    input_features: int = 128

    # -- Model architecture.
    conv_context_size: list[int] = field(default_factory=lambda: [8, 0])
    len_layers: int = 24
    d_model: int = 1024

    # -- Decoder stepping / time alignment.
    max_tokens_per_step: int = 10
    window_step: float = 0.01
    subsampling_factor: int = 10