WJ88 committed
Commit 8f87442 · verified · 1 Parent(s): 7a412d6

Test; the previous version was working OK.

Files changed (1): app.py +147 -96
app.py CHANGED
@@ -1,10 +1,10 @@
-
-"""Gradio Blocks app for streaming ASR with NVIDIA NeMo Parakeet-TDT-0.6B-v3.
-
-Fixes for HF Spaces + Gradio SSR:
-- Uses Blocks + .stream() API for input streaming.
-- Forces client-side rendering by setting ssr_mode=False on launch.
-- Accepts both (sr, np.ndarray) and {"sampling_rate","data"} chunk formats.
+"""Gradio Blocks app for CPU streaming ASR with NVIDIA NeMo Parakeet RNNT (label-looping greedy).
+
+Key points (CPU-tailored):
+- Incremental decoding with StreamingBatchedAudioBuffer and persistent RNNT state.
+- Avoids re-decoding the entire accumulated buffer each chunk.
+- Disables dither/padding for streaming and reuses a torchaudio resampler.
+- Works on Hugging Face Spaces (CPU) with SSR disabled.
 """
 from __future__ import annotations
 
@@ -14,12 +14,10 @@ from typing import Optional, Tuple, Union, Dict
 import copy
 import numpy as np
 import torch
-# torch.set_num_threads(2)
 import torchaudio
 import gradio as gr
 
 import nemo.collections.asr as nemo_asr
-# from omegaconf import OmegaConf
 from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
 from nemo.collections.asr.parts.utils.rnnt_utils import batched_hyps_to_hypotheses
 from nemo.collections.asr.parts.utils.streaming_utils import ContextSize, StreamingBatchedAudioBuffer
@@ -33,13 +31,13 @@ Chunk = Union[Tuple[int, np.ndarray], Dict[str, ArrayLike]]
 # ----------------------------
 @dataclass
 class AppConfig:
-    model_name: str = "nvidia/parakeet-tdt-0.6b-v3"
+    model_name: str = "nvidia/parakeet-tdt-0.6b-v3"  # or any RNNT model compatible with label-looping greedy
     left_s: float = 10.0
     chunk_s: float = 2.0
     right_s: float = 2.0
     max_buffer_s: float = 40.0
     batch_size: int = 1
-    device: str = "cpu"  # set "cuda" to force GPU if available
+    device: str = "cpu"  # CPU-only for HF Spaces
 
 
 # ----------------------------
@@ -50,47 +48,64 @@ def _floor_multiple(a: int, b: int) -> int:
 
 
 # ----------------------------
-# ASR Engine
+# ASR Engine (stateful incremental streaming)
 # ----------------------------
 class ParakeetStreamer:
     def __init__(self, cfg: AppConfig) -> None:
         self.cfg = cfg
+        self.device = torch.device("cpu")
+        torch.set_grad_enabled(False)
+        torch.set_float32_matmul_precision("high")
+        # Optionally tune CPU threads (uncomment to adjust for your HF Space)
+        # torch.set_num_threads(max(1, (torch.get_num_threads() or 1)))
 
-        # Load model
+        # Load model (RNNT)
         self.model = (
             nemo_asr.models.EncDecRNNTModel.from_pretrained(cfg.model_name)
-            .to(cfg.device)
+            .to(self.device)
             .eval()
         )
         for p in self.model.parameters():
             p.requires_grad_(False)
 
+        # Set streaming-friendly preprocessor params
+        try:
+            if hasattr(self.model, "preprocessor") and hasattr(self.model.preprocessor, "featurizer"):
+                self.model.preprocessor.featurizer.dither = 0.0
+                self.model.preprocessor.featurizer.pad_to = 0
+        except Exception:
+            pass
+
         # Decoding strategy: greedy-batch with label-looping
         dec_cfg = RNNTDecodingConfig(
-            strategy="greedy_batch", fused_batch_size=-1, compute_timestamps=False
+            strategy="greedy_batch",
+            fused_batch_size=-1,
+            compute_timestamps=False,
         )
         dec_cfg.greedy.loop_labels = True
+        dec_cfg.greedy.preserve_alignments = False
        self.model.change_decoding_strategy(dec_cfg)
         self._decoding_computer = self.model.decoding.decoding.decoding_computer
 
-        # Clone + tweak preprocessor for inference
-        mcfg = copy.deepcopy(self.model.cfg)
-        # OmegaConf.set_struct(mcfg.preprocessor, False)
-        # mcfg.preprocessor.dither = 0.0
-        # mcfg.preprocessor.pad_to = 0
-        # OmegaConf.set_struct(mcfg.preprocessor, True)
+        # Clone + read model cfg for derived params
+        mcfg = copy.deepcopy(getattr(self.model, "_cfg", getattr(self.model, "cfg", None)))
+        if mcfg is None:
+            raise RuntimeError("Unable to access model config. Update NeMo or provide a compatible RNNT model.")
 
-        # Derived constants
         self.sample_rate: int = int(mcfg.preprocessor.sample_rate)
         window_stride: float = float(mcfg.preprocessor.window_stride)
         self.frames_per_second: float = 1.0 / window_stride
+
+        # Encoder subsampling factor
+        if not hasattr(self.model, "encoder") or not hasattr(self.model.encoder, "subsampling_factor"):
+            raise RuntimeError("Model encoder must expose subsampling_factor for streaming alignment.")
         self.subsampling: int = int(self.model.encoder.subsampling_factor)
 
-        # Encoder-step to audio alignment
+        # Map encoder frames to audio samples
         feat_f2a = _floor_multiple(int(self.sample_rate * window_stride), self.subsampling)
         self.enc_f2a = feat_f2a * self.subsampling
 
-        # Context sizes (encoder and samples)
+        # Context sizes (encoder frames and audio samples)
         self.ctx_enc = ContextSize(
             left=int(cfg.left_s * self.frames_per_second / self.subsampling),
             chunk=int(cfg.chunk_s * self.frames_per_second / self.subsampling),
@@ -104,119 +119,155 @@ class ParakeetStreamer:
 
         self.max_samples = int(cfg.max_buffer_s * self.sample_rate)
 
-    # -------- audio helpers --------
+        # Persistent streaming state
+        self._stream_np: Optional[np.ndarray] = None
+        self._buf: Optional[StreamingBatchedAudioBuffer] = None
+        self._prev_state = None
+        self._cur_hyps = None
+        self._l = 0  # left cursor (samples)
+        self._r = 0  # right cursor (samples)
+
+        # Cached resampler
+        self._resampler: Optional[torchaudio.transforms.Resample] = None
+        self._resampler_in_sr: Optional[int] = None
+
+    def reset(self):
+        self._stream_np = None
+        self._buf = None
+        self._prev_state = None
+        self._cur_hyps = None
+        self._l = 0
+        self._r = 0
+        self._resampler = None
+        self._resampler_in_sr = None
+
     @staticmethod
     def _to_mono(x: np.ndarray) -> np.ndarray:
         x = np.asarray(x)
         if x.ndim == 2:
-            # handle (samples, channels) or (channels, samples)
-            if x.shape[0] == 2 and x.shape[1] != 2:
-                # ambiguous case; fallback to last axis
-                x = x.mean(axis=-1)
-            else:
-                x = x.mean(axis=-1 if x.shape[-1] in (1, 2) else 1)
+            # average over last axis
+            x = x.mean(axis=-1 if x.shape[-1] in (1, 2) else 1)
         return x.astype(np.float32, copy=False)
 
     def _resample_if_needed(self, x: np.ndarray, in_sr: int) -> np.ndarray:
-        if int(in_sr) == self.sample_rate:
+        in_sr = int(in_sr)
+        if in_sr == self.sample_rate:
             return x
-        y = torchaudio.functional.resample(torch.from_numpy(x), in_sr, self.sample_rate)
+        if self._resampler is None or self._resampler_in_sr != in_sr:
+            self._resampler = torchaudio.transforms.Resample(orig_freq=in_sr, new_freq=self.sample_rate)
+            self._resampler_in_sr = in_sr
+        y = self._resampler(torch.from_numpy(x))
         return y.numpy().astype(np.float32, copy=False)
 
     @staticmethod
     def _parse_chunk(new_chunk: Chunk) -> Tuple[int, np.ndarray]:
-        # Accept tuple (sr, np.ndarray) or dict {"sampling_rate": int, "data": array-like}
+        # Accept dict {"sampling_rate"|"sample_rate", "data"} or tuple (sr, np.ndarray)
         if isinstance(new_chunk, dict):
             sr = int(new_chunk.get("sampling_rate") or new_chunk.get("sample_rate"))
             data = new_chunk["data"]
             if isinstance(data, torch.Tensor):
                 data = data.detach().cpu().numpy()
             return sr, np.asarray(data)
-        # assume (sr, np.ndarray)
         sr, data = new_chunk
         return int(sr), np.asarray(data)
 
-    # -------- core decoding --------
     @torch.inference_mode()
-    def _decode_buffer(self, audio_np: np.ndarray) -> str:
-        if audio_np.size == 0:
+    def _decode_increment(self) -> str:
+        if self._stream_np is None or self._stream_np.size == 0:
             return ""
 
-        a = torch.from_numpy(audio_np).unsqueeze(0).to(torch.float32).to(self.cfg.device)
-        total_len = torch.tensor([a.shape[1]], dtype=torch.long, device=self.cfg.device)
-
-        cur_hyps = None
-        prev_state = None
-
-        l = 0
-        r = min(self.ctx_samp.chunk + self.ctx_samp.right, a.shape[1])
-
-        buf = StreamingBatchedAudioBuffer(
-            batch_size=self.cfg.batch_size,
-            context_samples=self.ctx_samp,
-            dtype=a.dtype,
-            device=self.cfg.device,
-        )
-
-        remaining = total_len.clone()
-
-        while l < a.shape[1]:
-            clen = int(min(r, a.shape[1]) - l)
-            is_last = r >= a.shape[1]
-
-            is_last_b = torch.tensor([clen >= remaining[0]], dtype=torch.bool, device=self.cfg.device)
-            clen_b = torch.where(is_last_b, remaining, torch.full_like(remaining, fill_value=clen))
-
-            buf.add_audio_batch_(
-                a[:, l:r], audio_lengths=clen_b, is_last_chunk=is_last, is_last_chunk_batch=is_last_b
+        # Lazily initialize buffer and cursors
+        if self._buf is None:
+            self._buf = StreamingBatchedAudioBuffer(
+                batch_size=self.cfg.batch_size,
+                context_samples=self.ctx_samp,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            self._l = 0
+            # First decode when we have chunk+right samples available
+            self._r = self.ctx_samp.chunk + self.ctx_samp.right
+
+        a = torch.from_numpy(self._stream_np).unsqueeze(0).to(torch.float32).to(self.device)
+
+        # Decode as long as we have enough samples for the next window [left: right]
+        while self._l < a.shape[1]:
+            if a.shape[1] < self._r:
+                break  # wait for more right-context samples
+            clen = int(self._r - self._l)
+            if clen <= 0:
+                break
+
+            is_last_chunk = False  # not final; mic keeps streaming
+            is_last_b = torch.tensor([False], dtype=torch.bool, device=self.device)
+            clen_b = torch.tensor([clen], dtype=torch.long, device=self.device)
+
+            self._buf.add_audio_batch_(
+                a[:, self._l:self._r],
+                audio_lengths=clen_b,
+                is_last_chunk=is_last_chunk,
+                is_last_chunk_batch=is_last_b,
             )
 
-            enc, _ = self.model(input_signal=buf.samples, input_signal_length=buf.context_size_batch.total())
+            enc, _ = self.model(
+                input_signal=self._buf.samples,
+                input_signal_length=self._buf.context_size_batch.total(),
+            )
            enc = enc.transpose(1, 2)  # [B, T, C]
 
-            enc_ctx = buf.context_size.subsample(factor=self.enc_f2a)
-            enc_ctx_b = buf.context_size_batch.subsample(factor=self.enc_f2a)
+            enc_ctx = self._buf.context_size.subsample(factor=self.enc_f2a)
+            enc_ctx_b = self._buf.context_size_batch.subsample(factor=self.enc_f2a)
 
-            enc = enc[:, enc_ctx.left:]  # drop left context before decoding
+            # Drop left context before decoding; decode only the chunk frames
+            enc = enc[:, enc_ctx.left:]
 
-            hyps, _, prev_state = self._decoding_computer(
-                x=enc, out_len=enc_ctx_b.chunk, prev_batched_state=prev_state
+            hyps, _, self._prev_state = self._decoding_computer(
+                x=enc, out_len=enc_ctx_b.chunk, prev_batched_state=self._prev_state
             )
 
-            if cur_hyps is None:
-                cur_hyps = hyps
+            if self._cur_hyps is None:
+                self._cur_hyps = hyps
             else:
-                cur_hyps.merge_(hyps)
+                self._cur_hyps.merge_(hyps)
 
-            remaining -= clen_b
-            l = r
-            r = min(r + self.ctx_samp.chunk, a.shape[1])
+            # Advance to next chunk window
+            self._l = self._r
+            self._r = self._r + self.ctx_samp.chunk
 
-        outs = batched_hyps_to_hypotheses(cur_hyps, None, batch_size=self.cfg.batch_size) if cur_hyps is not None else []
+        outs = (
+            batched_hyps_to_hypotheses(self._cur_hyps, None, batch_size=self.cfg.batch_size)
+            if self._cur_hyps is not None
+            else []
+        )
         for h in outs:
             h.text = self.model.tokenizer.ids_to_text(h.y_sequence.tolist())
-
         return outs[0].text if outs else ""
 
-    # -------- public streaming API (stateless) --------
-    def transcribe(self, stream: Optional[np.ndarray], new_chunk: Optional[Chunk]):
+    # Public API for Gradio streaming callback (stateful)
+    def transcribe(self, state: Optional[np.ndarray], new_chunk: Optional[Chunk]):
+        # Reset when a new session starts
+        if state is None and self._cur_hyps is not None:
+            self.reset()
         if new_chunk is None:
-            return stream, ""
+            return state, ""
 
         in_sr, data = self._parse_chunk(new_chunk)
         y = self._to_mono(data)
-        y = self._resample_if_needed(y, int(in_sr))
+        y = self._resample_if_needed(y, in_sr)
 
-        if stream is None or len(stream) == 0:
-            a = y
+        if self._stream_np is None or self._stream_np.size == 0:
+            self._stream_np = y
         else:
-            a = np.concatenate([stream, y])
-
-        if a.size > self.max_samples:
-            a = a[-self.max_samples:]
+            self._stream_np = np.concatenate([self._stream_np, y])
+        if self._stream_np.size > self.max_samples:
+            # Trim buffer and shift cursors accordingly
+            drop = self._stream_np.size - self.max_samples
+            self._stream_np = self._stream_np[-self.max_samples:]
+            self._l = max(0, self._l - drop)
+            self._r = max(self.ctx_samp.chunk + self.ctx_samp.right, self._r - drop)
 
-        text = self._decode_buffer(a) if a.size else ""
-        return a, text
+        text = self._decode_increment() if self._stream_np.size else ""
+        return self._stream_np, text
 
 
 # ----------------------------
@@ -226,26 +277,26 @@ def build_demo(cfg: Optional[AppConfig] = None):
     cfg = cfg or AppConfig()
     engine = ParakeetStreamer(cfg)
 
-    with gr.Blocks(title="Parakeet-TDT-0.6B-v3 — CPU streaming") as demo:
-        gr.Markdown("**Multilingual buffered streaming (10-2-2) in memory**")
+    with gr.Blocks(title="Parakeet RNNT — CPU Streaming") as demo:
+        gr.Markdown("**Buffered streaming (10-2-2) on CPU with incremental decoding**")
         with gr.Row():
             mic = gr.Audio(
                 sources=["microphone"],
                 type="numpy",
                 streaming=True,
-                label="Mic",
+                label="Microphone",
                 recording=False,
            )
         out = gr.Textbox(label="Transcript", lines=3)
         state = gr.State(value=None)
 
-        # Stream mic to backend every ~1s for more context and lower CPU churn
+        # Stream mic to backend periodically. Increase to 1.0 for lower CPU, decrease for lower latency.
        mic.stream(
             fn=engine.transcribe,
             inputs=[state, mic],
             outputs=[state, out],
             stream_every=1.0,
-            time_limit=120,
+            time_limit=180,
             concurrency_limit=1,
         )
 
@@ -254,6 +305,6 @@
 
 if __name__ == "__main__":
     demo = build_demo()
-    # Disable SSR explicitly to avoid Audio preprocessing via file paths on HF Spaces.
     demo.queue()
+    # Disable SSR to avoid file-path based Audio preprocessing on HF Spaces
    demo.launch(ssr_mode=False, show_api=False)
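
For reference, the 10-2-2 context sizes in __init__ reduce to small integer frame counts. Below is a worked sketch of that arithmetic, assuming 16 kHz audio, a 0.01 s window stride, and an 8x encoder subsampling factor (typical FastConformer-style values; these are assumptions, not values read from this commit):

# Worked example of the ContextSize / enc_f2a arithmetic in __init__
# (assumed: sample_rate=16000, window_stride=0.01, subsampling=8)
sample_rate, window_stride, subsampling = 16000, 0.01, 8
frames_per_second = 1.0 / window_stride                 # 100 feature frames per second
left = int(10.0 * frames_per_second / subsampling)      # 125 encoder frames of left context
chunk = int(2.0 * frames_per_second / subsampling)      # 25 encoder frames per chunk
right = int(2.0 * frames_per_second / subsampling)      # 25 encoder frames of right context
feat_f2a = (int(sample_rate * window_stride) // subsampling) * subsampling  # 160 samples per feature frame
enc_f2a = feat_f2a * subsampling                        # 1280 audio samples per encoder frame
print(left, chunk, right, enc_f2a)                      # 125 25 25 1280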
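To sanity-check the new stateful path without a microphone, a minimal smoke test could feed synthetic one-second chunks through transcribe(). This is a sketch, not part of the commit; it assumes this commit's app.py is importable and that nemo_toolkit, torch, and torchaudio are installed:

import numpy as np

from app import AppConfig, ParakeetStreamer  # hypothetical import of this commit's app.py

engine = ParakeetStreamer(AppConfig())

sr = 16000  # assumed mic rate; transcribe() resamples if it differs from the model rate
state = None
for i in range(5):
    # one-second chunks of quiet noise stand in for microphone audio
    chunk = (sr, (0.01 * np.random.randn(sr)).astype(np.float32))
    state, text = engine.transcribe(state, chunk)
    print(f"after chunk {i + 1}: {text!r}")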