WJ88 committed
Commit 0e644a1 · verified · 1 Parent(s): c6358db

Update app.py

Files changed (1)
  1. app.py +78 -62
app.py CHANGED
@@ -4,50 +4,43 @@ import copy
4
  import uuid
5
  import logging
6
  from typing import List, Optional, Tuple, Dict
7
-
8
  # Reduce progress/log spam before heavy imports
9
  os.environ.setdefault("TQDM_DISABLE", "1")
10
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
11
-
12
  import numpy as np
13
  import torch
14
  import torchaudio
15
  import soundfile as sf
16
  import gradio as gr
17
-
18
  # NeMo
19
  from nemo.collections.asr.models import ASRModel
 
20
  from omegaconf import OmegaConf
21
  from nemo.utils import logging as nemo_logging
22
-
23
  # ----------------------------
24
  # Config
25
  # ----------------------------
26
- MODEL_NAME = os.environ.get("PARAKEET_MODEL", "nvidia/parakeet-tdt-0.6b-v3")
27
- TARGET_SR = 16_000
28
- BEAM_SIZE = int(os.environ.get("PARAKEET_BEAM_SIZE", "32")) # Increased for subtle quality gains
29
  OFFLINE_BATCH = int(os.environ.get("PARAKEET_BATCH", "8"))
30
- CHUNK_S = float(os.environ.get("PARAKEET_CHUNK_S", "2.0"))
31
- FLUSH_PAD_S = float(os.environ.get("PARAKEET_FLUSH_PAD_S", "2.0"))
32
-
33
  # ----------------------------
34
  # Logging (unified)
35
  # ----------------------------
36
- LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper()
37
  logger = logging.getLogger("parakeet_app")
38
  logger.setLevel(getattr(logging, LOG_LEVEL, logging.INFO))
39
  _handler = logging.StreamHandler()
40
  _handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
41
  logger.handlers = [_handler]
42
  logger.propagate = False
43
-
44
  # Quiet NeMo logs
45
  nemo_logging.setLevel(logging.ERROR)
46
  logging.getLogger("nemo").setLevel(logging.ERROR)
47
  logging.getLogger("nemo.collections.asr").setLevel(logging.ERROR)
48
-
49
  torch.set_grad_enabled(False)
50
-
51
  # ----------------------------
52
  # Audio utils
53
  # ----------------------------
@@ -55,7 +48,6 @@ def to_mono_np(x: np.ndarray) -> np.ndarray:
55
  if x.ndim == 2:
56
  x = x.mean(axis=1)
57
  return x.astype(np.float32, copy=False)
58
-
59
  class ResamplerCache:
60
  def __init__(self):
61
  self._cache: Dict[int, torchaudio.transforms.Resample] = {}
@@ -70,22 +62,19 @@ class ResamplerCache:
70
  t = t.unsqueeze(0)
71
  y = self._cache[src_sr](t)
72
  return y.squeeze(0).numpy()
73
-
74
  RESAMPLER = ResamplerCache()
75
-
76
  def load_mono16k(path: str) -> np.ndarray:
77
  """Load any audio file, convert to mono float32 at 16 kHz."""
78
  try:
79
- wav, sr = sf.read(path, dtype="float32", always_2d=True) # (T,C)
80
  wav = wav.mean(axis=1).astype(np.float32, copy=False)
81
  return RESAMPLER.resample(wav, sr)
82
  except Exception:
83
- wav_t, sr = torchaudio.load(path) # (C,T)
84
  if wav_t.dtype != torch.float32:
85
  wav_t = wav_t.float()
86
  wav = wav_t.mean(dim=0).numpy()
87
  return RESAMPLER.resample(wav, int(sr))
88
-
89
  # ----------------------------
90
  # Model manager (MALSD batched beam everywhere, loop_labels=True)
91
  # ----------------------------
@@ -98,22 +87,17 @@ class ParakeetManager:
98
  self.model.eval()
99
  for p in self.model.parameters():
100
  p.requires_grad = False
101
-
102
  # Base decoding cfg differs by class
103
  if hasattr(self.model, "decoder") and hasattr(self.model.decoder, "decoder"):
104
  self._base_decoding = copy.deepcopy(self.model.decoder.decoder.cfg)
105
  else:
106
  self._base_decoding = copy.deepcopy(self.model.cfg.decoding)
107
-
108
  self._set_malsd_beam()
109
-
110
  # Enable encoder caching for better streaming context (per NeMo docs/tutorials)
111
  if hasattr(self.model.encoder, "set_default_att_context_size"):
112
- self.model.encoder.set_default_att_context_size([512, 16]) # Large left for cumulative context, small right for buffering
113
  logger.info("encoder_caching_enabled left=512 right=16")
114
-
115
  logger.info(f"model_loaded strategy=malsd_batch beam_size={BEAM_SIZE}")
116
-
117
  def _set_malsd_beam(self):
118
  cfg = copy.deepcopy(self._base_decoding)
119
  cfg.strategy = "malsd_batch"
@@ -121,18 +105,17 @@ class ParakeetManager:
121
  "beam_size": BEAM_SIZE,
122
  "return_best_hypothesis": True,
123
  "score_norm": True,
124
- "allow_cuda_graphs": False, # CPU-only
125
  "max_symbols_per_step": 10,
126
  })
127
  OmegaConf.set_struct(cfg, False)
128
  cfg["loop_labels"] = True
129
  cfg["fused_batch_size"] = -1
130
- cfg["compute_timestamps"] = False
131
  if hasattr(cfg, "greedy"):
132
  cfg.greedy.use_cuda_graph_decoder = False
133
  self.model.change_decoding_strategy(cfg)
134
  logger.info("decoding_set strategy=malsd_batch loop_labels=True")
135
-
136
  def _transcribe(self, items: List, *, partial=None):
137
  with torch.inference_mode():
138
  return self.model.transcribe(
@@ -142,7 +125,6 @@ class ParakeetManager:
142
  return_hypotheses=True,
143
  partial_hypothesis=partial,
144
  )
145
-
146
  # Offline batch
147
  def transcribe_files(self, paths: List[str]):
148
  n = 0 if not paths else len(paths)
@@ -155,18 +137,35 @@ class ParakeetManager:
155
  for p, o in zip(paths, out):
156
  h = o[0] if isinstance(o, list) and o else o
157
  text = h if isinstance(h, str) else getattr(h, "text", "")
158
  results.append({"path": p, "text": text})
159
  logger.info("files_run ok")
160
  return results
161
-
162
  # Streaming step (rolling hypothesis)
163
  def stream_step(self, audio_16k: np.ndarray, prev_hyp) -> object:
164
  out = self._transcribe([audio_16k], partial=[prev_hyp] if prev_hyp is not None else None)
165
  h = out[0][0] if isinstance(out[0], list) else out[0]
166
- return h # Hypothesis
167
-
168
  # ----------------------------
169
- # Streaming session (no overlap, rolling hypothesis)
170
  # ----------------------------
171
  class StreamingSession:
172
  def __init__(self, manager: ParakeetManager, chunk_s: float, flush_pad_s: float):
@@ -176,61 +175,84 @@ class StreamingSession:
176
  self.hyp = None
177
  self.pending = np.zeros(0, dtype=np.float32)
178
  self.text = ""
 
179
  logger.info(f"mic_reset chunk={self.chunk_s}s flush_pad={self.flush_pad_s}s")
180
-
181
  def add_audio(self, audio: np.ndarray, src_sr: int):
182
  mono = to_mono_np(audio)
183
  res = RESAMPLER.resample(mono, src_sr)
184
  self.pending = np.concatenate([self.pending, res]) if self.pending.size else res
185
  self._drain()
186
-
187
  def _drain(self):
188
  C = int(self.chunk_s * TARGET_SR)
189
  while self.pending.size >= C:
190
  chunk = self.pending[:C]
191
  self.pending = self.pending[C:]
192
  try:
193
- self.hyp = self.mgr.stream_step(chunk, self.hyp)
194
- new_text = getattr(self.hyp, "text", "")
195
- if new_text:
196
- if self.text and new_text.startswith(self.text): # If cumulative (partial extends), replace with extended
197
- self.text = new_text
198
- else: # Else append (handles per-chunk case)
199
- self.text += (' ' if self.text else '') + new_text
200
  except Exception:
201
  logger.exception("mic_step failed")
202
  break
203
-
204
  def flush(self) -> str:
205
  if self.pending.size:
206
  pad = np.zeros(int(self.flush_pad_s * TARGET_SR), dtype=np.float32)
207
  final = np.concatenate([self.pending, pad])
208
  try:
209
- self.hyp = self.mgr.stream_step(final, self.hyp)
210
- new_text = getattr(self.hyp, "text", "")
211
- if new_text:
212
- if self.text and new_text.startswith(self.text):
213
- self.text = new_text
214
- else:
215
- self.text += (' ' if self.text else '') + new_text
216
- self.text += '.' # Add period for sentence closure on flush
217
  except Exception:
218
  logger.exception("mic_flush failed")
219
  self.pending = np.zeros(0, dtype=np.float32)
220
  return self.text
221
-
222
  # ----------------------------
223
  # Simple session registry (avoid deepcopy in gr.State)
224
  # ----------------------------
225
  SESS: Dict[str, StreamingSession] = {}
226
  def _new_session_id() -> str:
227
  return uuid.uuid4().hex
228
-
229
  # ----------------------------
230
  # Gradio callbacks
231
  # ----------------------------
232
  MANAGER = ParakeetManager(device="cpu")
233
-
234
  def _parse_gr_audio(x) -> Tuple[np.ndarray, int]:
235
  if x is None:
236
  return np.zeros(0, dtype=np.float32), TARGET_SR
@@ -241,7 +263,6 @@ def _parse_gr_audio(x) -> Tuple[np.ndarray, int]:
241
  if isinstance(x, np.ndarray):
242
  return x.astype(np.float32, copy=False), TARGET_SR
243
  logger.error(f"unsupported_gr_audio_payload type={type(x)}"); raise ValueError("Unsupported audio payload")
244
-
245
  def mic_step(audio_chunk, sess_id: Optional[str]):
246
  if not sess_id or sess_id not in SESS:
247
  sess_id = _new_session_id()
@@ -255,14 +276,12 @@ def mic_step(audio_chunk, sess_id: Optional[str]):
255
  if wav.size:
256
  sess.add_audio(wav, sr)
257
  return sess_id, sess.text
258
-
259
  def mic_flush(sess_id: Optional[str]):
260
  if not sess_id or sess_id not in SESS:
261
  return None, ""
262
  text = SESS[sess_id].flush()
263
  logger.info("mic_flush ok")
264
  return None, text
265
-
266
  def files_run(files):
267
  n = 0 if not files else len(files)
268
  logger.info(f"files_ui start count={n}")
@@ -281,7 +300,6 @@ def files_run(files):
281
  table = [[os.path.basename(r["path"]), r["text"]] for r in results]
282
  logger.info("files_ui ok")
283
  return table
284
-
285
  # ----------------------------
286
  # UI
287
  # ----------------------------
@@ -290,15 +308,13 @@ with gr.Blocks(title="Parakeet-TDT v3 (Unified MALSD Beam)") as demo:
290
  mic = gr.Audio(sources=["microphone"], type="numpy", streaming=True, label="Speak")
291
  text_out = gr.Textbox(label="Transcript", lines=8)
292
  flush_btn = gr.Button("Flush")
293
- state_id = gr.State() # only a string id
294
  mic.stream(mic_step, inputs=[mic, state_id], outputs=[state_id, text_out])
295
  flush_btn.click(mic_flush, inputs=[state_id], outputs=[state_id, text_out])
296
-
297
  with gr.Tab("Files"):
298
  files = gr.File(file_count="multiple", type="filepath", label="Upload audio files")
299
  run_btn = gr.Button("Run")
300
  results_table = gr.Dataframe(headers=["file", "text"], label="Results",
301
  row_count=(0, "dynamic"), col_count=(2, "fixed"))
302
  run_btn.click(files_run, inputs=[files], outputs=[results_table])
303
-
304
  demo.queue().launch(ssr_mode=False)
 
4
  import uuid
5
  import logging
6
  from typing import List, Optional, Tuple, Dict
 
7
  # Reduce progress/log spam before heavy imports
8
  os.environ.setdefault("TQDM_DISABLE", "1")
9
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 
10
  import numpy as np
11
  import torch
12
  import torchaudio
13
  import soundfile as sf
14
  import gradio as gr
 
15
  # NeMo
16
  from nemo.collections.asr.models import ASRModel
17
+ from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis # For hypothesis handling
18
  from omegaconf import OmegaConf
19
  from nemo.utils import logging as nemo_logging
 
20
  # ----------------------------
21
  # Config
22
  # ----------------------------
23
+ MODEL_NAME = os.environ.get("PARAKEET_MODEL", "nvidia/parakeet-tdt-0.6b-v3")
24
+ TARGET_SR = 16_000
25
+ BEAM_SIZE = int(os.environ.get("PARAKEET_BEAM_SIZE", "32")) # Increased for subtle quality gains
26
  OFFLINE_BATCH = int(os.environ.get("PARAKEET_BATCH", "8"))
27
+ CHUNK_S = float(os.environ.get("PARAKEET_CHUNK_S", "4.0"))
28
+ FLUSH_PAD_S = float(os.environ.get("PARAKEET_FLUSH_PAD_S", "2.0"))
 
29
  # ----------------------------
30
  # Logging (unified)
31
  # ----------------------------
32
+ LOG_LEVEL = os.environ.get("LOG_LEVEL", "DEBUG").upper()
33
  logger = logging.getLogger("parakeet_app")
34
  logger.setLevel(getattr(logging, LOG_LEVEL, logging.INFO))
35
  _handler = logging.StreamHandler()
36
  _handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
37
  logger.handlers = [_handler]
38
  logger.propagate = False
 
39
  # Quiet NeMo logs
40
  nemo_logging.setLevel(logging.ERROR)
41
  logging.getLogger("nemo").setLevel(logging.ERROR)
42
  logging.getLogger("nemo.collections.asr").setLevel(logging.ERROR)
 
43
  torch.set_grad_enabled(False)
 
44
  # ----------------------------
45
  # Audio utils
46
  # ----------------------------
 
48
  if x.ndim == 2:
49
  x = x.mean(axis=1)
50
  return x.astype(np.float32, copy=False)
 
51
  class ResamplerCache:
52
  def __init__(self):
53
  self._cache: Dict[int, torchaudio.transforms.Resample] = {}
 
62
  t = t.unsqueeze(0)
63
  y = self._cache[src_sr](t)
64
  return y.squeeze(0).numpy()
 
65
  RESAMPLER = ResamplerCache()
 
66
  def load_mono16k(path: str) -> np.ndarray:
67
  """Load any audio file, convert to mono float32 at 16 kHz."""
68
  try:
69
+ wav, sr = sf.read(path, dtype="float32", always_2d=True) # (T,C)
70
  wav = wav.mean(axis=1).astype(np.float32, copy=False)
71
  return RESAMPLER.resample(wav, sr)
72
  except Exception:
73
+ wav_t, sr = torchaudio.load(path) # (C,T)
74
  if wav_t.dtype != torch.float32:
75
  wav_t = wav_t.float()
76
  wav = wav_t.mean(dim=0).numpy()
77
  return RESAMPLER.resample(wav, int(sr))
 
78
  # ----------------------------
79
  # Model manager (MALSD batched beam everywhere, loop_labels=True)
80
  # ----------------------------
 
87
  self.model.eval()
88
  for p in self.model.parameters():
89
  p.requires_grad = False
 
90
  # Base decoding cfg differs by class
91
  if hasattr(self.model, "decoder") and hasattr(self.model.decoder, "decoder"):
92
  self._base_decoding = copy.deepcopy(self.model.decoder.decoder.cfg)
93
  else:
94
  self._base_decoding = copy.deepcopy(self.model.cfg.decoding)
 
95
  self._set_malsd_beam()
 
96
  # Enable encoder caching for better streaming context (per NeMo docs/tutorials)
97
  if hasattr(self.model.encoder, "set_default_att_context_size"):
98
+ self.model.encoder.set_default_att_context_size([512, 16]) # Large left for cumulative context, small right for buffering
99
  logger.info("encoder_caching_enabled left=512 right=16")
 
100
  logger.info(f"model_loaded strategy=malsd_batch beam_size={BEAM_SIZE}")
 
101
  def _set_malsd_beam(self):
102
  cfg = copy.deepcopy(self._base_decoding)
103
  cfg.strategy = "malsd_batch"
 
105
  "beam_size": BEAM_SIZE,
106
  "return_best_hypothesis": True,
107
  "score_norm": True,
108
+ "allow_cuda_graphs": False, # CPU-only
109
  "max_symbols_per_step": 10,
110
  })
111
  OmegaConf.set_struct(cfg, False)
112
  cfg["loop_labels"] = True
113
  cfg["fused_batch_size"] = -1
114
+ cfg["compute_timestamps"] = True # Enabled for word-level timestamps
115
  if hasattr(cfg, "greedy"):
116
  cfg.greedy.use_cuda_graph_decoder = False
117
  self.model.change_decoding_strategy(cfg)
118
  logger.info("decoding_set strategy=malsd_batch loop_labels=True")
 
119
  def _transcribe(self, items: List, *, partial=None):
120
  with torch.inference_mode():
121
  return self.model.transcribe(
 
125
  return_hypotheses=True,
126
  partial_hypothesis=partial,
127
  )
 
128
  # Offline batch
129
  def transcribe_files(self, paths: List[str]):
130
  n = 0 if not paths else len(paths)
 
137
  for p, o in zip(paths, out):
138
  h = o[0] if isinstance(o, list) and o else o
139
  text = h if isinstance(h, str) else getattr(h, "text", "")
140
+ # Extract timestamps if available
141
+ if hasattr(h, 'timestep') and h.timestep:
142
+ word_timestamps = h.timestep.get('word', [])
143
+ if word_timestamps and text:
144
+ # Format timed text
145
+ words = text.split()
146
+ if len(words) == len(word_timestamps):
147
+ timed_parts = [f"{word} ({ts['start']}-{ts['end']}s)" for word, ts in zip(words, word_timestamps)]
148
+ text = ' '.join(timed_parts)
149
+ logger.debug(f"File timestamps for {p}: {word_timestamps}")
150
  results.append({"path": p, "text": text})
151
  logger.info("files_run ok")
152
  return results
 
153
  # Streaming step (rolling hypothesis)
154
  def stream_step(self, audio_16k: np.ndarray, prev_hyp) -> object:
155
  out = self._transcribe([audio_16k], partial=[prev_hyp] if prev_hyp is not None else None)
156
  h = out[0][0] if isinstance(out[0], list) else out[0]
157
+ return h # Hypothesis
 
158
  # ----------------------------
159
+ # Helper for token merging
160
+ # ----------------------------
161
+ def common_prefix_len(a: list, b: list) -> int:
162
+ min_len = min(len(a), len(b))
163
+ for i in range(min_len):
164
+ if a[i] != b[i]:
165
+ return i
166
+ return min_len
167
+ # ----------------------------
168
+ # Streaming session (rolling hypothesis with token merging)
169
  # ----------------------------
170
  class StreamingSession:
171
  def __init__(self, manager: ParakeetManager, chunk_s: float, flush_pad_s: float):
 
175
  self.hyp = None
176
  self.pending = np.zeros(0, dtype=np.float32)
177
  self.text = ""
178
+ self.tokens: List[int] = [] # Track current token sequence for merging
179
  logger.info(f"mic_reset chunk={self.chunk_s}s flush_pad={self.flush_pad_s}s")
 
180
  def add_audio(self, audio: np.ndarray, src_sr: int):
181
  mono = to_mono_np(audio)
182
  res = RESAMPLER.resample(mono, src_sr)
183
+ # Normalize volume
184
+ if np.max(np.abs(res)) > 0:
185
+ res = res / np.max(np.abs(res)) * 0.95 # Scale to [-0.95, 0.95]
186
 + # Simple VAD: trim leading silence with torchaudio.functional.vad
187
+ from torchaudio.functional import vad
188
+ res = vad(torch.from_numpy(res), sample_rate=TARGET_SR, trigger_level=7.0).numpy()
189
  self.pending = np.concatenate([self.pending, res]) if self.pending.size else res
190
  self._drain()
191
+ def _merge_tokens(self, new_hyp: Hypothesis) -> None:
192
+ """Merge new hypothesis tokens with existing, update text and hyp."""
193
+ # Handle all possible types: tensor, ndarray, list, None
194
+ if new_hyp.y_sequence is None:
195
+ new_tokens = []
196
+ elif isinstance(new_hyp.y_sequence, torch.Tensor):
197
+ new_tokens = new_hyp.y_sequence.cpu().tolist()
198
+ elif isinstance(new_hyp.y_sequence, np.ndarray):
199
+ new_tokens = new_hyp.y_sequence.tolist()
200
+ else:
201
+ new_tokens = list(new_hyp.y_sequence)
202
+ # Ensure self.tokens is list
203
+ self.tokens = list(self.tokens)
204
+ logger.debug(f"New hyp text: '{new_hyp.text}', y_sequence type: {type(new_hyp.y_sequence)}, len: {len(new_tokens) if new_tokens else 0}")
205
+ if len(new_tokens) > 0:
206
+ prefix_len = common_prefix_len(self.tokens, new_tokens)
207
+ if prefix_len < len(new_tokens): # Skip if no new tokens
208
+ merged_tokens = self.tokens + new_tokens[prefix_len:]
209
+ logger.debug(f"Prev tokens len: {len(self.tokens)}, New tokens len: {len(new_tokens)}, Prefix len: {prefix_len}, Merged tokens len: {len(merged_tokens)}")
210
+ self.text = self.mgr.model.tokenizer.ids_to_text(merged_tokens)
211
+ self.tokens = merged_tokens
212
+ # Update hyp for next partial (copy and set as tensor, as NeMo expects)
213
+ self.hyp = copy.deepcopy(new_hyp)
214
+ self.hyp.y_sequence = torch.tensor(merged_tokens, dtype=torch.long)
215
+ logger.debug(f"Merged tokens: len={len(merged_tokens)}") # For debug
216
+ # Log timestamps if available
217
+ if hasattr(new_hyp, 'timestep') and new_hyp.timestep:
218
+ word_timestamps = new_hyp.timestep.get('word', [])
219
+ if word_timestamps:
220
+ logger.debug(f"New hyp word timestamps: {word_timestamps}")
221
  def _drain(self):
222
  C = int(self.chunk_s * TARGET_SR)
223
  while self.pending.size >= C:
224
  chunk = self.pending[:C]
225
  self.pending = self.pending[C:]
226
  try:
227
+ new_hyp = self.mgr.stream_step(chunk, self.hyp)
228
+ logger.debug(f"Post-step hyp text: '{new_hyp.text}'")
229
+ self._merge_tokens(new_hyp)
230
  except Exception:
231
  logger.exception("mic_step failed")
232
  break
 
233
  def flush(self) -> str:
234
  if self.pending.size:
235
  pad = np.zeros(int(self.flush_pad_s * TARGET_SR), dtype=np.float32)
236
  final = np.concatenate([self.pending, pad])
237
  try:
238
+ new_hyp = self.mgr.stream_step(final, self.hyp)
239
+ self._merge_tokens(new_hyp)
240
+ if self.text: # Add period only if there's text
241
+ self.text += '.'
242
  except Exception:
243
  logger.exception("mic_flush failed")
244
  self.pending = np.zeros(0, dtype=np.float32)
245
  return self.text
 
246
  # ----------------------------
247
  # Simple session registry (avoid deepcopy in gr.State)
248
  # ----------------------------
249
  SESS: Dict[str, StreamingSession] = {}
250
  def _new_session_id() -> str:
251
  return uuid.uuid4().hex
 
252
  # ----------------------------
253
  # Gradio callbacks
254
  # ----------------------------
255
  MANAGER = ParakeetManager(device="cpu")
 
256
  def _parse_gr_audio(x) -> Tuple[np.ndarray, int]:
257
  if x is None:
258
  return np.zeros(0, dtype=np.float32), TARGET_SR
 
263
  if isinstance(x, np.ndarray):
264
  return x.astype(np.float32, copy=False), TARGET_SR
265
  logger.error(f"unsupported_gr_audio_payload type={type(x)}"); raise ValueError("Unsupported audio payload")
 
266
  def mic_step(audio_chunk, sess_id: Optional[str]):
267
  if not sess_id or sess_id not in SESS:
268
  sess_id = _new_session_id()
 
276
  if wav.size:
277
  sess.add_audio(wav, sr)
278
  return sess_id, sess.text
 
279
  def mic_flush(sess_id: Optional[str]):
280
  if not sess_id or sess_id not in SESS:
281
  return None, ""
282
  text = SESS[sess_id].flush()
283
  logger.info("mic_flush ok")
284
  return None, text
 
285
  def files_run(files):
286
  n = 0 if not files else len(files)
287
  logger.info(f"files_ui start count={n}")
 
300
  table = [[os.path.basename(r["path"]), r["text"]] for r in results]
301
  logger.info("files_ui ok")
302
  return table
 
303
  # ----------------------------
304
  # UI
305
  # ----------------------------
 
308
  mic = gr.Audio(sources=["microphone"], type="numpy", streaming=True, label="Speak")
309
  text_out = gr.Textbox(label="Transcript", lines=8)
310
  flush_btn = gr.Button("Flush")
311
+ state_id = gr.State() # only a string id
312
  mic.stream(mic_step, inputs=[mic, state_id], outputs=[state_id, text_out])
313
  flush_btn.click(mic_flush, inputs=[state_id], outputs=[state_id, text_out])
 
314
  with gr.Tab("Files"):
315
  files = gr.File(file_count="multiple", type="filepath", label="Upload audio files")
316
  run_btn = gr.Button("Run")
317
  results_table = gr.Dataframe(headers=["file", "text"], label="Results",
318
  row_count=(0, "dynamic"), col_count=(2, "fixed"))
319
  run_btn.click(files_run, inputs=[files], outputs=[results_table])
 
320
  demo.queue().launch(ssr_mode=False)
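
The rolling-hypothesis merge added in this commit (common_prefix_len plus the token concatenation in StreamingSession._merge_tokens) can be checked in isolation. A minimal sketch, assuming plain Python lists of token ids; the ids and the two-step example are illustrative, not model output:

from typing import List

def common_prefix_len(a: list, b: list) -> int:
    # Length of the shared leading run of token ids.
    min_len = min(len(a), len(b))
    for i in range(min_len):
        if a[i] != b[i]:
            return i
    return min_len

def merge_tokens(prev: List[int], new: List[int]) -> List[int]:
    # Append only the suffix of the new hypothesis that extends the previous
    # one, mirroring the logic in StreamingSession._merge_tokens.
    prefix = common_prefix_len(prev, new)
    return prev + new[prefix:] if prefix < len(new) else prev

tokens: List[int] = []
for step in ([17, 42, 7], [17, 42, 7, 99, 3]):  # two streaming steps, illustrative ids
    tokens = merge_tokens(tokens, step)
print(tokens)  # [17, 42, 7, 99, 3]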