File size: 4,083 Bytes
4704268 fa1dce6 4704268 c913b1a 4704268 fa1dce6 4704268 fa1dce6 4704268 fa1dce6 4704268 c913b1a 4704268 c913b1a 4704268 c913b1a 4704268 c913b1a 4704268 9faef73 4704268 c913b1a 4704268 9faef73 4704268 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import gradio as gr
import nemo.collections.asr as nemo_asr
import numpy as np
import pandas as pd
from scipy import signal
# Sample rate (Hz) that audio is resampled to before it is handed to the model.
TARGET_SR = 16_000  # Hz

# Title and Markdown description rendered by the Gradio interface.
TITLE = "NVIDIA's Parakeet TDT 0.6B v2 Demo"
# NOTE: a trailing backslash joins the line with the next one, so every
# continued line must end with a space before the backslash to keep the
# words separated in the rendered text.
DESCRIPTION = """## Description
NVIDIA's parakeet-tdt-0.6b-v2 is a 600-million-parameter automatic \
speech recognition (ASR) model designed for high-quality English \
transcription, featuring support for punctuation, capitalization, \
and accurate timestamp prediction. This is a state-of-the-art model \
ideal for: accurate word-level timestamp predictions, automatic \
punctuation and capitalization, robust performance on spoken numbers, \
and song lyrics transcription.
### License
The license is commercial friendly:
> GOVERNING TERMS: Use of this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/deed.en).
### Contact
Need help adding transcription to your system? [Let's talk!](mailto:support@ridgerun.ai). \
At [RidgeRun.ai](https://ridgerun.ai) we'd love to help.
### Links of Interest
* [Model card](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2)
* [NVIDIA NeMo](https://github.com/NVIDIA/NeMo)
## Playground
"""

# Cached model instance; stays None until the model is first loaded.
_model = None
def _to_float32(x: np.ndarray) -> np.ndarray:
    """
    Return *x* as a float32 array, rescaling integer PCM into [-1, 1].

    Non-integer inputs are only cast to float32 (no rescaling, and no copy
    when the array is already float32). Integer inputs are scaled using the
    limits reported by ``np.iinfo`` for their dtype, so no per-format
    constants are needed:

    * signed types are divided by the largest magnitude the dtype can hold;
    * unsigned types are re-centred on the middle of their range and then
      divided by that midpoint.
    """
    dtype = x.dtype
    if np.issubdtype(dtype, np.integer):
        limits = np.iinfo(dtype)
        samples = x.astype(np.float32)

        if limits.min >= 0:
            # Unsigned PCM: e.g. uint8 has half_range 128.0, 0..255 -> -1..1
            half_range = (limits.max + 1) / 2
            return (samples - half_range) / half_range

        # Signed PCM: e.g. int16 -32768..32767 -> -1..1
        samples /= max(abs(limits.min), limits.max)
        return samples

    # Floating-point (or other non-integer) data: just normalise the dtype.
    return x.astype(np.float32, copy=False)
def _resample(audio: np.ndarray, rate: int, target_rate: int) -> np.ndarray:
    """
    Resample *audio* from *rate* to *target_rate* along axis 0.

    When the two rates already match, the input array is returned untouched
    (same object, same dtype). Otherwise the samples are converted with
    ``_to_float32`` and passed through ``scipy.signal.resample_poly`` using
    the up/down factors obtained by reducing the rate ratio by its greatest
    common divisor.
    """
    if rate != target_rate:
        # Reduce target_rate/rate to lowest terms for the polyphase filter.
        divisor = np.gcd(rate, target_rate)
        return signal.resample_poly(
            _to_float32(audio),
            up=target_rate // divisor,
            down=rate // divisor,
            axis=0,
        )
    return audio
def _load_model():
    """
    Return the module-wide ASR model, creating it on the first call.

    The instance is stored in the module-level ``_model`` variable so later
    calls reuse it instead of calling ``from_pretrained`` again.
    """
    global _model
    # Compare against the None sentinel explicitly: a truthiness test would
    # depend on whatever __bool__/__len__ the model object happens to define.
    if _model is None:
        _model = nemo_asr.models.ASRModel.from_pretrained(
            model_name="nvidia/parakeet-tdt-0.6b-v2"
        )
    return _model
def _to_pandas(prediction, keyword):
    """
    Build a timestamp table for one granularity of a prediction.

    Args:
        prediction: Object exposing a ``timestamp`` mapping; the entry under
            *keyword* is fed to ``pandas.DataFrame`` (presumably a list of
            per-item records - confirm against the model's output format).
        keyword: Key to read from ``prediction.timestamp``; also the name of
            the first column of the result.

    Returns:
        A DataFrame with exactly the columns ``[keyword, "start", "end"]``.
    """
    # Passing `columns=` (rather than selecting columns afterwards) keeps the
    # three expected columns even when there are no records at all, where a
    # post-hoc `df[[...]]` selection would raise KeyError.
    return pd.DataFrame(
        prediction.timestamp[keyword],
        columns=[keyword, "start", "end"],
    )
def _invoke_model(model, audio: np.ndarray):
    """
    Transcribe *audio* with *model* and collect text plus timestamp tables.

    ``model.transcribe`` is called with ``timestamps=True`` and only the
    first element of its result is used.

    Returns:
        ``(text, chars, words, segments)`` where ``text`` is the ``text``
        attribute of that first result and the remaining three items are the
        tables produced by ``_to_pandas`` for the "char", "word" and
        "segment" keys, in that order.
    """
    results = model.transcribe(audio=audio, timestamps=True)
    hypothesis = results[0]
    transcript = hypothesis.text
    char_table, word_table, segment_table = (
        _to_pandas(hypothesis, level) for level in ("char", "word", "segment")
    )
    return transcript, char_table, word_table, segment_table
def transcribe(audio: tuple[np.ndarray, int] | None):
if not audio:
return "No audio received. Please upload or record something"
rate, data = audio
model = _load_model()
data = _to_float32(data)
data = _resample(data, rate, TARGET_SR)
text, chars, words, segments = _invoke_model(model, data)
return text, segments, words, chars
def _build_app():
    """
    Assemble the Gradio interface around ``transcribe``.

    Components are created in the same order as they appear in the UI: the
    audio input first, then the transcription text box, then one table each
    for segments, words and characters.
    """
    audio_input = gr.Audio(
        sources=["upload", "microphone"],
        type="numpy",
        label="Upload or record audio",
    )
    transcript_box = gr.Textbox(label="Transcription", show_copy_button=True)
    # (table label, name of the first column) for each timestamp table.
    table_specs = (
        ("Segments", "Segment"),
        ("Words", "Word"),
        ("Characters", "Character"),
    )
    timestamp_tables = [
        gr.Dataframe(label=table_label, headers=[unit, "Start", "End"])
        for table_label, unit in table_specs
    ]
    return gr.Interface(
        fn=transcribe,
        inputs=audio_input,
        outputs=[transcript_box, *timestamp_tables],
        title=TITLE,
        description=DESCRIPTION,
    )


# Module-level interface object; launched only when run as a script.
app = _build_app()

if __name__ == "__main__":
    app.launch()
|