Commit 09ec1f1
Parent(s): 3fa7eb8
roman committed: "2nd"

Files changed:
- packages.txt +2 -0
- README.md +7 -7
- app.py +102 -0
- pre-requirements.txt +2 -0
- requirements.txt +1 -0
packages.txt
ADDED
@@ -0,0 +1,2 @@
+libsndfile1
+ffmpeg
README.md
CHANGED
@@ -1,13 +1,13 @@
 ---
-title: Asr
-emoji:
-colorFrom:
-colorTo:
-sdk:
-sdk_version:
+title: Streaming Asr Uk
+emoji: 🏢
+colorFrom: pink
+colorTo: purple
+sdk: gradio
+sdk_version: 3.50.2
 app_file: app.py
 pinned: false
-license:
+license: bsd-3-clause
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,102 @@
+import gradio as gr
+import numpy as np
+import librosa
+import torch
+
+from math import ceil
+import nemo.collections.asr as nemo_asr
+
+
+asr_model = nemo_asr.models.EncDecCTCModelBPE. \
+    from_pretrained("theodotus/stt_uk_squeezeformer_ctc_sm", map_location="cpu")
+
+asr_model.preprocessor.featurizer.dither = 0.0
+asr_model.preprocessor.featurizer.pad_to = 0
+asr_model.eval()
+asr_model.encoder.freeze()
+asr_model.decoder.freeze()
+
+
+buffer_len = 3.2
+chunk_len = 0.8
+total_buffer = round(buffer_len * asr_model.cfg.sample_rate)
+overhead_len = round((buffer_len - chunk_len) * asr_model.cfg.sample_rate)
+model_stride = 4
+
+
+
+model_stride_in_secs = asr_model.cfg.preprocessor.window_stride * model_stride
+tokens_per_chunk = ceil(chunk_len / model_stride_in_secs)
+mid_delay = ceil((chunk_len + (buffer_len - chunk_len) / 2) / model_stride_in_secs)
+
+
+
+def resample(audio):
+    audio_16k, sr = librosa.load(audio, sr=asr_model.cfg["sample_rate"],
+                                 mono=True, res_type='soxr_hq')
+    return audio_16k
+
+
+def model(audio_16k):
+    logits, logits_len, greedy_predictions = asr_model.forward(
+        input_signal=torch.tensor([audio_16k]),
+        input_signal_length=torch.tensor([len(audio_16k)])
+    )
+    return logits
+
+
+def decode_predictions(logits_list):
+    logits_len = logits_list[0].shape[1]
+    # cut overhead
+    cutted_logits = []
+    for idx in range(len(logits_list)):
+        start_cut = 0 if (idx == 0) else logits_len - 1 - mid_delay
+        end_cut = -1 if (idx == len(logits_list) - 1) else logits_len - 1 - mid_delay + tokens_per_chunk
+        logits = logits_list[idx][:, start_cut:end_cut]
+        cutted_logits.append(logits)
+
+    # join
+    logits = torch.cat(cutted_logits, axis=1)
+    logits_len = torch.tensor([logits.shape[1]])
+    current_hypotheses, all_hyp = asr_model.decoding.ctc_decoder_predictions_tensor(
+        logits, decoder_lengths=logits_len, return_hypotheses=False,
+    )
+
+    return current_hypotheses[0]
+
+
+def transcribe(audio, state):
+    if state is None:
+        state = [np.array([], dtype=np.float32), []]
+
+    audio_16k = resample(audio)
+
+    # join to audio sequence
+    state[0] = np.concatenate([state[0], audio_16k])
+
+    while len(state[0]) > total_buffer:
+        buffer = state[0][:total_buffer]
+        state[0] = state[0][total_buffer - overhead_len:]
+        # run model
+        logits = model(buffer)
+        # add logits
+        state[1].append(logits)
+
+    if len(state[1]) == 0:
+        text = ""
+    else:
+        text = decode_predictions(state[1])
+    return text, state
+
+
+gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Audio(source="microphone", type="filepath", streaming=True),
+        gr.State(None)
+    ],
+    outputs=[
+        "textbox",
+        "state"
+    ],
+    live=True).launch()
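A note on the chunking arithmetic in app.py, as a rough sketch of the values it derives at startup. The numbers below assume the model's usual 16 kHz sample rate and 0.01 s preprocessor window stride; neither constant is hard-coded in the diff, since app.py reads both from asr_model.cfg. Each incoming microphone chunk is appended to a rolling 3.2 s buffer, consecutive buffers overlap by 2.4 s, and decode_predictions keeps only the logit frames around the middle of each buffer before concatenating, which is the usual buffered-CTC way of stitching chunk transcripts without boundary artifacts.

    # Worked values under the assumed config (sample_rate=16000, window_stride=0.01)
    from math import ceil
    total_buffer = round(3.2 * 16000)                 # 51200 samples fed to the model per call
    overhead_len = round((3.2 - 0.8) * 16000)         # 38400 samples carried over between calls
    model_stride_in_secs = 0.01 * 4                   # 0.04 s of audio per logit frame
    tokens_per_chunk = ceil(0.8 / 0.04)               # 20 new logit frames kept per chunk
    mid_delay = ceil((0.8 + (3.2 - 0.8) / 2) / 0.04)  # 50 frames: the chunk plus half the overlap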
pre-requirements.txt
ADDED
@@ -0,0 +1,2 @@
+Cython
+torch
requirements.txt
ADDED
@@ -0,0 +1 @@
+nemo_toolkit[asr]