roman committed on
Commit
09ec1f1
1 Parent(s): 3fa7eb8
Files changed (5) hide show
  1. packages.txt +2 -0
  2. README.md +7 -7
  3. app.py +102 -0
  4. pre-requirements.txt +2 -0
  5. requirements.txt +1 -0
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libsndfile1
2
+ ffmpeg
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: Asr
3
- emoji: 🌍
4
- colorFrom: red
5
- colorTo: red
6
- sdk: streamlit
7
- sdk_version: 1.35.0
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Streaming Asr Uk
3
+ emoji: 🏢
4
+ colorFrom: pink
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.50.2
8
  app_file: app.py
9
  pinned: false
10
+ license: bsd-3-clause
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import numpy as np
import librosa
import torch

from math import ceil
import nemo.collections.asr as nemo_asr


# Pretrained Ukrainian CTC model (Squeezeformer encoder, BPE tokenizer),
# loaded onto CPU for the Space's hardware.
asr_model = nemo_asr.models.EncDecCTCModelBPE. \
                from_pretrained("theodotus/stt_uk_squeezeformer_ctc_sm",map_location="cpu")

# Disable feature dithering and padding so the same audio buffer always
# yields the same features, and freeze everything — inference only.
asr_model.preprocessor.featurizer.dither = 0.0
asr_model.preprocessor.featurizer.pad_to = 0
asr_model.eval()
asr_model.encoder.freeze()
asr_model.decoder.freeze()


# Streaming geometry: each inference window is buffer_len seconds of audio,
# of which chunk_len seconds are new; the remainder is overlapped context
# carried over from the previous window.
buffer_len = 3.2
chunk_len = 0.8
total_buffer = round(buffer_len * asr_model.cfg.sample_rate)      # window size in samples
overhead_len = round((buffer_len - chunk_len) * asr_model.cfg.sample_rate)  # overlap in samples
# NOTE(review): assumed encoder time-downsampling factor — confirm against
# the model's config before changing buffer/chunk lengths.
model_stride = 4


# Duration of one output (logit) frame, and how many frames correspond to
# the new chunk vs. the delay-compensated midpoint used when stitching.
model_stride_in_secs = asr_model.cfg.preprocessor.window_stride * model_stride
tokens_per_chunk = ceil(chunk_len / model_stride_in_secs)
mid_delay = ceil((chunk_len + (buffer_len - chunk_len) / 2) / model_stride_in_secs)
34
def resample(audio):
    """Load an audio file as mono float32 resampled to the model's rate.

    Args:
        audio: path to an audio file (gradio passes a temp-file path).

    Returns:
        1-D numpy array of samples at ``asr_model.cfg["sample_rate"]``.
    """
    target_sr = asr_model.cfg["sample_rate"]
    waveform, _sr = librosa.load(
        audio,
        sr=target_sr,
        mono=True,
        res_type='soxr_hq',
    )
    return waveform
38
+
39
+
40
def model(audio_16k):
    """Run the ASR model on one audio window and return raw CTC logits.

    Args:
        audio_16k: 1-D float32 numpy array of samples at the model rate.

    Returns:
        Logits tensor of shape (1, time, vocab) as produced by
        ``asr_model.forward``.
    """
    # inference_mode: the model is frozen/eval, so skip autograd tracking.
    # from_numpy + unsqueeze avoids the slow (and warning-emitting)
    # list-of-ndarray copy that torch.tensor([audio_16k]) performs.
    with torch.inference_mode():
        input_signal = torch.from_numpy(audio_16k).unsqueeze(0)
        input_length = torch.tensor([audio_16k.shape[0]])
        logits, logits_len, greedy_predictions = asr_model.forward(
            input_signal=input_signal,
            input_signal_length=input_length,
        )
    return logits
46
+
47
+
48
def decode_predictions(logits_list):
    """Stitch per-window logits into one sequence and CTC-decode to text.

    Consecutive windows overlap in audio, so each window's logits are
    trimmed to keep only its "new" frames before concatenation.

    Args:
        logits_list: list of (1, time, vocab) logit tensors, one per
            processed buffer, all with the same time length.

    Returns:
        The decoded transcript string for the whole stream so far.
    """
    logits_len = logits_list[0].shape[1]
    # cut overhead
    cutted_logits = []
    for idx in range(len(logits_list)):
        # First window keeps its start; later windows skip frames already
        # covered by the previous window (delay-compensated midpoint).
        start_cut = 0 if (idx==0) else logits_len - 1 - mid_delay
        # Last window keeps everything up to its final frame; middle
        # windows keep exactly tokens_per_chunk frames of new content.
        end_cut = -1 if (idx==len(logits_list)-1) else logits_len - 1 - mid_delay + tokens_per_chunk
        logits = logits_list[idx][:, start_cut:end_cut]
        cutted_logits.append(logits)

    # join
    logits = torch.cat(cutted_logits, axis=1)
    logits_len = torch.tensor([logits.shape[1]])
    current_hypotheses, all_hyp = asr_model.decoding.ctc_decoder_predictions_tensor(
        logits, decoder_lengths=logits_len, return_hypotheses=False,
    )

    return current_hypotheses[0]
66
+
67
+
68
def transcribe(audio, state):
    """Streaming callback: accumulate audio, run buffered inference, decode.

    Args:
        audio: path to the newest audio chunk written by gradio.
        state: ``[pending_samples, logits_chunks]`` carried between calls,
            or ``None`` on the first invocation.

    Returns:
        (transcript_so_far, updated_state)
    """
    if state is None:
        state = [np.array([], dtype=np.float32), []]

    # Append the newly received chunk to the pending-sample queue.
    state[0] = np.concatenate([state[0], resample(audio)])

    # Consume full windows; keep the trailing overlap as context for the
    # next window. NOTE(review): state[1] grows without bound over a long
    # session — acceptable for a demo, but worth confirming.
    while len(state[0]) > total_buffer:
        window = state[0][:total_buffer]
        state[0] = state[0][total_buffer - overhead_len:]
        state[1].append(model(window))

    text = decode_predictions(state[1]) if state[1] else ""
    return text, state
90
+
91
+
92
# Live microphone UI: gradio streams audio chunks (as temp-file paths) into
# transcribe(), threading its accumulator through the State input/output pair.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        gr.State(None)
    ],
    outputs=[
        "textbox",
        "state"
    ],
    live=True).launch()
pre-requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Cython
2
+ torch
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ nemo_toolkit[asr]