Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,49 +1,89 @@
|
|
1 |
from transformers import pipeline
|
|
|
2 |
import gradio as gr
|
3 |
-
|
|
|
|
|
4 |
|
5 |
-
|
6 |
-
#decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
|
7 |
-
p = pipeline("automatic-speech-recognition", model="aware-ai/wav2vec2-xls-r-1b-5gram-german")
|
8 |
-
ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
return
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
names = [
|
45 |
-
"ASR",
|
46 |
-
"GRAMMAR",
|
47 |
-
]
|
48 |
-
|
49 |
-
gr.TabbedInterface(interfaces, names).launch(server_name = "0.0.0.0", enable_queue=False)
|
|
|
1 |
from transformers import pipeline
|
2 |
+
import torch
|
3 |
import gradio as gr
|
4 |
+
import subprocess
|
5 |
+
import numpy as np
|
6 |
+
import time
|
7 |
|
8 |
+
# Speech-recognition pipeline backed by the `aware-ai/wav2vec2-base-german` checkpoint.
p = pipeline(
    task="automatic-speech-recognition",
    model="aware-ai/wav2vec2-base-german",
)

# Voice-activity-detection model and its helper utilities, loaded through
# torch.hub from the `snakers4/silero-vad` repository (ONNX variant requested).
model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    force_reload=False,
    onnx=True,
)
|
12 |
+
|
13 |
+
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Decode an in-memory audio file into raw samples by piping it through ffmpeg.

    The payload is written to ffmpeg's stdin; ffmpeg is asked to downmix to a
    single channel, resample to ``sampling_rate`` and emit headerless
    little-endian 32-bit float PCM on stdout.

    Args:
        bpayload: Bytes of the encoded audio file.
        sampling_rate: Target sampling rate passed to ffmpeg's ``-ar`` option.

    Returns:
        A one-dimensional ``np.float32`` array with the decoded samples. The
        array is a view over the bytes returned by ffmpeg, so it is read-only.

    Raises:
        ValueError: If the ``ffmpeg`` executable cannot be found, or if ffmpeg
            produced no samples for the given payload.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i", "pipe:0",              # read the encoded audio from stdin
        "-ac", "1",                  # single channel
        "-ar", str(sampling_rate),   # resample to the requested rate
        "-f", "f32le",               # raw float32 little-endian, no container
        "-hide_banner",
        "-loglevel", "quiet",
        "pipe:1",                    # write the decoded samples to stdout
    ]

    try:
        # ffmpeg's exit status is deliberately not checked: a failed decode
        # yields empty stdout, which is reported below.
        completed = subprocess.run(
            ffmpeg_command,
            input=bpayload,
            stdout=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError as error:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error

    audio = np.frombuffer(completed.stdout, np.float32)
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    return audio
|
46 |
+
|
47 |
+
# Unpack the helpers from `utils`: the first entry is the speech-timestamp
# function, the third is the audio reader; every other entry is discarded.
get_speech_timestamps, _, read_audio, *_ = utils
|
50 |
+
|
51 |
+
def is_speech(wav, sr):
    """
    Tell whether the voice-activity detector finds any speech in ``wav``.

    Calls ``get_speech_timestamps`` with the module-level ``model`` and the
    sampling rate ``sr``; returns ``True`` when at least one speech segment is
    reported, ``False`` otherwise.
    """
    return len(get_speech_timestamps(wav, model, sampling_rate=sr)) > 0
|
56 |
+
|
57 |
+
def transcribe(audio, state=None):
    """
    Handle one streamed audio chunk and return the updated transcript.

    The chunk is decoded to 16 kHz mono samples and run through the
    voice-activity check. Chunks containing speech are appended to the buffer
    kept in ``state["audio"]``. On the first chunk without speech, the
    buffered audio is transcribed with ``p``, the text is appended to
    ``state["text"]`` and the buffer is reset.

    Args:
        audio: Path of the audio file holding the current chunk.
        state: Session dict with the keys ``"text"``, ``"temp_text"`` and
            ``"audio"``; ``None`` starts a fresh session. ``"audio"`` holds
            ``""`` while nothing is buffered and a NumPy array otherwise.

    Returns:
        A ``(display_text, state)`` tuple: the formatted transcript and the
        (mutated) state dict to be passed back in on the next call.

    Raises:
        ValueError: Propagated from ``ffmpeg_read`` when the chunk cannot be
            decoded.
    """
    sample_rate = 16000

    if state is None:
        # Build the dict per call; a dict literal as the default value would
        # be shared (and accumulate text) across unrelated callers.
        state = {"text": "", "temp_text": "", "audio": ""}

    with open(audio, "rb") as f:
        payload = f.read()
    # The decoded samples must be bound to `wav_data`: that is the name every
    # later statement uses (the original bound them to `audio` -> NameError).
    wav_data = ffmpeg_read(payload, sampling_rate=sample_rate)

    buffered = state["audio"]
    # "" marks an empty buffer. Test the type rather than using `is ""`, whose
    # result depends on string interning and is flagged as a SyntaxWarning.
    has_buffer = isinstance(buffered, np.ndarray)

    if is_speech(wav_data, sample_rate):
        if has_buffer:
            state["audio"] = np.concatenate((buffered, wav_data))
        else:
            state["audio"] = wav_data
    elif has_buffer:
        # Silence after speech: flush the buffered utterance into the transcript.
        # NOTE(review): the source's indentation was lost; the flush is assumed
        # to happen only when audio is buffered - confirm against the repo.
        state["temp_text"] = p(buffered)["text"] + "\n"
        state["text"] += state["temp_text"]
        state["temp_text"] = ""
        state["audio"] = ""

    # Kept from the original; presumably throttles the live stream - TODO confirm.
    time.sleep(0.5)
    return f'{state["text"]} ( {state["temp_text"]} )', state
|
82 |
+
|
83 |
+
# Live UI: the streaming microphone input is handed to `transcribe` as a file
# path together with the session state; the outputs are the transcript text
# box and the updated state.
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state",
    ],
    outputs=[gr.Textbox(), "state"],
    live=True,
).launch(server_name="0.0.0.0")
|
|
|
|
|
|
|
|
|
|
|
|
|
|