Spaces:
Runtime error
Runtime error
viktor-enzell
committed on
Commit
•
341e55d
1
Parent(s):
3412de9
Added file conversion to uploaded files. Added chunking to allow transcribing long audio files.
Browse files
- .gitignore +2 -0
- app.py +49 -36
- requirements.txt +1 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.DS_Store
|
2 |
+
experiments.ipynb
|
app.py
CHANGED
@@ -1,25 +1,28 @@
|
|
1 |
import streamlit as st
|
2 |
-
from transformers import
|
3 |
-
import
|
4 |
import torchaudio
|
5 |
import torchaudio.functional as F
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
class ASR:
|
9 |
def __init__(self):
|
10 |
self.model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
|
11 |
-
self.device =
|
12 |
-
"cuda" if torch.cuda.is_available() else "cpu")
|
13 |
self.model = None
|
14 |
-
self.processor = None
|
15 |
|
16 |
def load_model(self):
|
17 |
-
self.model =
|
18 |
-
self.model_name).to(self.device)
|
19 |
-
self.processor = Wav2Vec2ProcessorWithLM.from_pretrained(
|
20 |
-
self.model_name)
|
21 |
|
22 |
def run_inference(self, file):
|
|
|
|
|
|
|
|
|
|
|
23 |
waveform, sample_rate = torchaudio.load(file)
|
24 |
|
25 |
if sample_rate == 16_000:
|
@@ -27,17 +30,7 @@ class ASR:
|
|
27 |
else:
|
28 |
waveform = F.resample(waveform, sample_rate, 16_000)[0]
|
29 |
|
30 |
-
|
31 |
-
waveform,
|
32 |
-
sampling_rate=16_000,
|
33 |
-
return_tensors="pt",
|
34 |
-
padding=True
|
35 |
-
).to(self.device)
|
36 |
-
|
37 |
-
with torch.no_grad():
|
38 |
-
logits = self.model(**inputs).logits
|
39 |
-
|
40 |
-
return self.processor.batch_decode(logits.cpu().numpy()).text[0].lower()
|
41 |
|
42 |
|
43 |
@st.cache(allow_output_mutation=True, show_spinner=False)
|
@@ -52,6 +45,26 @@ def run_inference(asr, file):
|
|
52 |
return asr.run_inference(file)
|
53 |
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
if __name__ == "__main__":
|
56 |
st.set_page_config(
|
57 |
page_title="Swedish Speech-to-Text",
|
@@ -64,25 +77,25 @@ if __name__ == "__main__":
|
|
64 |
st.markdown("""
|
65 |
# Swedish Speech-to-text
|
66 |
|
67 |
-
Generate and download high-quality Swedish transcripts for your audio files. The speech-to-text model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
|
68 |
""")
|
69 |
|
70 |
with st.spinner(text="Loading model..."):
|
71 |
asr = load_model()
|
72 |
|
73 |
-
uploaded_file = st.file_uploader("Choose a file"
|
|
|
74 |
if uploaded_file is not None:
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
st.write(transcript)
|
|
|
1 |
import streamlit as st
|
2 |
+
from transformers import pipeline
|
3 |
+
from torch import cuda
|
4 |
import torchaudio
|
5 |
import torchaudio.functional as F
|
6 |
+
from pydub import AudioSegment
|
7 |
+
import logging
|
8 |
+
import io
|
9 |
|
10 |
|
11 |
class ASR:
|
12 |
def __init__(self):
|
13 |
self.model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
|
14 |
+
self.device = cuda.current_device() if cuda.is_available() else -1
|
|
|
15 |
self.model = None
|
|
|
16 |
|
17 |
def load_model(self):
|
18 |
+
self.model = pipeline(model=self.model_name, device=self.device)
|
|
|
|
|
|
|
19 |
|
20 |
def run_inference(self, file):
|
21 |
+
audio = self.load_16khz_audio(file)
|
22 |
+
return self.model(audio, chunk_length_s=10)["text"].lower()
|
23 |
+
|
24 |
+
@staticmethod
|
25 |
+
def load_16khz_audio(file):
|
26 |
waveform, sample_rate = torchaudio.load(file)
|
27 |
|
28 |
if sample_rate == 16_000:
|
|
|
30 |
else:
|
31 |
waveform = F.resample(waveform, sample_rate, 16_000)[0]
|
32 |
|
33 |
+
return waveform.numpy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
|
36 |
@st.cache(allow_output_mutation=True, show_spinner=False)
|
|
|
45 |
return asr.run_inference(file)
|
46 |
|
47 |
|
48 |
+
def convert_uploaded_file_to_wav(file):
|
49 |
+
try:
|
50 |
+
media_type = file.type.split("/")[0]
|
51 |
+
file_extension = file.name.split(".")[-1]
|
52 |
+
|
53 |
+
if media_type != "audio" and media_type != "video":
|
54 |
+
return None
|
55 |
+
|
56 |
+
if file_extension == "wav":
|
57 |
+
return file
|
58 |
+
|
59 |
+
audio = AudioSegment.from_file(file, file_extension)
|
60 |
+
in_memory_buffer = io.BytesIO()
|
61 |
+
return audio.export(in_memory_buffer, format="wav")
|
62 |
+
|
63 |
+
except Exception as e:
|
64 |
+
logging.exception(e)
|
65 |
+
return None
|
66 |
+
|
67 |
+
|
68 |
if __name__ == "__main__":
|
69 |
st.set_page_config(
|
70 |
page_title="Swedish Speech-to-Text",
|
|
|
77 |
st.markdown("""
|
78 |
# Swedish Speech-to-text
|
79 |
|
80 |
+
Generate and download high-quality Swedish transcripts for your audio and video files. The speech-to-text model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
|
81 |
""")
|
82 |
|
83 |
with st.spinner(text="Loading model..."):
|
84 |
asr = load_model()
|
85 |
|
86 |
+
uploaded_file = st.file_uploader("Choose a file")
|
87 |
+
|
88 |
if uploaded_file is not None:
|
89 |
+
file = convert_uploaded_file_to_wav(uploaded_file)
|
90 |
+
|
91 |
+
if file is None:
|
92 |
+
st.error(
|
93 |
+
"There was a problem handling the uploaded file. Try again using an audio or video file.")
|
94 |
+
else:
|
95 |
+
with st.spinner(text="Transcribing..."):
|
96 |
+
transcript = run_inference(asr, file)
|
97 |
+
st.download_button("Download transcript",
|
98 |
+
transcript, "transcript.txt")
|
99 |
+
|
100 |
+
with st.expander("Transcript", expanded=True):
|
101 |
+
st.write(transcript)
|
|
requirements.txt
CHANGED
@@ -3,3 +3,4 @@ torchaudio==0.10.1
|
|
3 |
transformers==4.19.2
|
4 |
pyctcdecode==0.3.0
|
5 |
https://github.com/kpu/kenlm/archive/master.zip
|
|
|
|
3 |
transformers==4.19.2
|
4 |
pyctcdecode==0.3.0
|
5 |
https://github.com/kpu/kenlm/archive/master.zip
|
6 |
+
pydub==0.25.1
|