viktor-enzell committed on
Commit
341e55d
1 Parent(s): 3412de9

Added file conversion for uploaded files. Added chunking to allow transcribing long audio files.

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. app.py +49 -36
  3. requirements.txt +1 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .DS_Store
2
+ experiments.ipynb
app.py CHANGED
@@ -1,25 +1,28 @@
1
  import streamlit as st
2
- from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
3
- import torch
4
  import torchaudio
5
  import torchaudio.functional as F
 
 
 
6
 
7
 
8
  class ASR:
9
  def __init__(self):
10
  self.model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
11
- self.device = torch.device(
12
- "cuda" if torch.cuda.is_available() else "cpu")
13
  self.model = None
14
- self.processor = None
15
 
16
  def load_model(self):
17
- self.model = Wav2Vec2ForCTC.from_pretrained(
18
- self.model_name).to(self.device)
19
- self.processor = Wav2Vec2ProcessorWithLM.from_pretrained(
20
- self.model_name)
21
 
22
  def run_inference(self, file):
 
 
 
 
 
23
  waveform, sample_rate = torchaudio.load(file)
24
 
25
  if sample_rate == 16_000:
@@ -27,17 +30,7 @@ class ASR:
27
  else:
28
  waveform = F.resample(waveform, sample_rate, 16_000)[0]
29
 
30
- inputs = self.processor(
31
- waveform,
32
- sampling_rate=16_000,
33
- return_tensors="pt",
34
- padding=True
35
- ).to(self.device)
36
-
37
- with torch.no_grad():
38
- logits = self.model(**inputs).logits
39
-
40
- return self.processor.batch_decode(logits.cpu().numpy()).text[0].lower()
41
 
42
 
43
  @st.cache(allow_output_mutation=True, show_spinner=False)
@@ -52,6 +45,26 @@ def run_inference(asr, file):
52
  return asr.run_inference(file)
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  if __name__ == "__main__":
56
  st.set_page_config(
57
  page_title="Swedish Speech-to-Text",
@@ -64,25 +77,25 @@ if __name__ == "__main__":
64
  st.markdown("""
65
  # Swedish Speech-to-text
66
 
67
- Generate and download high-quality Swedish transcripts for your audio files. The speech-to-text model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
68
  """)
69
 
70
  with st.spinner(text="Loading model..."):
71
  asr = load_model()
72
 
73
- uploaded_file = st.file_uploader("Choose a file", type=[".wav"])
 
74
  if uploaded_file is not None:
75
- if uploaded_file.type != "audio/wav":
76
- pass
77
- # TODO: convert to wav
78
- # bytes = uploaded_file.getvalue()
79
- # audio_input = ffmpeg.input(bytes).audio
80
- # audio_output = ffmpeg.output(audio_input, "tmp.wav", format="wav")
81
- # ffmpeg.run(audio_output)
82
-
83
- with st.spinner(text="Transcribing..."):
84
- transcript = run_inference(asr, uploaded_file)
85
- st.download_button("Download transcript", transcript, "transcript.txt")
86
-
87
- with st.expander("Transcript", expanded=True):
88
- st.write(transcript)
 
1
  import streamlit as st
2
+ from transformers import pipeline
3
+ from torch import cuda
4
  import torchaudio
5
  import torchaudio.functional as F
6
+ from pydub import AudioSegment
7
+ import logging
8
+ import io
9
 
10
 
11
  class ASR:
12
  def __init__(self):
13
  self.model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
14
+ self.device = cuda.current_device() if cuda.is_available() else -1
 
15
  self.model = None
 
16
 
17
  def load_model(self):
18
+ self.model = pipeline(model=self.model_name, device=self.device)
 
 
 
19
 
20
  def run_inference(self, file):
21
+ audio = self.load_16khz_audio(file)
22
+ return self.model(audio, chunk_length_s=10)["text"].lower()
23
+
24
+ @staticmethod
25
+ def load_16khz_audio(file):
26
  waveform, sample_rate = torchaudio.load(file)
27
 
28
  if sample_rate == 16_000:
 
30
  else:
31
  waveform = F.resample(waveform, sample_rate, 16_000)[0]
32
 
33
+ return waveform.numpy()
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  @st.cache(allow_output_mutation=True, show_spinner=False)
 
45
  return asr.run_inference(file)
46
 
47
 
48
+ def convert_uploaded_file_to_wav(file):
49
+ try:
50
+ media_type = file.type.split("/")[0]
51
+ file_extension = file.name.split(".")[-1]
52
+
53
+ if media_type != "audio" and media_type != "video":
54
+ return None
55
+
56
+ if file_extension == "wav":
57
+ return file
58
+
59
+ audio = AudioSegment.from_file(file, file_extension)
60
+ in_memory_buffer = io.BytesIO()
61
+ return audio.export(in_memory_buffer, format="wav")
62
+
63
+ except Exception as e:
64
+ logging.exception(e)
65
+ return None
66
+
67
+
68
  if __name__ == "__main__":
69
  st.set_page_config(
70
  page_title="Swedish Speech-to-Text",
 
77
  st.markdown("""
78
  # Swedish Speech-to-text
79
 
80
+ Generate and download high-quality Swedish transcripts for your audio and video files. The speech-to-text model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
81
  """)
82
 
83
  with st.spinner(text="Loading model..."):
84
  asr = load_model()
85
 
86
+ uploaded_file = st.file_uploader("Choose a file")
87
+
88
  if uploaded_file is not None:
89
+ file = convert_uploaded_file_to_wav(uploaded_file)
90
+
91
+ if file is None:
92
+ st.error(
93
+ "There was a problem handling the uploaded file. Try again using an audio or video file.")
94
+ else:
95
+ with st.spinner(text="Transcribing..."):
96
+ transcript = run_inference(asr, file)
97
+ st.download_button("Download transcript",
98
+ transcript, "transcript.txt")
99
+
100
+ with st.expander("Transcript", expanded=True):
101
+ st.write(transcript)
 
requirements.txt CHANGED
@@ -3,3 +3,4 @@ torchaudio==0.10.1
3
  transformers==4.19.2
4
  pyctcdecode==0.3.0
5
  https://github.com/kpu/kenlm/archive/master.zip
 
 
3
  transformers==4.19.2
4
  pyctcdecode==0.3.0
5
  https://github.com/kpu/kenlm/archive/master.zip
6
+ pydub==0.25.1