# Page header captured with the file (not part of the program):
#   viktor-enzell's picture
#   Added file conversion to uploaded files. Added chunking to allow transcribing long audio files.
#   341e55d
import streamlit as st
from transformers import pipeline
from torch import cuda
import torchaudio
import torchaudio.functional as F
from pydub import AudioSegment
import logging
import io
class ASR:
    """Speech recognizer wrapping a Hugging Face ``pipeline``.

    Construction is cheap: the model is only built by :meth:`load_model`,
    which :meth:`run_inference` calls automatically on first use if the
    caller has not done so already.
    """

    def __init__(self):
        self.model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
        # Current CUDA device index when CUDA is available, otherwise -1
        # (the value the transformers pipeline API uses for CPU).
        self.device = cuda.current_device() if cuda.is_available() else -1
        # Populated by load_model(); None means "not loaded yet".
        self.model = None

    def load_model(self):
        """Build the pipeline for ``model_name`` on ``device`` and keep it."""
        self.model = pipeline(model=self.model_name, device=self.device)

    def run_inference(self, file):
        """Transcribe ``file`` and return the recognized text in lower case.

        Args:
            file: Anything ``torchaudio.load`` accepts (path or file-like).

        Returns:
            The ``"text"`` entry of the pipeline output, lower-cased.
        """
        # Load lazily instead of failing with "'NoneType' is not callable"
        # when the caller forgot to call load_model() first.
        if self.model is None:
            self.load_model()
        audio = self.load_16khz_audio(file)
        # chunk_length_s makes the pipeline process the audio in 10 s chunks,
        # which is what allows long recordings to be transcribed.
        return self.model(audio, chunk_length_s=10)["text"].lower()

    @staticmethod
    def load_16khz_audio(file):
        """Load ``file`` and return one channel as a 16 kHz NumPy array.

        The audio is resampled only when its sample rate differs from
        16 kHz. Only index 0 of the loaded waveform (the first channel with
        torchaudio's default channels-first layout) is kept; any other
        channels are discarded, not mixed down.
        """
        waveform, sample_rate = torchaudio.load(file)
        if sample_rate != 16_000:
            waveform = F.resample(waveform, sample_rate, 16_000)
        return waveform[0].numpy()
@st.cache(allow_output_mutation=True, show_spinner=False)
def load_model():
    """Return an ``ASR`` instance whose model has been loaded.

    The function is wrapped in ``st.cache`` so the result can be reused
    instead of rebuilding the model on every call; the built-in spinner is
    disabled because the caller shows its own.
    """
    recognizer = ASR()
    recognizer.load_model()
    return recognizer
@st.cache(allow_output_mutation=True, hash_funcs={ASR: lambda _: None}, show_spinner=False)
def run_inference(asr, file):
    """Transcribe ``file`` with ``asr`` and return the transcript.

    Cached with ``st.cache``. The ``hash_funcs`` entry maps every ``ASR``
    instance to the same constant, so the recognizer itself never
    contributes to the cache key.
    """
    transcript = asr.run_inference(file)
    return transcript
def convert_uploaded_file_to_wav(file):
    """Return the uploaded audio/video file as WAV, or ``None`` on failure.

    Args:
        file: Uploaded file object exposing ``type`` (a MIME type string such
            as ``"audio/mpeg"``) and ``name``, and readable by
            ``AudioSegment.from_file``.

    Returns:
        * ``file`` itself when its name already ends in ``.wav``
          (compared case-insensitively, so ``clip.WAV`` is not re-encoded);
        * otherwise the result of exporting the decoded audio as WAV into
          an in-memory buffer;
        * ``None`` when the media type is neither audio nor video, or when
          anything goes wrong (the exception is logged, not raised).
    """
    try:
        media_type = file.type.split("/")[0]
        if media_type not in ("audio", "video"):
            return None

        # rpartition leaves `dot` empty when the name has no "." at all, so
        # the bare file name is never mistaken for a format.
        _, dot, extension = file.name.rpartition(".")
        extension = extension.lower() if dot else ""
        if extension == "wav":
            return file

        # With no usable extension, pass None so the decoder detects the
        # format itself instead of being given a bogus one.
        audio = AudioSegment.from_file(file, extension or None)
        in_memory_buffer = io.BytesIO()
        return audio.export(in_memory_buffer, format="wav")
    except Exception as e:
        # Best-effort boundary: the caller turns None into a user-facing error.
        logging.exception(e)
        return None
def main():
    """Render the page: header, file upload, transcription and download."""
    st.set_page_config(
        page_title="Swedish Speech-to-Text",
        # Studio-microphone emoji, the same glyph as the header image below.
        page_icon="🎙️"
    )
    st.image(
        "https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/320/apple/325/studio-microphone_1f399-fe0f.png",
        width=100,
    )
    st.markdown("""
# Swedish Speech-to-text
Generate and download high-quality Swedish transcripts for your audio and video files. The speech-to-text model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
""")

    with st.spinner(text="Loading model..."):
        asr = load_model()

    uploaded_file = st.file_uploader("Choose a file")
    if uploaded_file is None:
        # Nothing uploaded yet; only the header and uploader are shown.
        return

    file = convert_uploaded_file_to_wav(uploaded_file)
    if file is None:
        st.error(
            "There was a problem handling the uploaded file. Try again using an audio or video file.")
        return

    with st.spinner(text="Transcribing..."):
        transcript = run_inference(asr, file)

    st.download_button("Download transcript",
                       transcript, "transcript.txt")
    with st.expander("Transcript", expanded=True):
        st.write(transcript)


if __name__ == "__main__":
    main()