|
import streamlit as st |
|
import torch |
|
from transformers import pipeline |
|
import soundfile as sf |
|
import io |
|
import numpy as np |
|
from datetime import timedelta |
|
|
|
|
|
# Page-level Streamlit configuration.
# NOTE(review): Streamlit requires set_page_config to be the first st.* call
# executed in the script — keep this above any other Streamlit statement.
st.set_page_config(
    page_title="Audio Transcription with Whisper",
    page_icon="ποΈ",
    layout="wide"
)
|
|
|
|
|
@st.cache_resource
def load_model():
    """Build and cache the Whisper ASR pipeline for the lifetime of the app.

    Cached via st.cache_resource so the model is loaded once per process,
    not on every Streamlit rerun. Runs on the first CUDA device when one
    is available, otherwise on the CPU.
    """
    use_gpu = torch.cuda.is_available()
    return pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        chunk_length_s=30,
        device="cuda:0" if use_gpu else "cpu",
    )
|
|
|
def format_timestamp(seconds):
    """Convert a duration in seconds to a zero-padded HH:MM:SS string.

    Accepts ints or floats; fractional seconds are truncated (matching the
    previous behavior of splitting off the microsecond part). Whisper may
    report the end timestamp of the final chunk as None — return a
    placeholder instead of raising in that case.
    """
    # Guard: chunk['timestamp'][1] can be None for the last Whisper chunk;
    # timedelta(seconds=None) would raise TypeError here.
    if seconds is None:
        return "--:--:--"
    total = int(seconds)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    # str(timedelta(...)) yields 'H:MM:SS' with an unpadded hour (e.g.
    # '0:01:05'), contradicting the documented HH:MM:SS format — format
    # explicitly instead.
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"
|
|
|
def process_audio(audio_file, return_timestamps=False):
    """Transcribe an uploaded audio file with the cached Whisper pipeline.

    Parameters
    ----------
    audio_file : file-like object with ``read()``/``seek()``
        Typically a Streamlit ``UploadedFile``.
    return_timestamps : bool
        When True, return the list of timestamped chunks; otherwise the
        plain transcription text.

    Returns
    -------
    list | str
        Chunk dicts (timestamp mode), the transcription text, or an
        "Error processing audio: ..." string on failure — callers detect
        failure via the "Error" prefix / a non-list result.
    """
    try:
        # Rewind first: st.audio() or a previous transcription run may have
        # consumed the upload stream, and a second read() would then return
        # empty bytes and produce a bogus "empty audio" failure.
        audio_file.seek(0)
        audio_bytes = audio_file.read()
        audio_array, sampling_rate = sf.read(io.BytesIO(audio_bytes))

        # Downmix multi-channel audio to mono by averaging channels.
        if len(audio_array.shape) > 1:
            audio_array = audio_array.mean(axis=1)

        # Single pipeline call for both modes; return_timestamps selects
        # whether chunk-level timestamps are produced.
        pipe = load_model()
        result = pipe(
            {"sampling_rate": sampling_rate, "raw": audio_array},
            batch_size=8,
            return_timestamps=return_timestamps,
        )
        return result["chunks"] if return_timestamps else result["text"]

    except Exception as e:
        # Deliberate best-effort: keep the app alive on undecodable files;
        # the UI layer checks for this "Error" prefix.
        return f"Error processing audio: {str(e)}"
|
|
|
|
|
# Sidebar: user-facing settings plus static information about the model
# and the audio formats the uploader accepts.
with st.sidebar:
    st.header("Settings")
    # Single toggle read by the main column when a transcription is run.
    show_timestamps = st.checkbox("Show timestamps", value=False)

    st.markdown("---")
    st.markdown("### Model Information")
    # Device label is filled at render time; the same CUDA check drives the
    # pipeline device in load_model, so the two stay in sync.
    st.markdown("""
    - Model: OpenAI Whisper (small)
    - Device: {device}
    - Max chunk length: 30 seconds
    """.format(device="GPU" if torch.cuda.is_available() else "CPU"))

    st.markdown("---")
    st.markdown("### Supported Audio Formats")
    # Keep this list in sync with the file_uploader `type=` filter below.
    st.markdown("""
    - WAV
    - FLAC
    - MP3
    - OGG
    """)
|
|
|
|
|
# Main page header.
st.title("ποΈ Audio Transcription with Whisper")
st.markdown("Upload an audio file to get its transcription")

# Accept only the formats advertised in the sidebar; the widget returns
# None until the user has picked a file.
audio_file = st.file_uploader("Choose an audio file",
                              type=['wav', 'mp3', 'ogg', 'flac'])
|
|
|
# Main two-column layout: file preview/details on the left, transcription
# controls and output on the right. Only rendered once a file is uploaded.
if audio_file is not None:

    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("Audio File")
        st.audio(audio_file)

        # Basic metadata so the user can confirm the right file was loaded.
        file_details = {
            "Filename": audio_file.name,
            "File size": f"{audio_file.size / 1024:.2f} KB",
            "File type": audio_file.type
        }
        st.json(file_details)

    with col2:
        st.subheader("Transcription")

        # Explicit sentinel instead of the fragile `'transcription' in
        # locals()` check the original used: None means no successful
        # plain-text transcription was produced on this run.
        transcription = None

        if st.button("Start Transcription"):
            with st.spinner("Processing audio..."):
                if show_timestamps:
                    results = process_audio(audio_file, return_timestamps=True)

                    # process_audio returns a list of chunk dicts on success
                    # and an error string on failure.
                    if isinstance(results, list):
                        for chunk in results:
                            st.markdown(f"""
                            **[{format_timestamp(chunk['timestamp'][0])} - {format_timestamp(chunk['timestamp'][1])}]**
                            {chunk['text']}
                            """)
                    else:
                        st.error(results)
                else:
                    result = process_audio(audio_file, return_timestamps=False)
                    if not result.startswith("Error"):
                        st.write(result)
                        transcription = result
                    else:
                        st.error(result)

        # Offer a download only when a plain-text transcription succeeded.
        if transcription is not None:
            st.download_button(
                label="Download Transcription",
                data=transcription,
                file_name=f"{audio_file.name}_transcription.txt",
                mime="text/plain"
            )
|
|
|
|
|
# Footer credits.
st.markdown("---")
st.markdown("""
Built with:
- Streamlit
- OpenAI Whisper
- π€ Transformers
""")

st.sidebar.markdown("---")
# Surface a hint when no GPU is present, since CPU inference will be
# slower (as the warning text itself says).
if not torch.cuda.is_available():
    st.sidebar.warning("β οΈ Running on CPU. Processing might be slower.")