import streamlit as st
import torch
from transformers import pipeline
import soundfile as sf
import io
import numpy as np
from datetime import timedelta
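# Note: sf.read() decodes through libsndfile. MP3 decoding is only available
# with libsndfile >= 1.1 (bundled with recent soundfile wheels), so MP3
# uploads may fail on older installs and surface in the error path below.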
# Set page configuration
st.set_page_config(
    page_title="Audio Transcription with Whisper",
    page_icon="🎙️",
    layout="wide"
)
# Initialize speech recognition pipeline
@st.cache_resource
def load_model():
    # Use the first GPU when available, otherwise fall back to CPU
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        chunk_length_s=30,
        device=device,
    )
    return pipe
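# The ASR pipeline accepts raw audio as {"sampling_rate": int, "raw": np.ndarray}.
# Whisper itself operates on 16 kHz audio; recent transformers versions resample
# other rates internally (this may require torchaudio to be installed).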
def format_timestamp(seconds):
    """Convert seconds to an H:MM:SS string (fractional seconds dropped)."""
    return str(timedelta(seconds=seconds)).split('.')[0]
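# For example, format_timestamp(3661.5) returns "1:01:01"; note that
# str(timedelta) does not zero-pad the hours field.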
def process_audio(audio_file, return_timestamps=False):
    """Transcribe an uploaded audio file; returns text, chunks, or an error string."""
    try:
        # Rewind first: st.audio() may already have read the buffer to the end
        audio_file.seek(0)
        audio_bytes = audio_file.read()
        audio_array, sampling_rate = sf.read(io.BytesIO(audio_bytes))
        # Downmix stereo to mono by averaging the channels
        if len(audio_array.shape) > 1:
            audio_array = audio_array.mean(axis=1)
        # Run the cached Whisper pipeline
        pipe = load_model()
        if return_timestamps:
            result = pipe(
                {"sampling_rate": sampling_rate, "raw": audio_array},
                batch_size=8,
                return_timestamps=True,
            )
            return result["chunks"]
        else:
            result = pipe(
                {"sampling_rate": sampling_rate, "raw": audio_array},
                batch_size=8,
            )
            return result["text"]
    except Exception as e:
        return f"Error processing audio: {str(e)}"
# Sidebar
with st.sidebar:
    st.header("Settings")
    show_timestamps = st.checkbox("Show timestamps", value=False)
    st.markdown("---")
    st.markdown("### Model Information")
    st.markdown("""
    - Model: OpenAI Whisper (small)
    - Device: {device}
    - Max chunk length: 30 seconds
    """.format(device="GPU" if torch.cuda.is_available() else "CPU"))
    st.markdown("---")
    st.markdown("### Supported Audio Formats")
    st.markdown("""
    - WAV
    - FLAC
    - MP3
    - OGG
    """)
# Main app
st.title("🎙️ Audio Transcription with Whisper")
st.markdown("Upload an audio file to get its transcription")
# File uploader
audio_file = st.file_uploader("Choose an audio file",
                              type=['wav', 'mp3', 'ogg', 'flac'])
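# st.file_uploader returns an UploadedFile (a BytesIO-like object) or None.
# The same object is passed to st.audio for playback and to process_audio,
# which is why process_audio rewinds it with seek(0) before reading.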
if audio_file is not None:
    # Create two columns: player/details on the left, transcription on the right
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("Audio File")
        st.audio(audio_file)
        file_details = {
            "Filename": audio_file.name,
            "File size": f"{audio_file.size / 1024:.2f} KB",
            "File type": audio_file.type
        }
        st.json(file_details)

    with col2:
        st.subheader("Transcription")
        if st.button("Start Transcription"):
            with st.spinner("Processing audio..."):
                if show_timestamps:
                    results = process_audio(audio_file, return_timestamps=True)
                    if isinstance(results, list):
                        for chunk in results:
                            start, end = chunk['timestamp']
                            # The end timestamp of the final chunk can be None
                            end_label = format_timestamp(end) if end is not None else "end"
                            st.markdown(
                                f"**[{format_timestamp(start)} - {end_label}]** "
                                f"{chunk['text']}"
                            )
                    else:
                        st.error(results)
                else:
                    transcription = process_audio(audio_file, return_timestamps=False)
                    if not transcription.startswith("Error"):
                        st.write(transcription)
                    else:
                        st.error(transcription)

    # Download transcription (set only by the plain-text branch above)
    if 'transcription' in locals():
        if not transcription.startswith("Error"):
            st.download_button(
                label="Download Transcription",
                data=transcription,
                file_name=f"{audio_file.name}_transcription.txt",
                mime="text/plain"
            )
# Footer
st.markdown("---")
st.markdown("""
Built with:
- Streamlit
- OpenAI Whisper
- 🤗 Transformers
""")
# Display warnings/info
st.sidebar.markdown("---")
if not torch.cuda.is_available():
    st.sidebar.warning("⚠️ Running on CPU. Processing might be slower.")
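# To run locally (assuming streamlit, torch, transformers, soundfile and
# numpy are installed):
#   streamlit run app.py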