import streamlit as st
import torch
from transformers import pipeline
import soundfile as sf
import io
import numpy as np
from datetime import timedelta

# Set page configuration
st.set_page_config(
    page_title="Audio Transcription with Whisper",
    page_icon="🎙️",
    layout="wide"
)

# Initialize the speech recognition pipeline once and cache it across reruns
@st.cache_resource
def load_model():
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        chunk_length_s=30,
        device=device,
    )
    return pipe

def format_timestamp(seconds):
    """Convert seconds to H:MM:SS format."""
    if seconds is None:  # Whisper may omit the end time of the final chunk
        return "--:--:--"
    return str(timedelta(seconds=seconds)).split('.')[0]

def process_audio(audio_file, return_timestamps=False):
    """Process an uploaded audio file and return its transcription."""
    try:
        # Rewind the buffer: st.audio() may already have consumed it
        audio_file.seek(0)
        audio_bytes = audio_file.read()
        audio_array, sampling_rate = sf.read(io.BytesIO(audio_bytes))

        # Convert to mono if stereo
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # The pipeline expects float32 samples; sf.read returns float64 by default
        audio_array = audio_array.astype(np.float32)

        # Get predictions (the pipeline resamples to the model's 16 kHz internally)
        pipe = load_model()
        result = pipe(
            {"sampling_rate": sampling_rate, "raw": audio_array},
            batch_size=8,
            return_timestamps=return_timestamps,
        )
        return result["chunks"] if return_timestamps else result["text"]
    except Exception as e:
        return f"Error processing audio: {str(e)}"

# Sidebar
with st.sidebar:
    st.header("Settings")
    show_timestamps = st.checkbox("Show timestamps", value=False)

    st.markdown("---")
    st.markdown("### Model Information")
    st.markdown(
        f"""
- Model: OpenAI Whisper (small)
- Device: {"GPU" if torch.cuda.is_available() else "CPU"}
- Max chunk length: 30 seconds
"""
    )

    st.markdown("---")
    st.markdown("### Supported Audio Formats")
    st.markdown(
        """
- WAV
- FLAC
- MP3
- OGG
"""
    )

# Main app
st.title("🎙️ Audio Transcription with Whisper")
st.markdown("Upload an audio file to get its transcription")

# File uploader
audio_file = st.file_uploader("Choose an audio file", type=['wav', 'mp3', 'ogg', 'flac'])

if audio_file is not None:
    # Create two columns
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("Audio File")
        st.audio(audio_file)

        file_details = {
            "Filename": audio_file.name,
            "File size": f"{audio_file.size / 1024:.2f} KB",
            "File type": audio_file.type
        }
        st.json(file_details)

    with col2:
        st.subheader("Transcription")
        if st.button("Start Transcription"):
            with st.spinner("Processing audio..."):
                if show_timestamps:
                    results = process_audio(audio_file, return_timestamps=True)
                    if isinstance(results, list):
                        for chunk in results:
                            start, end = chunk['timestamp']
                            st.markdown(
                                f"**[{format_timestamp(start)} - {format_timestamp(end)}]** "
                                f"{chunk['text']}"
                            )
                    else:
                        st.error(results)
                else:
                    transcription = process_audio(audio_file, return_timestamps=False)
                    if not transcription.startswith("Error"):
                        st.write(transcription)
                    else:
                        st.error(transcription)

        # Download transcription (only offered right after a plain-text run,
        # since `transcription` is not persisted across reruns)
        if 'transcription' in locals():
            if not transcription.startswith("Error"):
                st.download_button(
                    label="Download Transcription",
                    data=transcription,
                    file_name=f"{audio_file.name}_transcription.txt",
                    mime="text/plain"
                )

# Footer
st.markdown("---")
st.markdown(
    """
Built with:
- Streamlit
- OpenAI Whisper
- 🤗 Transformers
"""
)

# Display warnings/info
st.sidebar.markdown("---")
if not torch.cuda.is_available():
    st.sidebar.warning("⚠️ Running on CPU. Processing might be slower.")
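
# Usage note (a minimal sketch; the filename app.py is an assumption, not
# specified by this script):
#
#   pip install streamlit torch transformers soundfile numpy
#   streamlit run app.py
#
# Streamlit apps are launched via the `streamlit run` CLI rather than
# executed directly with the Python interpreter.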