# Spaces: Runtime error
import io

import numpy as np
import streamlit as st
import torch
import torchaudio
from streamlit_mic_recorder import mic_recorder, speech_to_text
from transformers import pipeline
# Sample rate every clip is resampled to before it is handed to the ASR pipeline.
TARGET_SAMPLE_RATE = 16000

# Selectable language -> (Hugging Face checkpoint, heading shown above the result).
# Insertion order is the order of the entries in the language select box.
MODEL_BY_LANGUAGE = {
    "English": ("openai/whisper-large-v2", "Here is your transcription:"),
    "Spanish": ("Sandiago21/whisper-large-v2-spanish", "Aqui tienes tu transcripcion:"),
    "German": ("primeline/whisper-large-v3-german", "Hier ist Ihre Transkription:"),
    "French": ("bofenghuang/whisper-large-v2-french", "Ici, vous avez votre transcription"),
    "Chinese": ("yi-ching/whisper-tiny-chinese-test", "这是您的转录。"),
}


def as_audio_source(audio_file):
    """Return *audio_file* in a form that can be opened as a file.

    The microphone branch produces raw ``bytes``; these are wrapped in an
    in-memory binary stream. Anything else (e.g. the object returned by the
    file uploader) is returned unchanged.
    """
    if isinstance(audio_file, (bytes, bytearray)):
        return io.BytesIO(audio_file)
    return audio_file


def prepare_waveform(waveform, sample_rate):
    """Down-mix *waveform* to one channel and resample it to 16 kHz.

    Args:
        waveform: Tensor indexed as (channels, samples), as returned by
            ``torchaudio.load``.
        sample_rate: Sample rate of *waveform* in Hz.

    Returns:
        A tensor with a single channel at ``TARGET_SAMPLE_RATE``. The input is
        returned as-is when it is already mono and already at that rate.
    """
    if waveform.shape[0] > 1:
        # Average the channels; keepdim keeps the leading channel axis.
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != TARGET_SAMPLE_RATE:
        resample = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLE_RATE)
        waveform = resample(waveform)
    return waveform


def load_asr_pipeline(model_id):
    """Build the Hugging Face speech-recognition pipeline for *model_id*."""
    return pipeline("automatic-speech-recognition", model=model_id)


def get_asr_pipeline(model_id):
    """Return the pipeline for *model_id*, reusing it across Streamlit reruns.

    Streamlit re-executes the script on every widget interaction, so building
    the pipeline inline would reload the checkpoint each time. Only one
    pipeline is kept cached so switching languages does not accumulate models.
    """
    cached_loader = st.cache_resource(max_entries=1)(load_asr_pipeline)
    return cached_loader(model_id)


def transcribe_audio(audio_file, model_id="openai/whisper-large-v2"):
    """Transcribe an audio clip with the given speech-recognition checkpoint.

    Args:
        audio_file: Raw audio ``bytes`` or anything ``torchaudio.load`` can
            open (path or binary file-like object).
        model_id: Hugging Face model identifier passed to the pipeline.

    Returns:
        Whatever the ASR pipeline returns for the clip.
    """
    waveform, sample_rate = torchaudio.load(as_audio_source(audio_file))
    waveform = prepare_waveform(waveform, sample_rate)
    asr_pipeline = get_asr_pipeline(model_id)
    # Pass the single remaining channel as a 1-D NumPy array.
    return asr_pipeline(waveform.numpy()[0])


def main():
    """Render the app: pick an audio source, pick a language, show the transcript."""
    option = st.selectbox(
        "How do you want to import the audio file?",
        ("Microphone", "Upload file"),
    )

    audio = None
    if option == "Microphone":
        st.write("Record your voice, and play the recorded audio:")
        recording = mic_recorder(
            start_prompt="Press the botton to start recording ⏺️",
            stop_prompt="Press the botton to stop to stop the recording⏹️",
            key='recorder',
        )
        if recording is None:
            st.write("Please start the recording in the box above")
        else:
            # NOTE(review): the container format of these bytes is whatever
            # mic_recorder produces by default; confirm torchaudio can decode it.
            audio = recording["bytes"]
            st.audio(audio)
    elif option == "Upload file":
        audio = st.file_uploader(label="Upload your audio file here", type=["wav", 'mp3'])
        if audio:
            st.audio(audio)

    option_language = st.selectbox(
        'Select the language of your audio',
        tuple(MODEL_BY_LANGUAGE),
    )

    if audio is None:
        # The microphone branch has already shown its own prompt above.
        if option == "Upload file":
            st.write("Please upload the audio in the box above")
        return

    model_id, heading = MODEL_BY_LANGUAGE[option_language]
    transcription = transcribe_audio(audio, model_id)
    st.write(heading)
    st.write(transcription)


if __name__ == "__main__":
    main()