# pages/Speech Recognition.py: Streamlit speech-recognition demo (retrAIced)
import io

import streamlit as st
import torch
import torchaudio
from streamlit_mic_recorder import mic_recorder
from transformers import pipeline
option = st.selectbox("How do you want to import the audio file?", ("Microphone", "Upload file"))

if option == "Microphone":
    st.write("Record your voice, and play the recorded audio:")
    audio = mic_recorder(
        start_prompt="Press the button to start recording ⏺️",
        stop_prompt="Press the button to stop the recording ⏹️",
        key="recorder",
    )
    if audio is None:
        st.write("Please start the recording in the box above")
    else:
        st.audio(audio["bytes"])
        # mic_recorder returns raw WAV bytes; wrap them in a file-like object
        # so torchaudio.load can read them later.
        audio = io.BytesIO(audio["bytes"])
elif option == "Upload file":
    audio = st.file_uploader(label="Upload your audio file here", type=["wav", "mp3"])
    if audio:
        st.audio(audio)
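
# An alternative worth noting: streamlit_mic_recorder also ships a
# speech_to_text helper that records and transcribes in a single call, sending
# the audio to Google's free web speech API instead of a local model. A
# minimal sketch, with parameter names taken from the package's README
# (assumed, not verified against this app); uncomment to try:
#
# from streamlit_mic_recorder import speech_to_text
# text = speech_to_text(language="en", start_prompt="Start recording",
#                       stop_prompt="Stop recording", just_once=True, key="STT")
# if text:
#     st.write(text)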
option_language = st.selectbox(
    "Select the language of your audio",
    ("English", "Spanish", "German", "French", "Chinese"),
)

if audio is None:
    st.write("Please upload the audio in the box above")
else:
if option_language == "English":
def transcribe_audio(audio_file):
# Load the audio file
waveform, sample_rate = torchaudio.load(audio_file)
# Ensure mono-channel audio
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
# Convert to a 16kHz sample rate if not already
if sample_rate != 16000:
waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
# Convert to a list of integers
audio_input = waveform.squeeze().numpy().astype(int).tolist()
# Use Hugging Face's ASR pipeline
asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2")
# Transcribe the audio
transcript = asr_pipeline(waveform.numpy()[0])
return transcript
transcription = transcribe_audio(audio)
st.write("Here is your transcription:")
st.write(transcription)
elif option_language == 'Spanish':
def transcribe_audio(audio_file):
# Load the audio file
waveform, sample_rate = torchaudio.load(audio_file)
# Ensure mono-channel audio
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
# Convert to a 16kHz sample rate if not already
if sample_rate != 16000:
waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
# Convert to a list of integers
audio_input = waveform.squeeze().numpy().astype(int).tolist()
# Use Hugging Face's ASR pipeline
asr_pipeline = pipeline("automatic-speech-recognition", model="Sandiago21/whisper-large-v2-spanish")
# Transcribe the audio
transcript = asr_pipeline(waveform.numpy()[0])
return transcript
transcription = transcribe_audio(audio)
st.write("Aqui tienes tu transcripcion:")
st.write(transcription)
elif option_language == 'German':
def transcribe_audio(audio_file):
# Load the audio file
waveform, sample_rate = torchaudio.load(audio_file)
# Ensure mono-channel audio
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
# Convert to a 16kHz sample rate if not already
if sample_rate != 16000:
waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
# Convert to a list of integers
audio_input = waveform.squeeze().numpy().astype(int).tolist()
# Use Hugging Face's ASR pipeline
asr_pipeline = pipeline("automatic-speech-recognition", model="primeline/whisper-large-v3-german")
# Transcribe the audio
transcript = asr_pipeline(waveform.numpy()[0])
return transcript
transcription = transcribe_audio(audio)
st.write("Hier ist Ihre Transkription:")
st.write(transcription)
elif option_language == "French":
def transcribe_audio(audio_file):
# Load the audio file
waveform, sample_rate = torchaudio.load(audio_file)
# Ensure mono-channel audio
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
# Convert to a 16kHz sample rate if not already
if sample_rate != 16000:
waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
# Convert to a list of integers
audio_input = waveform.squeeze().numpy().astype(int).tolist()
# Use Hugging Face's ASR pipeline
asr_pipeline = pipeline("automatic-speech-recognition", model="bofenghuang/whisper-large-v2-french")
# Transcribe the audio
transcript = asr_pipeline(waveform.numpy()[0])
return transcript
transcription = transcribe_audio(audio)
st.write("Ici, vous avez votre transcription")
st.write(transcription)
elif option_language == "Chinese":
def transcribe_audio(audio_file):
# Load the audio file
waveform, sample_rate = torchaudio.load(audio_file)
# Ensure mono-channel audio
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
# Convert to a 16kHz sample rate if not already
if sample_rate != 16000:
waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
# Convert to a list of integers
audio_input = waveform.squeeze().numpy().astype(int).tolist()
# Use Hugging Face's ASR pipeline
asr_pipeline = pipeline("automatic-speech-recognition", model="yi-ching/whisper-tiny-chinese-test")
# Transcribe the audio
transcript = asr_pipeline(waveform.numpy()[0])
return transcript
transcription = transcribe_audio(audio)
st.write("这是您的转录。")
st.write(transcription)
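
# Optional sketch: every widget interaction reruns this script top to bottom,
# so the pipeline above is rebuilt (and the model reloaded) on each rerun.
# Caching the pipeline with st.cache_resource avoids that. `load_asr` is a
# helper name introduced here, not part of the original app; transcribe_audio
# could call load_asr(model_name) instead of pipeline(...).
@st.cache_resource
def load_asr(model_name: str):
    # st.cache_resource keeps one pipeline instance per model name for the
    # lifetime of the server process, shared across reruns and sessions.
    return pipeline("automatic-speech-recognition", model=model_name)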