Spaces:

JavierGon12
/

retrAIced

Runtime error

File size: 6,725 Bytes

from transformers import BartForConditionalGeneration, BartTokenizer
import streamlit as st
import torch
from transformers import AutoProcessor, WhisperForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torchaudio
from transformers import pipeline
from streamlit_mic_recorder import mic_recorder,speech_to_text
import numpy as np


# --- Audio input -------------------------------------------------------------
# Let the user pick where the audio comes from. Both branches bind `audio`:
# it stays None until something has been recorded or uploaded; afterwards it
# holds the recorded payload (microphone) or whatever `st.file_uploader`
# returned (upload).
option = st.selectbox(
    "How do you want to import the audio file?",
    ("Microphone", "Upload file"),
)
if option == "Microphone":
    # Record directly in the browser through the mic_recorder component.
    st.write("Record your voice, and play the recorded audio:")
    audio = mic_recorder(
        start_prompt="Press the botton to start recording ⏺️",
        stop_prompt="Press the botton to stop to stop the recording⏹️",
        key='recorder',
    )

    if audio is None:
        # Nothing has been recorded yet.
        st.write("Please start the recording in the box above")
    else:
        # Play the recording back and keep only its "bytes" entry for the
        # transcription step further down.
        recorded_bytes = audio["bytes"]
        st.audio(recorded_bytes)
        audio = recorded_bytes

elif option == "Upload file":
    # Accept a WAV or MP3 file from disk and offer a preview player.
    audio = st.file_uploader(label="Upload your audio file here", type=["wav", 'mp3'])
    if audio:
        st.audio(audio)

# Language of the speech; used later to choose the transcription model.
option_language = st.selectbox(
    'Select the language of your audio',
    ('English', 'Spanish', 'German', 'French', 'Chinese'),
)


# --- Transcription -----------------------------------------------------------
# Model checkpoint and caption shown above the transcript, keyed by the value
# chosen in the `option_language` select box.
_LANGUAGE_SETTINGS = {
    "English": ("openai/whisper-large-v2", "Here is your transcription:"),
    "Spanish": ("Sandiago21/whisper-large-v2-spanish", "Aqui tienes tu transcripcion:"),
    "German": ("primeline/whisper-large-v3-german", "Hier ist Ihre Transkription:"),
    "French": ("bofenghuang/whisper-large-v2-french", "Ici, vous avez votre transcription"),
    "Chinese": ("yi-ching/whisper-tiny-chinese-test", "这是您的转录。"),
}

# Sample rate the waveform is converted to before it is handed to the model.
_TARGET_SAMPLE_RATE = 16000


# Streamlit re-executes this script on every widget interaction; without a
# cache the checkpoint would be re-loaded on each rerun. `max_entries=1` keeps
# a single pipeline cached so switching languages does not accumulate several
# large models (NOTE(review): requires a Streamlit version that provides
# `st.cache_resource` - confirm against the deployed version).
@st.cache_resource(max_entries=1)
def _load_asr_pipeline(model_name):
    """Build the speech-recognition pipeline for the checkpoint *model_name*."""
    return pipeline("automatic-speech-recognition", model=model_name)


def _load_waveform_16k_mono(audio_file):
    """Decode *audio_file* into a single-channel waveform sampled at 16 kHz.

    Args:
        audio_file: Raw audio bytes (as produced by the microphone branch),
            or anything `torchaudio.load` accepts directly, such as a path or
            a file-like object.

    Returns:
        A tensor whose first dimension has size 1 (one channel).
    """
    import io  # local import: only this helper needs it

    if isinstance(audio_file, (bytes, bytearray)):
        # torchaudio.load cannot decode a bare bytes object; wrap it so it
        # looks like a file.
        audio_file = io.BytesIO(audio_file)
    elif hasattr(audio_file, "seek"):
        # The object may already have been read (e.g. by the preview player);
        # rewind so decoding starts at the beginning.
        audio_file.seek(0)

    waveform, sample_rate = torchaudio.load(audio_file)

    # Down-mix multi-channel audio by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample only when the source rate differs from the target.
    if sample_rate != _TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sample_rate, _TARGET_SAMPLE_RATE)
        waveform = resampler(waveform)

    return waveform


def transcribe_audio(audio_file, model_name="openai/whisper-large-v2"):
    """Transcribe *audio_file* with the speech-recognition model *model_name*.

    Args:
        audio_file: Raw audio bytes, a path, or a file-like object.
        model_name: Hugging Face model id to load; defaults to the checkpoint
            used for English.

    Returns:
        Whatever the ASR pipeline returns for the decoded waveform.
    """
    waveform = _load_waveform_16k_mono(audio_file)
    asr_pipeline = _load_asr_pipeline(model_name)
    # The pipeline is given the single remaining channel as a 1-D numpy array.
    return asr_pipeline(waveform.numpy()[0])


if audio is None:
    st.write("Please upload the audio in the box above")
else:
    language_settings = _LANGUAGE_SETTINGS.get(option_language)
    # An unknown language produces no output.
    if language_settings is not None:
        model_name, caption = language_settings
        transcription = transcribe_audio(audio, model_name)
        st.write(caption)
        st.write(transcription)