File size: 6,725 Bytes
d8e07ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c97133
d8e07ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
from transformers import BartForConditionalGeneration, BartTokenizer
import streamlit as st
import torch
from transformers import AutoProcessor, WhisperForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torchaudio
from transformers import pipeline
from streamlit_mic_recorder import mic_recorder,speech_to_text
import numpy as np


# Let the user choose where the audio comes from. Both branches bind `audio`:
# it stays None until the user has recorded or uploaded something.
# NOTE(review): the recorder prompts below contain typos ("botton",
# "to stop to stop"); they are user-facing text and are left unchanged here.
option = st.selectbox("How do you want to import the audio file?",("Microphone","Upload file"))
if option == "Microphone":
    # Record straight from the microphone via the recorder widget.
    st.write("Record your voice, and play the recorded audio:")
    audio = mic_recorder(start_prompt="Press the botton to start recording ⏺️",stop_prompt="Press the botton to stop to stop the recording⏹️",key='recorder')

    # Identity comparison is the correct test for "no recording yet".
    if audio is None:
        st.write("Please start the recording in the box above")
    else:
        # The recorder result is a mapping; keep only its "bytes" payload,
        # play it back, and hand it on for transcription.
        audio = audio["bytes"]
        st.audio(audio)

elif option == "Upload file":
    # Accept a wav/mp3 upload and play it back once a file is present.
    audio = st.file_uploader(label="Upload your audio file here",type=["wav",'mp3'])
    if audio is not None:
        st.audio(audio)

# Ask which language is spoken in the audio; the chosen label decides which
# transcription branch runs further down the script.
option_language = st.selectbox(
    label='Select the language of your audio',
    options=('English', 'Spanish', 'German', 'French', 'Chinese'),
)


# Sample rate the waveform is converted to before it is given to the model.
_TARGET_SAMPLE_RATE = 16000

# For every selectable language: (Hugging Face model id, heading shown above
# the transcription).
_LANGUAGE_CONFIG = {
    "English": ("openai/whisper-large-v2", "Here is your transcription:"),
    "Spanish": ("Sandiago21/whisper-large-v2-spanish", "Aqui tienes tu transcripcion:"),
    "German": ("primeline/whisper-large-v3-german", "Hier ist Ihre Transkription:"),
    "French": ("bofenghuang/whisper-large-v2-french", "Ici, vous avez votre transcription"),
    "Chinese": ("yi-ching/whisper-tiny-chinese-test", "这是您的转录。"),
}


@st.cache_resource(max_entries=1)
def _load_asr_pipeline(model_name):
    """Build the speech-recognition pipeline for *model_name*.

    Streamlit re-executes the whole script on every widget interaction, so the
    pipeline is cached instead of being rebuilt each time. ``max_entries=1``
    keeps only the most recently used model alive, so switching languages does
    not accumulate several large checkpoints in memory.

    NOTE(review): cached resources are shared between sessions; confirm that
    concurrent use of one pipeline object is acceptable for this deployment.
    """
    return pipeline("automatic-speech-recognition", model=model_name)


def transcribe_audio(audio_file, model_name="openai/whisper-large-v2"):
    """Transcribe an audio clip with a Whisper speech-recognition pipeline.

    Args:
        audio_file: The audio to transcribe. May be anything
            ``torchaudio.load`` accepts (a path or a file-like object) or raw
            ``bytes``/``bytearray``, which are wrapped in an in-memory buffer
            first.
        model_name: Hugging Face model id of the checkpoint to use. Defaults
            to the English model so existing one-argument calls keep working.

    Returns:
        Whatever the pipeline returns for the clip (per the transformers
        documentation, a dict carrying the recognised text under ``"text"``).
    """
    import io  # local import: keeps this block self-contained

    # The microphone recorder hands over raw bytes, which torchaudio cannot
    # open directly; expose them through a file-like buffer.
    if isinstance(audio_file, (bytes, bytearray)):
        audio_file = io.BytesIO(audio_file)
    elif hasattr(audio_file, "seek"):
        # Rewind seekable inputs so a handle that was already read elsewhere
        # is decoded from its start.
        audio_file.seek(0)

    # Load the audio file
    waveform, sample_rate = torchaudio.load(audio_file)

    # Ensure mono-channel audio by averaging the channels
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Convert to the target sample rate if not already there
    if sample_rate != _TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sample_rate, _TARGET_SAMPLE_RATE)
        waveform = resampler(waveform)

    # The pipeline receives the single remaining channel as a 1-D array.
    asr_pipeline = _load_asr_pipeline(model_name)
    return asr_pipeline(waveform.numpy()[0])


if audio is None:
    st.write("Please upload the audio in the box above")
else:
    # Pick the checkpoint and the localized heading for the chosen language.
    model_name, heading = _LANGUAGE_CONFIG[option_language]
    transcription = transcribe_audio(audio, model_name)
    st.write(heading)
    st.write(transcription)