File size: 1,896 Bytes
63e4766
95da6fb
 
 
63e4766
95da6fb
 
63e4766
d20136b
 
 
1db7f41
95da6fb
d20136b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95da6fb
4659c06
de5f056
63e4766
d20136b
06b7f96
f9f786c
 
 
 
 
 
aeba14a
f9f786c
 
 
 
 
63e4766
 
06b7f96
f9f786c
 
 
95da6fb
f9f786c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import gradio as gr
from transformers import pipeline
import librosa
import traceback

# Build the speech-recognition pipeline once at import time so that every
# request served by the app reuses the same loaded model.
asr_pipeline = pipeline(
    model="distil-whisper/distil-small.en",
    task="automatic-speech-recognition",
)

# Sample rate (Hz) the audio is converted to before it is handed to the model.
_TARGET_SAMPLE_RATE = 16000
# Window length (seconds) the pipeline uses to split long recordings into chunks.
_CHUNK_LENGTH_S = 30


def transcribe_long_form(file_info):
    """Transcribe an audio clip to text with the module-level ASR pipeline.

    Args:
        file_info: Either a path to an audio file (what a ``gr.Audio``
            component configured with ``type="filepath"`` delivers) or an
            ``(audio_array, sample_rate)`` pair of raw samples.

    Returns:
        The transcribed text, ``"No audio file provided."`` when the input is
        empty, or an ``"An error occurred: ..."`` message if loading or
        transcription fails (the traceback is printed for debugging).
    """
    if not file_info:
        return "No audio file provided."

    try:
        if isinstance(file_info, (tuple, list)):
            # Raw samples: unpack the audio array and its sample rate.
            audio_array, sample_rate = file_info

            # Collapse multi-channel (stereo) audio to a single channel.
            if audio_array.ndim > 1:
                audio_mono = librosa.to_mono(audio_array)
            else:
                audio_mono = audio_array

            # Resample only when the clip is not already at the target rate.
            if sample_rate != _TARGET_SAMPLE_RATE:
                audio_mono = librosa.resample(
                    audio_mono,
                    orig_sr=sample_rate,
                    target_sr=_TARGET_SAMPLE_RATE,
                )
        else:
            # File path: let librosa decode, downmix and resample in one step.
            audio_mono, _ = librosa.load(
                file_info, sr=_TARGET_SAMPLE_RATE, mono=True
            )

        # Raw audio is passed to the pipeline as a dict carrying its sample
        # rate; chunking lets recordings longer than one model window be
        # transcribed in full.
        result = asr_pipeline(
            {"raw": audio_mono, "sampling_rate": _TARGET_SAMPLE_RATE},
            chunk_length_s=_CHUNK_LENGTH_S,
        )
        # A single input yields a single dict whose 'text' holds the transcript.
        return result["text"]
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        traceback.print_exc()
        return f"An error occurred: {e}"


# Alias for callers that refer to the handler as `transcribe_speech`.
transcribe_speech = transcribe_long_form


# Define Gradio interfaces for microphone and file upload.
# Both tabs pass the recording to transcribe_long_form as a file path
# (type="filepath") and show the returned text in a textbox.
mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    allow_flagging="never",
)

# Setup the main Gradio app with tabbed interfaces for different input sources
with gr.Blocks() as demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# Launch the Gradio app; share=True also requests a public share link.
demo.launch(share=True)