# Import necessary libraries
import gradio as gr
import numpy as np
import torch
import torchaudio
import soundfile as sf  # for saving audio to disk
import os
from pydub import AudioSegment
from scipy.signal import resample
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# Load the processor and model
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("YMEA/bambara-fr-asr-whisper_25_v2_15k")
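# Optional device placement: a minimal sketch, assuming a standard PyTorch
# setup with a single optional GPU; falls back to CPU when none is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()  # inference only, no gradient updates needed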
# Function to save recorded audio in WAV format, resampled to 16kHz
def record_audio(audio):
    if audio is None:
        return None
    sr, data = audio
    # Whisper models expect 16kHz input, so resample if needed
    if sr != 16000:
        data = resample(data, int(len(data) * 16000 / sr))
        sr = 16000
    # Save the audio to a temporary file in WAV format
    temp_audio_path = "temp_recorded_audio.wav"
    sf.write(temp_audio_path, data, sr)
    # Use PyDub to normalize the volume
    sound = AudioSegment.from_wav(temp_audio_path)
    # Normalize the peak to -5 dBFS; boosting above 0 dBFS would clip.
    # Note that gain only changes loudness; it does not remove background noise.
    normalized_sound = sound.apply_gain(-5.0 - sound.max_dBFS)  # adjust headroom as needed
    # Export the processed audio
    processed_audio_path = "processed_audio.wav"
    normalized_sound.export(processed_audio_path, format="wav")
    # Remove the temporary file
    os.remove(temp_audio_path)
    return processed_audio_path
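# Example usage (hypothetical): the Gradio microphone component passes a
# (sample_rate, numpy_array) tuple; a 440 Hz test tone stands in for speech here.
# t = np.linspace(0, 1, 44100, endpoint=False)
# tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
# record_audio((44100, tone))  # writes and returns "processed_audio.wav"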
# Function to transcribe audio
def transcribe_audio(audio_path):
    if audio_path is None:
        return "No audio was recorded."
    # Load the audio data using torchaudio
    waveform, sample_rate = torchaudio.load(audio_path)
    # Ensure the audio is at 16kHz
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    # Downmix multi-channel audio to mono, since Whisper expects a single channel
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Extract log-mel input features with the processor
    audio_input = processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt")
    # Generate the transcription
    with torch.no_grad():
        input_features = audio_input.input_features.to(model.device)
        generated_ids = model.generate(input_features)
    # Decode the generated IDs to text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return transcription[0]
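# Example usage (hypothetical): transcribe a WAV file on disk directly,
# e.g. the file produced by record_audio above.
# print(transcribe_audio("processed_audio.wav"))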
# Create the Gradio interface
with gr.Blocks() as demo:
    # Audio input component that records from the microphone and passes
    # raw (sample_rate, numpy_array) tuples to the callback
    audio_input = gr.Audio(sources=["microphone"], type="numpy")
    # Button to trigger processing of the recording
    record_button = gr.Button("Record Audio")
    # Output component that plays back the processed audio file
    audio_output = gr.Audio(type="filepath")
    # Button to trigger transcription
    transcribe_button = gr.Button("Transcribe")
    # Text box to display the transcription
    transcription_output = gr.Textbox(label="Transcription", lines=3)
    # Process the recorded audio when the record button is clicked
    record_button.click(fn=record_audio, inputs=audio_input, outputs=audio_output)
    # Transcribe the processed audio when the transcribe button is clicked
    transcribe_button.click(fn=transcribe_audio, inputs=audio_output, outputs=transcription_output)

# Launch the Gradio app
demo.launch(show_error=True)