# Import necessary libraries
import gradio as gr
import numpy as np
import torch
import torchaudio
import soundfile as sf  # for saving audio to disk
import os
from pydub import AudioSegment
from scipy.signal import resample
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# Load the processor and model
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("YMEA/bambara-fr-asr-whisper_25_v2_15k")
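# Optional device placement: a minimal sketch, assuming a standard PyTorch
# setup with a single optional GPU; falls back to CPU when none is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()  # inference only, no gradient updates needed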
# Function to save recorded audio in WAV format, resampled to 16kHz
def record_audio(audio):
    if audio is None:
        return None
    sr, data = audio
    # Whisper models expect 16kHz input, so resample if needed
    if sr != 16000:
        data = resample(data, int(len(data) * 16000 / sr))
        sr = 16000
    # Save the audio to a temporary file in WAV format
    temp_audio_path = "temp_recorded_audio.wav"
    sf.write(temp_audio_path, data, sr)
    # Use PyDub to normalize the volume
    sound = AudioSegment.from_wav(temp_audio_path)
    # Normalize the peak to -5 dBFS; boosting above 0 dBFS would clip.
    # Note that gain only changes loudness; it does not remove background noise.
    normalized_sound = sound.apply_gain(-5.0 - sound.max_dBFS)  # adjust headroom as needed
    # Export the processed audio
    processed_audio_path = "processed_audio.wav"
    normalized_sound.export(processed_audio_path, format="wav")
    # Remove the temporary file
    os.remove(temp_audio_path)
    return processed_audio_path
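# Example usage (hypothetical): the Gradio microphone component passes a
# (sample_rate, numpy_array) tuple; a 440 Hz test tone stands in for speech here.
# t = np.linspace(0, 1, 44100, endpoint=False)
# tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
# record_audio((44100, tone))  # writes and returns "processed_audio.wav"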
# Function to transcribe audio
def transcribe_audio(audio_path):
    if audio_path is None:
        return "No audio was recorded."
    # Load the audio data using torchaudio
    waveform, sample_rate = torchaudio.load(audio_path)
    # Ensure the audio is at 16kHz
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        sample_rate = 16000
    # Downmix multi-channel audio to mono, since Whisper expects a single channel
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Extract log-mel input features with the processor
    audio_input = processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt")
    # Generate the transcription
    with torch.no_grad():
        input_features = audio_input.input_features.to(model.device)
        generated_ids = model.generate(input_features)
    # Decode the generated IDs to text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return transcription[0]
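# Example usage (hypothetical): transcribe a WAV file on disk directly,
# e.g. the file produced by record_audio above.
# print(transcribe_audio("processed_audio.wav"))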
# Create the Gradio interface
with gr.Blocks() as demo:
    # Audio input component that records from the microphone and passes
    # raw (sample_rate, numpy_array) tuples to the callback
    audio_input = gr.Audio(sources=["microphone"], type="numpy")
    # Button to trigger processing of the recording
    record_button = gr.Button("Record Audio")
    # Output component that plays back the processed audio file
    audio_output = gr.Audio(type="filepath")
    # Button to trigger transcription
    transcribe_button = gr.Button("Transcribe")
    # Text box to display the transcription
    transcription_output = gr.Textbox(label="Transcription", lines=3)
    # Process the recorded audio when the record button is clicked
    record_button.click(fn=record_audio, inputs=audio_input, outputs=audio_output)
    # Transcribe the processed audio when the transcribe button is clicked
    transcribe_button.click(fn=transcribe_audio, inputs=audio_output, outputs=transcription_output)

# Launch the Gradio app
demo.launch(show_error=True)