import gradio as gr
import torch
import torchaudio
from peft import PeftModel
from transformers import WhisperProcessor, WhisperForConditionalGeneration
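
# Model checkpoint, LoRA adapter location, and audio settings.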
MODEL = "openai/whisper-small.en"
ADAPTER_DIR = "./checkpoint-60"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 16000  # Whisper expects 16 kHz mono audio
CHUNK_LENGTH = 30  # seconds; Whisper processes audio in 30-second windows
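
# Load the processor and base model, apply the LoRA adapter, and merge the
# adapter weights into the base model so inference runs without the PEFT wrapper.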
processor = WhisperProcessor.from_pretrained(MODEL)
base_model = WhisperForConditionalGeneration.from_pretrained(MODEL)
finetuned_model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
finetuned_model = finetuned_model.merge_and_unload().to(DEVICE)


def load_audio(audio_path: str):
    """Load an audio file, down-mix it to mono, and resample it to 16 kHz."""
    speech_array, sampling_rate = torchaudio.load(audio_path)
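
    # Multi-channel (e.g. stereo) input: average the channels down to mono.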
    if speech_array.shape[0] > 1:
        speech_array = torch.mean(speech_array, dim=0, keepdim=True)
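
    # Resample to the 16 kHz rate the Whisper models were trained on.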
    if sampling_rate != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sampling_rate, SAMPLE_RATE)
        speech_array = resampler(speech_array)

    return speech_array.squeeze().numpy()


def chunk_audio(audio, chunk_length=CHUNK_LENGTH):
    """Split the audio into chunks of the specified length in seconds."""
    chunk_samples = chunk_length * SAMPLE_RATE
    return [audio[i : i + chunk_samples] for i in range(0, len(audio), chunk_samples)]


def transcribe_chunk(chunk):
    """Transcribe a single audio chunk."""
    inputs = processor(chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)
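
    # Generate token IDs without tracking gradients (inference only).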
    with torch.no_grad():
        predicted_ids = finetuned_model.generate(input_features)

    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]


def transcribe_audio(audio_path: str) -> str:
    """Transcribe the given audio file with the fine-tuned Whisper model."""
    audio = load_audio(audio_path)
    audio_chunks = chunk_audio(audio)
    transcriptions = [transcribe_chunk(chunk) for chunk in audio_chunks]
    return " ".join(transcriptions)


examples = [["apollo11_example.mp3"], ["mock_operator_example.wav"]]
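
# Gradio UI: audio in (microphone or file upload), transcription text out.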
ui = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Input Audio",
    ),
    outputs=gr.Textbox(
        label="Transcription",
        placeholder="The transcribed text will appear here...",
    ),
    title="ECHO V0.1",
    description="""
This is a demo of the transcription capabilities of "ECHO". It could be adapted to run real-time transcription on a live audio stream, such as ISS communications.

### How to use:
1. **Record or Upload**: Click the microphone icon 🎙️ to record audio with your microphone, or click the upload button ⬆️ to upload an audio file. You can also use the **Examples** provided below as inputs by clicking on them.
2. **Click Submit**: Clicking the submit button transcribes the audio.
3. **Read the Transcription**: The transcribed text appears in the text box below the audio input section.
""",
    examples=examples,
)
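
# share=False keeps the app local; set share=True for a temporary public link.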
ui.launch(share=False)