# ECHO_Demo / app.py
import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from peft import PeftModel
import torchaudio

# Constants
MODEL = "openai/whisper-small.en"
ADAPTER_DIR = "./checkpoint-60"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 16000
CHUNK_LENGTH = 30  # Length of each audio chunk in seconds

# Load processor and model
processor = WhisperProcessor.from_pretrained(MODEL)
base_model = WhisperForConditionalGeneration.from_pretrained(MODEL)
finetuned_model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
finetuned_model = finetuned_model.merge_and_unload().to(DEVICE)
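
# Optional sketch (commented out, not part of the original app): on a CUDA
# device the merged model could be cast to float16 to roughly halve memory
# use. The input features in transcribe_chunk() would then need a matching
# .half() cast, and the accuracy impact on this checkpoint is an assumption.
# if DEVICE == "cuda":
#     finetuned_model = finetuned_model.half()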


def load_audio(audio_path: str):
    """Load and preprocess the audio file."""
    speech_array, sampling_rate = torchaudio.load(audio_path)

    # Convert stereo to mono by averaging the channels
    if speech_array.shape[0] > 1:
        speech_array = torch.mean(speech_array, dim=0, keepdim=True)

    # Resample to the model's required sample rate
    if sampling_rate != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sampling_rate, SAMPLE_RATE)
        speech_array = resampler(speech_array)

    return speech_array.squeeze().numpy()


def chunk_audio(audio, chunk_length=CHUNK_LENGTH):
    """Split the audio into consecutive chunks of `chunk_length` seconds."""
    chunk_samples = chunk_length * SAMPLE_RATE
    return [audio[i : i + chunk_samples] for i in range(0, len(audio), chunk_samples)]
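

# Optional sketch (not wired into the UI): fixed 30 s cuts can split a word
# at a chunk boundary. Overlapping consecutive chunks by a couple of seconds
# is one common mitigation; the `overlap_s` parameter is an assumption here,
# and the duplicated text in the overlap region would still need to be
# merged or de-duplicated downstream.
def chunk_audio_overlapping(audio, chunk_length=CHUNK_LENGTH, overlap_s=2):
    """Split the audio into chunks that overlap by `overlap_s` seconds."""
    chunk_samples = chunk_length * SAMPLE_RATE
    step = (chunk_length - overlap_s) * SAMPLE_RATE
    return [audio[i : i + chunk_samples] for i in range(0, len(audio), step)]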


def transcribe_chunk(chunk):
    """Transcribe a single audio chunk."""
    inputs = processor(chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)

    with torch.no_grad():
        predicted_ids = finetuned_model.generate(input_features)

    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
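

# Optional sketch (not used by the UI): the Whisper processor also accepts a
# list of arrays, so all chunks of a short file could be transcribed in one
# batched generate() call instead of a Python loop. Whether the whole batch
# fits in GPU memory is an assumption.
def transcribe_chunks_batched(chunks):
    """Transcribe all chunks in a single batched forward pass."""
    inputs = processor(chunks, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)
    with torch.no_grad():
        predicted_ids = finetuned_model.generate(input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)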


def transcribe_audio(audio_path: str) -> str:
    """Transcribe the given audio file with the fine-tuned Whisper model."""
    audio = load_audio(audio_path)
    audio_chunks = chunk_audio(audio)
    transcriptions = [transcribe_chunk(chunk) for chunk in audio_chunks]
    return " ".join(transcriptions)


examples = [["apollo11_example.mp3"], ["mock_operator_example.wav"]]

ui = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Input Audio",
    ),
    outputs=gr.Textbox(
        label="Transcription",
        placeholder="The transcribed text will appear here...",
    ),
    title="ECHO V0.1",
    description="""
This is a demo of the transcription capabilities of "ECHO". The same pipeline could be adapted for real-time transcription of a live audio stream such as ISS communications.

### How to use:
1. **Record or upload**: Click the microphone icon 🎙️ to record audio with your microphone, or click the upload button ⬆️ to upload an audio file.
   You can also use the **Examples** provided below as inputs by clicking on them.
2. **Click Submit**: Clicking the submit button transcribes the audio.
3. **Read the transcription**: The transcribed text appears in the text box below the audio input section.
""",
    examples=examples,
)

ui.launch(share=False)