|
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq |
|
import librosa |
|
import torch |
|
import gradio as gr |
|
|
|
|
|
# Load the processor and model once at import time so every request reuses
# the same weights instead of reloading them per call.
print("Loading model...")

_checkpoint = "jsbeaudry/whisper-medium-oswald"

# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained(_checkpoint)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_checkpoint)

# Inference only: switch off training-mode layers, then move the weights.
model.eval()
model.to(device)

print("Model loaded successfully.")
|
|
|
|
|
def transcribe(audio):
    """Transcribe one audio clip to text with the module-level Whisper model.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by the
            ``gr.Audio(type="numpy")`` input wired to this callback, or
            ``None`` when nothing was uploaded or recorded. ``samples`` is a
            NumPy array, 1-D for mono or 2-D as (samples, channels).

    Returns:
        The decoded transcription string, or a short prompt asking for audio
        when the input is missing or empty.
    """
    if audio is None:
        return "Please upload or record an audio file first."

    sr, data = audio

    # A zero-length clip cannot be resampled or turned into features.
    if data.size == 0:
        return "No audio detected. Please upload or record an audio file first."

    # Gradio's numpy audio is integer PCM (typically int16), whereas librosa
    # only accepts floating-point audio and the feature extractor expects
    # samples scaled to [-1, 1]. Convert before any further processing.
    kind = data.dtype.kind
    if kind in ("i", "u"):
        half_range = float(2 ** (8 * data.dtype.itemsize - 1))
        data = data.astype("float32")
        if kind == "u":
            # Unsigned PCM is centred on the midpoint rather than on zero.
            data -= half_range
        data /= half_range
    else:
        data = data.astype("float32")

    # Down-mix multi-channel audio; librosa wants (channels, samples).
    if data.ndim == 2:
        data = librosa.to_mono(data.T)

    # The processor is called with a 16 kHz sampling rate below.
    target_sr = 16000
    if sr != target_sr:
        data = librosa.resample(data, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    input_features = processor(
        data, sampling_rate=sr, return_tensors="pt"
    ).input_features.to(device)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription
|
|
|
|
|
def create_interface():
    """Build the Gradio Blocks UI for the transcription demo.

    The layout is a title, a short instruction line, and a row holding the
    audio input, the Transcribe button and the output textbox. Clicking the
    button calls ``transcribe`` with the audio input's value and shows the
    result in the textbox.

    Returns:
        The assembled ``gr.Blocks`` app; the caller starts it with
        ``.launch()``.
    """
    with gr.Blocks(title="Whisper Medium - Haitian Creole") as demo:
        gr.Markdown("# 🎙️ Whisper Medium Creole ASR")
        gr.Markdown(
            "Upload or record your voice in Haitian Creole. Then click **Transcribe** to get the text."
        )

        with gr.Row():
            # type="numpy" makes the callback receive (sample_rate, ndarray).
            audio_input = gr.Audio(label="🎧 Upload or Record Audio", type="numpy", format="wav")
            transcribe_button = gr.Button("📝 Transcribe")
            output_text = gr.Textbox(label="📄 Transcribed Text", lines=4)

        # Event listeners have to be registered inside the Blocks context.
        transcribe_button.click(fn=transcribe, inputs=audio_input, outputs=output_text)

    return demo
|
|
|
if __name__ == "__main__":
    # Only start the web server when run as a script, not on import.
    create_interface().launch()
|
|