import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import gradio as gr

# Load the fine-tuned Somali Wav2Vec2 CTC model and its processor.
model = Wav2Vec2ForCTC.from_pretrained("tacab/tacab_asr_somali")
processor = Wav2Vec2Processor.from_pretrained("tacab/tacab_asr_somali")

# Run on GPU when available; eval mode disables dropout for inference.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


def transcribe(audio):
    """Transcribe a Somali speech recording (file path) to lowercase text."""
    if audio is None:
        return ""

    waveform, sample_rate = torchaudio.load(audio)

    # Wav2Vec2 expects 16 kHz mono input: resample and downmix if needed.
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=16000
        )(waveform)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    inputs = processor(
        waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
    )
    input_values = inputs.input_values.to(device)

    # Greedy CTC decoding: take the argmax token per frame, then let the
    # processor collapse repeats and blanks into the final transcription.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription.lower()


# Gradio UI; the labels are Somali for "Speak Somali" / "The transcribed text".
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="🎙️ Ku hadal Af Soomaali"),
    outputs=gr.Text(label="📄 Qoraalka la helay"),
    title="Tacab ASR Somali",
    description="ASR model for Somali speech-to-text using Wav2Vec2.",
)

demo.launch()
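
# Example (hypothetical usage, not part of the app): transcribe() can also be
# called directly, without the UI, e.g. for scripted or batch transcription.
# "clip.wav" is a placeholder path for any local 16-bit audio file.
#
#   print(transcribe("clip.wav"))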