"""Gradio demo that transcribes microphone audio with a pre-trained Wav2Vec2 CTC model."""

import os
import subprocess

# Run the setup script before the third-party imports below.
# NOTE(review): presumably setup.sh prepares those dependencies, which is why
# it runs ahead of the imports -- confirm before reordering.
subprocess.run(['bash', 'setup.sh'], check=True)

import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Sampling rate (Hz) used both when loading audio and when calling the
# processor; the two must agree.
SAMPLE_RATE = 16000

# Load pre-trained model and processor once at import time so every request
# reuses them.
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


def transcribe(audio):
    """Transcribe an audio file to text.

    Args:
        audio: Path to an audio file (the Gradio input below is configured
            with ``type="filepath"``), or ``None`` when no audio was provided.

    Returns:
        The decoded transcription of the audio, or an empty string when
        ``audio`` is ``None``.
    """
    # NOTE(review): Gradio is expected to pass None when the user submits
    # without recording -- return early instead of failing in librosa.load.
    if audio is None:
        return ""

    # Load audio, resampled to the rate passed to the processor below.
    audio_input, _ = librosa.load(audio, sr=SAMPLE_RATE)

    # Tokenize and process
    inputs = processor(
        audio_input,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )

    # The processor only includes an attention mask when its feature extractor
    # is configured to return one; attribute access would raise when it is
    # absent, so look it up as an optional key and pass None otherwise.
    attention_mask = inputs.get("attention_mask")

    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=attention_mask).logits

    # Get predicted ids: the highest-scoring token at each position.
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the ids to text; a single input yields a one-element batch.
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]


# Define the Gradio interface
# NOTE(review): newer Gradio releases (4.x) replaced the `source=` argument of
# gr.Audio with `sources=[...]` -- confirm against the installed Gradio version.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
)

if __name__ == "__main__":
    iface.launch()