```python
import torch
import librosa
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Load the model and processor
model = AutoModelForCTC.from_pretrained("aoxo/wav2vec2-large-mal")
processor = Wav2Vec2Processor.from_pretrained("aoxo/wav2vec2-large-mal")

# Function to transcribe audio
def transcribe_audio(audio_path):
    # Load the audio file, resampling to 16 kHz (the model's expected rate)
    waveform, _ = librosa.load(audio_path, sr=16000)

    # Process the audio into model inputs
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

    # Perform inference
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Decode the prediction (greedy CTC decoding)
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

# Example usage
audio_path = "path/to/your/audio/file.wav"
transcription = transcribe_audio(audio_path)
print("Transcription:", transcription)
```
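If you have a GPU available, the same pipeline runs noticeably faster with the model moved onto the device. Below is a minimal sketch, not part of the model card itself, assuming the `model` and `processor` above are already loaded; `transcribe_audio_gpu` is a hypothetical helper name used for illustration:

```python
import torch
import librosa

# Assumption: pick a CUDA device if one is present, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def transcribe_audio_gpu(audio_path):
    # Same preprocessing as above: load and resample to 16 kHz
    waveform, _ = librosa.load(audio_path, sr=16000)
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

    # Move the input tensor to the same device as the model before inference
    with torch.no_grad():
        logits = model(inputs.input_values.to(device)).logits

    # Greedy CTC decoding, identical to the CPU version
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
```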