```python
import torch
import librosa
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Load the model and processor
model = AutoModelForCTC.from_pretrained("aoxo/wav2vec2-large-mal")
processor = Wav2Vec2Processor.from_pretrained("aoxo/wav2vec2-large-mal")

# Function to transcribe audio
def transcribe_audio(audio_path):
    # Load the audio file, resampling to 16 kHz (the model's expected rate)
    waveform, _ = librosa.load(audio_path, sr=16000)

    # Process the audio into model inputs
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

    # Perform inference
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Decode the prediction (greedy CTC decoding)
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

# Example usage
audio_path = "path/to/your/audio/file.wav"
transcription = transcribe_audio(audio_path)
print("Transcription:", transcription)
```
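If you have a GPU available, the same pipeline runs noticeably faster with the model moved onto the device. Below is a minimal sketch, not part of the model card itself, assuming the `model` and `processor` above are already loaded; `transcribe_audio_gpu` is a hypothetical helper name used for illustration:

```python
import torch
import librosa

# Assumption: pick a CUDA device if one is present, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def transcribe_audio_gpu(audio_path):
    # Same preprocessing as above: load and resample to 16 kHz
    waveform, _ = librosa.load(audio_path, sr=16000)
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

    # Move the input tensor to the same device as the model before inference
    with torch.no_grad():
        logits = model(inputs.input_values.to(device)).logits

    # Greedy CTC decoding, identical to the CPU version
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
```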