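# Source: Hugging Face repository "wav2vec2-base-mal", file inference.py
# Committed by aoxo -- commit ae42d95 (verified), message "Create inference.py"; 963 bytes.
# (The page labels "raw" and "history blame" were navigation links, not file content.)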
import torch
import librosa
from transformers import AutoModelForCTC, Wav2Vec2Processor
# Load the CTC model and its matching processor from the same checkpoint.
# Both are created once at import time and reused by every transcription call.
_CHECKPOINT = "aoxo/wav2vec2-base-mal"
model = AutoModelForCTC.from_pretrained(_CHECKPOINT)
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
# Function to transcribe audio
def transcribe_audio(audio_path):
    """Transcribe a single audio file with the module-level CTC model.

    Args:
        audio_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        The decoded transcription string for the recording.

    Raises:
        ValueError: If the file decodes to zero audio samples.
    """
    # The processor below is told the audio is sampled at this rate, so the
    # file is resampled to the same rate while loading.
    target_rate = 16000
    waveform, _ = librosa.load(audio_path, sr=target_rate)

    # Fail early with a clear message instead of an obscure error raised
    # from inside the model's forward pass.
    if waveform.size == 0:
        raise ValueError(f"No audio samples could be read from {audio_path!r}")

    # Convert the raw waveform into model inputs (PyTorch tensors).
    inputs = processor(waveform, sampling_rate=target_rate, return_tensors="pt")
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Forward everything the processor produced, so an attention mask is
        # passed along whenever the checkpoint's processor emits one.
        logits = model(**inputs).logits

    # Greedy CTC decoding: take the most likely token at every frame and let
    # the processor collapse the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
# Example usage
# Runs only when this file is executed as a script, so importing the module
# does not attempt to transcribe the placeholder path below.
if __name__ == "__main__":
    audio_path = "path/to/your/audio/file.wav"
    transcription = transcribe_audio(audio_path)
    print("Transcription:", transcription)