import torch
import torchaudio.functional as F

from datasets import load_dataset
from transformers import AutoModelForCTC, AutoProcessor
model_id = "cahya/wav2vec2-base-turkish" | |
sample_iter = iter(load_dataset("common_voice", "tr", split="test", streaming=True)) | |
sample = next(sample_iter) | |
resampled_audio = F.resample(torch.tensor(sample["audio"]["array"]), 48_000, 16_000).numpy() | |
model = AutoModelForCTC.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# Passing sampling_rate lets the feature extractor verify the input rate matches
input_values = processor(resampled_audio, sampling_rate=16_000, return_tensors="pt").input_values
with torch.no_grad():
    logits = model(input_values).logits
# Assuming the checkpoint ships an n-gram language model, AutoProcessor loads a
# Wav2Vec2ProcessorWithLM, whose batch_decode takes the raw logits and returns
# an output object whose .text field holds the transcriptions
transcription = processor.batch_decode(logits.numpy()).text
print(transcription)
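
# For comparison, a minimal greedy-decoding sketch that skips the language
# model: take the argmax over the vocabulary at each frame and let the CTC
# tokenizer collapse repeated tokens and blanks (the underlying tokenizer is
# available on both Wav2Vec2Processor and Wav2Vec2ProcessorWithLM)
predicted_ids = torch.argmax(logits, dim=-1)
greedy_transcription = processor.tokenizer.batch_decode(predicted_ids)
print(greedy_transcription)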