---
license: mit
---
|
The model was fine-tuned on 300 hours of public and private speech data. More information will be provided once the underlying paper is published.
|
|
|
```python
|
import librosa |
|
from transformers import Wav2Vec2Processor, AutoModelForCTC |
|
import torch |
|
|
|
audio, _ = librosa.load("[audio_path]", sr=16000) |
|
model = AutoModelForCTC.from_pretrained("racai/wav2vec2-base-100k-voxpopuli-romanian") |
|
processor = Wav2Vec2Processor.from_pretrained("racai/wav2vec2-base-100k-voxpopuli-romanian") |
|
|
|
input_dict = processor(audio, sampling_rate=16000, return_tensors="pt") |
|
|
|
with torch.inference_mode(): |
|
logits = model(input_dict.input_values).logits |
|
|
|
predicted_ids = torch.argmax(logits, dim=-1) |
|
predicted_sentence = processor.batch_decode(predicted_ids)[0] |
|
|
|
print("Prediction:", predicted_sentence) |
|
```