Spaces:
Running
Running
import torch | |
from transformers import AutoProcessor, AutoModelForCTC | |
import librosa | |
processor = AutoProcessor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h") | |
model = AutoModelForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h") | |
def transcribe(audio_data, sampling_rate): | |
if len(audio_data.shape) > 1: | |
audio_data = audio_data.mean(axis=1) | |
if sampling_rate != 16000: | |
raise ValueError("Sampling rate phải là 16 kHz") | |
inputs = processor(audio_data, | |
sampling_rate=sampling_rate, | |
return_tensors="pt", | |
padding="longest") | |
with torch.no_grad(): | |
logits = model(input_values=inputs.input_values).logits | |
predicted_ids = torch.argmax(logits, dim=-1) | |
transcription = processor.batch_decode(predicted_ids) | |
return transcription[0] | |
def transcribe_file(file_path): | |
audio_data, sampling_rate = librosa.load(file_path, sr=16000) | |
return transcribe(audio_data, sampling_rate) | |
# if __name__ == "__main__": | |
# file_path = "vn.wav" | |
# result = transcribe_file(file_path) | |
# print(result) |