# Wealthmate / speech_transcribe / speech_transcribe.py
# kennethduong's picture
# initial config
# 2013214
import torch
from transformers import AutoProcessor, AutoModelForCTC
import librosa
# Hugging Face checkpoint identifier shared by the processor and the model.
_CHECKPOINT = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"

# Both objects are created once, at import time, and referenced as module
# globals by transcribe().
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForCTC.from_pretrained(_CHECKPOINT)
def transcribe(audio_data, sampling_rate):
    """Transcribe a 16 kHz waveform with the module-level CTC model.

    Args:
        audio_data: Waveform object exposing ``.shape`` and ``.mean(axis=...)``
            (e.g. a NumPy array). Input with more than one dimension is
            downmixed by averaging over the channel axis, which is taken to
            be the shorter of the first two axes, so both
            ``(samples, channels)`` and ``(channels, samples)`` layouts work.
        sampling_rate: Sample rate of ``audio_data`` in Hz; must be 16000.

    Returns:
        The first (and, for a single waveform, only) decoded transcription.

    Raises:
        ValueError: If ``sampling_rate`` is not 16000.
    """
    # Fail fast: reject an unsupported rate before touching the audio.
    if sampling_rate != 16000:
        raise ValueError("Sampling rate phải là 16 kHz")

    if len(audio_data.shape) > 1:
        # Audio has far more samples than channels, so the shorter axis is
        # the channel axis. Ties keep the historical axis=1 behaviour.
        channel_axis = 0 if audio_data.shape[0] < audio_data.shape[1] else 1
        audio_data = audio_data.mean(axis=channel_axis)

    inputs = processor(
        audio_data,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding="longest",
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values=inputs.input_values).logits

    # Greedy decoding: pick the highest-scoring token id at every frame and
    # let the processor turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]
def transcribe_file(file_path):
    """Load an audio file via librosa at 16000 Hz and return its transcription.

    Args:
        file_path: Path of the audio file, passed straight to ``librosa.load``.

    Returns:
        Whatever ``transcribe`` returns for the loaded waveform.
    """
    waveform, rate = librosa.load(file_path, sr=16000)
    text = transcribe(waveform, rate)
    return text
# if __name__ == "__main__":
#     file_path = "vn.wav"
#     result = transcribe_file(file_path)
#     print(result)