#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Command-line smoke test: transcribe one audio file with a local recognizer.

The script converts the input audio into a scratch copy, loads a recognizer
through ``toolbox.k2_sherpa.nn_models`` and prints the decoded text.
"""
import argparse
import os
from pathlib import Path
import sys
import tempfile

# Make the repository root importable when the script is run directly;
# this has to happen before the project-local imports below.
pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../../"))

import librosa
import numpy as np
import sherpa
from scipy.io import wavfile
import torch
import torchaudio

from project_settings import project_path, temp_directory
from toolbox.k2_sherpa.utils import audio_convert
from toolbox.k2_sherpa import decode, nn_models


def get_args():
    """Parse the command-line options of this script.

    Options:
        --model_dir: directory holding the model files; defaults to the
            ``csukuangfj/wenet-chinese-model`` folder under ``project_path``.
        --in_filename: audio file to transcribe; defaults to a bundled
            test wav under ``project_path``.
        --sample_rate: integer, default 16000. It is parsed here but not
            read by ``main()`` in this script.

    Returns:
        argparse.Namespace: the parsed arguments.
    """
    default_model_dir = (
        project_path / "pretrained_models/huggingface/csukuangfj/wenet-chinese-model"
    ).as_posix()
    default_in_filename = (
        project_path / "data/test_wavs/paraformer-zh/si_chuan_hua.wav"
    ).as_posix()

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default=default_model_dir, type=str)
    parser.add_argument("--in_filename", default=default_in_filename, type=str)
    parser.add_argument("--sample_rate", default=16000, type=int)
    return parser.parse_args()


def main():
    """Convert the input audio, load the recognizer, decode and print the text.

    Steps:
        1. Pass the input file through ``audio_convert`` and write the result
           to ``<system temp dir>/asr/<input file name>``.
        2. Build a recognizer from the first ``"Chinese"`` entry of
           ``nn_models.model_map``, resolving its model and token files
           relative to ``--model_dir``.
        3. Decode the converted file and print ``text: <result>``.
    """
    args = get_args()

    # --- audio convert -----------------------------------------------------
    source_audio = Path(args.in_filename)
    converted_audio = Path(tempfile.gettempdir()) / "asr" / source_audio.name
    scratch_dir = converted_audio.parent
    scratch_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the exact conversion performed (format / sample rate) is
    # defined by audio_convert and is not visible from this script.
    audio_convert(
        in_filename=source_audio.as_posix(),
        out_filename=converted_audio.as_posix(),
    )

    # --- load recognizer ---------------------------------------------------
    model_spec = nn_models.model_map["Chinese"][0]
    model_root = Path(args.model_dir)
    model_path = model_root / model_spec["nn_model_file"]
    tokens_path = model_root / model_spec["tokens_file"]

    recognizer = nn_models.load_recognizer(
        repo_id=model_spec["repo_id"],
        nn_model_file=model_path.as_posix(),
        tokens_file=tokens_path.as_posix(),
        sub_folder=model_spec["sub_folder"],
        local_model_dir=model_root,
        recognizer_type=model_spec["recognizer_type"],
        decoding_method="greedy_search",
        num_active_paths=2,
    )

    # --- decode ------------------------------------------------------------
    transcript = decode.decode_by_recognizer(
        recognizer=recognizer,
        filename=converted_audio.as_posix(),
    )
    print("text: {}".format(transcript))


if __name__ == "__main__":
    main()