#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Transcribe one audio file with a sherpa offline recognizer.

The script loads a TorchScript model (``final.zip``) and its token table
(``units.txt``) from ``--model_dir``, resamples ``--filename`` to
``--sample_rate``, writes it out as a 16-bit PCM wav in the project's
temporary directory, decodes that file with greedy search and prints the
lower-cased transcript.
"""
import argparse
import os
from pathlib import Path
import sys

pwd = os.path.abspath(os.path.dirname(__file__))
# Make the repository root importable so that ``project_settings`` resolves.
sys.path.append(os.path.join(pwd, "../../"))

import librosa
import numpy as np
import sherpa
from scipy.io import wavfile
import torch
import torchaudio

from project_settings import project_path, temp_directory


# Bounds of a signed 16-bit PCM sample.
_INT16_MIN = -32768
_INT16_MAX = 32767


def get_args():
    """Parse the command-line arguments.

    Returns:
        argparse.Namespace with ``model_dir`` (str), ``filename`` (str) and
        ``sample_rate`` (int).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        default=(project_path / "pretrained_models/huggingface/csukuangfj/wenet-chinese-model").as_posix(),
        type=str,
    )
    parser.add_argument(
        "--filename",
        default=(project_path / "data/test_wavs/paraformer-zh/si_chuan_hua.wav").as_posix(),
        type=str,
    )
    parser.add_argument("--sample_rate", default=16000, type=int)
    args = parser.parse_args()
    return args


def _float_to_int16(signal):
    """Scale a floating-point waveform by 32768 and convert it to int16.

    The scaled values are clipped to the int16 range before the cast: a
    full-scale sample of +1.0 scales to 32768, which does not fit in int16
    and would otherwise wrap around (or be undefined) on conversion.

    Args:
        signal: numpy array of floating-point samples.

    Returns:
        A new ``np.int16`` array of the same shape; the input is not modified.
    """
    scaled = np.clip(signal * 32768.0, _INT16_MIN, _INT16_MAX)
    # astype truncates toward zero, same as the np.array(..., dtype=int16) cast.
    return scaled.astype(np.int16)


def main():
    """Run offline recognition on the configured file and print the text."""
    args = get_args()

    model_dir = Path(args.model_dir)
    nn_model_file = model_dir / "final.zip"
    tokens_file = model_dir / "units.txt"

    print("nn_model_file: {}".format(nn_model_file))
    print("tokens_file: {}".format(tokens_file))

    # NOTE(review): normalize_samples=False presumably means the recognizer
    # expects samples in int16 range rather than [-1, 1] - confirm against
    # the sherpa FeatureConfig documentation.
    feat_config = sherpa.FeatureConfig(normalize_samples=False)
    feat_config.fbank_opts.frame_opts.samp_freq = args.sample_rate
    feat_config.fbank_opts.mel_opts.num_bins = 80
    feat_config.fbank_opts.frame_opts.dither = 0

    config = sherpa.OfflineRecognizerConfig(
        nn_model=nn_model_file.as_posix(),
        tokens=tokens_file.as_posix(),
        use_gpu=False,
        feat_config=feat_config,
        decoding_method="greedy_search",
        num_active_paths=2,
    )
    recognizer = sherpa.OfflineRecognizer(config)

    # Load (and resample to the requested rate), then convert to 16-bit PCM
    # so the audio can be handed to the recognizer as a wav file.
    signal, sample_rate = librosa.load(args.filename, sr=args.sample_rate)
    signal = _float_to_int16(signal)

    temp_file = temp_directory / "temp.wav"
    wavfile.write(
        temp_file.as_posix(),
        rate=args.sample_rate,
        data=signal,
    )

    s = recognizer.create_stream()
    s.accept_wave_file(temp_file.as_posix())
    recognizer.decode_stream(s)

    text = s.result.text.strip()
    text = text.lower()
    print("text: {}".format(text))


if __name__ == "__main__":
    main()