import os
from functools import lru_cache
from typing import Optional, Union

import tensorflow as tf
from huggingface_hub import hf_hub_download
from hyperpyyaml import load_hyperpyyaml

# Hide every CUDA device from this process so TensorFlow runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"


def _get_checkpoint_filename(
    repo_id: str,
    filename: str,
    local_dir: Optional[str] = None,
    local_dir_use_symlinks: Union[bool, str] = "auto",
    subfolder: str = "checkpoints",
) -> str:
    """Download one checkpoint file from the Hub and return its local path.

    Args:
        repo_id: Hugging Face Hub repository id.
        filename: Name of the file inside ``subfolder``.
        local_dir: Optional directory to place the file in; forwarded as-is.
        local_dir_use_symlinks: Forwarded unchanged to ``hf_hub_download``.
        subfolder: Repository subfolder holding the checkpoint files.

    Returns:
        The path returned by ``hf_hub_download``.
    """
    model_filename = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
        local_dir=local_dir,
        local_dir_use_symlinks=local_dir_use_symlinks,
    )
    return model_filename


def _get_bpe_model_filename(
    repo_id: str,
    filename: str,
    local_dir: Optional[str] = None,
    local_dir_use_symlinks: Union[bool, str] = "auto",
    subfolder: str = "vocabs",
) -> str:
    """Download one subword-vocabulary file from the Hub and return its path.

    Same as ``_get_checkpoint_filename`` except that ``subfolder`` defaults
    to ``"vocabs"``.

    Args:
        repo_id: Hugging Face Hub repository id.
        filename: Name of the file inside ``subfolder``.
        local_dir: Optional directory to place the file in; forwarded as-is.
        local_dir_use_symlinks: Forwarded unchanged to ``hf_hub_download``.
        subfolder: Repository subfolder holding the vocabulary files.

    Returns:
        The path returned by ``hf_hub_download``.
    """
    bpe_model_filename = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
        local_dir=local_dir,
        local_dir_use_symlinks=local_dir_use_symlinks,
    )
    return bpe_model_filename


@lru_cache(maxsize=1)
def _get_conformer_pre_trained_model(repo_id: str, checkpoint_dir: str = "checkpoints"):
    """Download the pre-trained artifacts, build the model and load weights.

    Checkpoint shards, subword vocabulary files and ``config.yaml`` are
    fetched next to this module. The config is then instantiated with
    HyperPyYAML and the checkpoint weights are restored into the model.
    The result of the most recent call is memoized by ``lru_cache``.

    Args:
        repo_id: Hugging Face Hub repository id.
        checkpoint_dir: Repository subfolder containing the checkpoint shards.

    Returns:
        A tuple ``(audio_encoder, encoder_model, searcher, model)`` taken from
        the ``audio_encoder``, ``encoder_model``, ``decoder`` and ``model``
        entries of the loaded config.
    """
    module_dir = os.path.dirname(__file__)
    checkpoint_prefix = "avg_top5_27-32.ckpt"

    shard_paths = []
    for postfix in ("index", "data-00000-of-00001"):
        shard_path = _get_checkpoint_filename(
            repo_id=repo_id,
            filename="{}.{}".format(checkpoint_prefix, postfix),
            subfolder=checkpoint_dir,
            local_dir=module_dir,  # noqa
            local_dir_use_symlinks=True,
        )
        print(shard_path)
        shard_paths.append(shard_path)

    for postfix in ("model", "vocab"):
        vocab_path = _get_bpe_model_filename(
            repo_id=repo_id,
            filename="subword_vietnamese_500.{}".format(postfix),
            local_dir=module_dir,  # noqa
            local_dir_use_symlinks=True,
        )
        print(vocab_path)

    config_path = hf_hub_download(
        repo_id=repo_id,
        filename="config.yaml",
        local_dir=module_dir,  # noqa
        local_dir_use_symlinks=True,
    )
    print(config_path)

    with open(config_path, "r") as f:
        config = load_hyperpyyaml(f)

    encoder_model = config["encoder_model"]
    searcher = config["decoder"]
    model = config["model"]
    audio_encoder = config["audio_encoder"]

    # Build the weights prefix from where the shards were actually saved.
    # A bare ``checkpoint_dir`` would be resolved against the current working
    # directory and break whenever the process is started from elsewhere.
    weights_prefix = os.path.join(os.path.dirname(shard_paths[0]), checkpoint_prefix)
    model.load_weights(weights_prefix).expect_partial()

    return audio_encoder, encoder_model, searcher, model


def read_audio(in_filename: str) -> tf.Tensor:
    """Read a WAV file and return its samples with a leading batch axis.

    The channel axis is squeezed away, so the file must be single-channel;
    ``tf.squeeze`` raises for audio with more than one channel.

    Args:
        in_filename: Path to the WAV file.

    Returns:
        The decoded samples with the last (channel) axis removed and a new
        axis of size 1 inserted at position 0.
    """
    audio = tf.io.read_file(in_filename)
    audio = tf.audio.decode_wav(audio)[0]  # [0] is the audio tensor
    audio = tf.expand_dims(tf.squeeze(audio, axis=-1), axis=0)
    return audio


class UETASRModel:
    """Wrapper that runs the pre-trained model on a single WAV file."""

    def __init__(self, repo_id: str):
        """Load (or reuse the cached) model components for ``repo_id``."""
        (
            self.featurizer,
            self.encoder_model,
            self.searcher,
            self.model,
        ) = _get_conformer_pre_trained_model(repo_id)

    def predict(self, in_filename: str) -> str:
        """Transcribe the WAV file at ``in_filename``.

        Args:
            in_filename: Path to a single-channel WAV file.

        Returns:
            The searcher output converted to ``str``.
        """
        inputs = read_audio(in_filename)
        features = self.featurizer(inputs)
        if self.model.use_cmvn:
            features = self.model.cmvn(features)

        # Single utterance, so the mask covers every frame on axis 1.
        num_frames = tf.shape(features)[1]
        mask = tf.sequence_mask([num_frames], maxlen=num_frames)
        mask = tf.expand_dims(mask, axis=1)

        encoder_outputs, encoder_masks = self.encoder_model(
            features, mask, training=False
        )

        # Count the positions the encoder mask keeps to get the output length.
        encoder_mask = tf.squeeze(encoder_masks, axis=1)
        features_length = tf.math.reduce_sum(
            tf.cast(encoder_mask, tf.int32), axis=1
        )

        outputs = self.searcher(encoder_outputs, features_length)
        outputs = tf.compat.as_str_any(outputs.numpy())
        return outputs