# This module is from [WeNet](https://github.com/wenet-e2e/wenet).
#
# ## Citations
#
# ```bibtex
# @inproceedings{yao2021wenet,
#   title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
#   author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
#   booktitle={Proc. Interspeech},
#   year={2021},
#   address={Brno, Czech Republic},
#   organization={IEEE}
# }
#
# @article{zhang2022wenet,
#   title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit},
#   author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei},
#   journal={arXiv preprint arXiv:2203.15455},
#   year={2022}
# }
# ```
import torch

from modules.wenet_extractor.cif.predictor import Predictor
from modules.wenet_extractor.efficient_conformer.encoder import (
    EfficientConformerEncoder,
)
from modules.wenet_extractor.paraformer.paraformer import Paraformer
from modules.wenet_extractor.squeezeformer.encoder import SqueezeformerEncoder
from modules.wenet_extractor.transducer.joint import TransducerJoint
from modules.wenet_extractor.transducer.predictor import (
    ConvPredictor,
    EmbeddingPredictor,
    RNNPredictor,
)
from modules.wenet_extractor.transducer.transducer import Transducer
from modules.wenet_extractor.transformer.asr_model import ASRModel
from modules.wenet_extractor.transformer.cmvn import GlobalCMVN
from modules.wenet_extractor.transformer.ctc import CTC
from modules.wenet_extractor.transformer.decoder import (
    BiTransformerDecoder,
    TransformerDecoder,
)
from modules.wenet_extractor.transformer.encoder import (
    ConformerEncoder,
    TransformerEncoder,
)
from modules.wenet_extractor.utils.cmvn import load_cmvn
def init_model(configs):
    """Instantiate an ASR model from a WeNet-style configuration dict.

    Depending on the keys present in ``configs``, one of three model
    families is built:

    * ``"predictor" in configs``  -> ``Transducer`` (RNN-T style)
    * ``"paraformer" in configs`` -> ``Paraformer``
    * otherwise                   -> joint CTC/attention ``ASRModel``

    Args:
        configs (dict): parsed model configuration. Reads ``cmvn_file``,
            ``is_json_cmvn``, ``input_dim``, ``output_dim``, ``encoder``,
            ``encoder_conf``, ``decoder``, ``decoder_conf``, ``model_conf``
            and, per model family, ``predictor``/``predictor_conf``/
            ``joint_conf`` or ``cif_predictor_conf``. NOTE: for transducer
            models, ``predictor_conf`` and ``joint_conf`` are mutated in
            place to record derived sizes.

    Returns:
        torch.nn.Module: the constructed model.

    Raises:
        NotImplementedError: if the transducer predictor type is not one of
            ``rnn``, ``embedding`` or ``conv``.
    """
    # Optional global CMVN front-end; skipped when no cmvn file is given
    # (use .get() so a missing key means "no CMVN" instead of KeyError).
    if configs.get("cmvn_file") is not None:
        mean, istd = load_cmvn(configs["cmvn_file"], configs["is_json_cmvn"])
        global_cmvn = GlobalCMVN(
            torch.from_numpy(mean).float(), torch.from_numpy(istd).float()
        )
    else:
        global_cmvn = None

    input_dim = configs["input_dim"]
    vocab_size = configs["output_dim"]

    encoder_type = configs.get("encoder", "conformer")
    decoder_type = configs.get("decoder", "bitransformer")

    if encoder_type == "conformer":
        encoder = ConformerEncoder(
            input_dim, global_cmvn=global_cmvn, **configs["encoder_conf"]
        )
    elif encoder_type == "squeezeformer":
        encoder = SqueezeformerEncoder(
            input_dim, global_cmvn=global_cmvn, **configs["encoder_conf"]
        )
    elif encoder_type == "efficientConformer":
        encoder = EfficientConformerEncoder(
            input_dim,
            global_cmvn=global_cmvn,
            **configs["encoder_conf"],
            # Extra options live in an optional nested ``efficient_conf``
            # sub-dict; splat an empty dict when it is absent.
            **configs["encoder_conf"].get("efficient_conf", {}),
        )
    else:
        # Any other encoder value falls back to a plain transformer encoder.
        encoder = TransformerEncoder(
            input_dim, global_cmvn=global_cmvn, **configs["encoder_conf"]
        )

    if decoder_type == "transformer":
        decoder = TransformerDecoder(
            vocab_size, encoder.output_size(), **configs["decoder_conf"]
        )
    else:
        # Bidirectional decoder needs right-to-left blocks and a strictly
        # fractional reverse weight.
        assert 0.0 < configs["model_conf"]["reverse_weight"] < 1.0
        assert configs["decoder_conf"]["r_num_blocks"] > 0
        decoder = BiTransformerDecoder(
            vocab_size, encoder.output_size(), **configs["decoder_conf"]
        )

    ctc = CTC(vocab_size, encoder.output_size())

    # Init joint CTC/Attention or Transducer model
    if "predictor" in configs:
        predictor_type = configs.get("predictor", "rnn")
        if predictor_type == "rnn":
            # RNN predictor config already declares its own output_size.
            predictor = RNNPredictor(vocab_size, **configs["predictor_conf"])
        elif predictor_type == "embedding":
            predictor = EmbeddingPredictor(vocab_size, **configs["predictor_conf"])
            # Embedding/conv predictors emit embed_size-dim features; record
            # that as output_size so the joint network can size its input.
            configs["predictor_conf"]["output_size"] = configs["predictor_conf"][
                "embed_size"
            ]
        elif predictor_type == "conv":
            predictor = ConvPredictor(vocab_size, **configs["predictor_conf"])
            configs["predictor_conf"]["output_size"] = configs["predictor_conf"][
                "embed_size"
            ]
        else:
            raise NotImplementedError("only rnn, embedding and conv type support now")
        # The joint network consumes both encoder and predictor outputs.
        configs["joint_conf"]["enc_output_size"] = configs["encoder_conf"][
            "output_size"
        ]
        configs["joint_conf"]["pred_output_size"] = configs["predictor_conf"][
            "output_size"
        ]
        joint = TransducerJoint(vocab_size, **configs["joint_conf"])
        model = Transducer(
            vocab_size=vocab_size,
            blank=0,  # blank label id is fixed to 0
            predictor=predictor,
            encoder=encoder,
            attention_decoder=decoder,
            joint=joint,
            ctc=ctc,
            **configs["model_conf"],
        )
    elif "paraformer" in configs:
        predictor = Predictor(**configs["cif_predictor_conf"])
        model = Paraformer(
            vocab_size=vocab_size,
            encoder=encoder,
            decoder=decoder,
            ctc=ctc,
            predictor=predictor,
            **configs["model_conf"],
        )
    else:
        model = ASRModel(
            vocab_size=vocab_size,
            encoder=encoder,
            decoder=decoder,
            ctc=ctc,
            lfmmi_dir=configs.get("lfmmi_dir", ""),
            **configs["model_conf"],
        )
    return model