# Generated 2022-07-09 from:
# /notebooks/speechbrain/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml
# yamllint disable
# ################################
# Model: LSTM (encoder) + GRU (decoder) (tokenized)
# Authors:
#  Loren Lugosch & Mirco Ravanelli 2020
#  Artem Ploujnikov 2021
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Tokenizers
char_tokenize: False
char_token_type: unigram  # ["unigram", "bpe", "char"]
char_token_output: 512
char_token_wordwise: True
phn_tokenize: False
phn_token_type: unigram  # ["unigram", "bpe", "char"]
phn_token_output: 512  # index(blank/eos/bos/unk) = 0
phn_token_wordwise: True
character_coverage: 1.0

phonemes_count: 43
graphemes_count: 31
phonemes_enable_space: True

ctc_weight: 0.5
ctc_window_size: 0
homograph_loss_weight: 2.0

# Model parameters
output_neurons: !apply:speechbrain.utils.hparams.choice
  value: !ref <phn_tokenize>
  choices:
    True: !ref <phn_token_output> + 1
    False: !ref <phonemes_count>

enc_num_embeddings: !apply:speechbrain.utils.hparams.choice
  value: !ref <char_tokenize>
  choices:
    True: !ref <char_token_output> + 1
    False: !ref <graphemes_count>

enc_dropout: 0.5
enc_neurons: 512
enc_num_layers: 4
dec_dropout: 0.5
dec_neurons: 512
dec_att_neurons: 256
dec_num_layers: 4
embedding_dim: 512

# Determines whether to use BOS (beginning-of-sequence) or EOS (end-of-sequence) tokens
# Available modes:
# raw: no BOS/EOS tokens are added
# bos: a beginning-of-sequence token is added
# eos: an end-of-sequence token is added
grapheme_sequence_mode: bos
phoneme_sequence_mode: bos

# Special Token information
bos_index: 0
eos_index: 1
blank_index: 2
unk_index: 2
token_space_index: 512

# Language Model
lm_emb_dim: 256 # dimension of the embeddings
lm_rnn_size: 512 # dimension of hidden layers
lm_layers: 2 # number of hidden layers
lm_output_neurons: 43

# Beam Searcher
beam_search_min_decode_ratio: 0
beam_search_max_decode_ratio: 1.0
beam_search_beam_size: 16
beam_search_beam_size_valid: 16
beam_search_eos_threshold: 10.0
beam_search_using_max_attn_shift: false
beam_search_max_attn_shift: 10
beam_search_coverage_penalty: 5.0
beam_search_lm_weight: 0.5
beam_search_ctc_weight_decode: 0.4
beam_search_temperature: 1.25
beam_search_temperature_lm: 1.0

# Word embeddings
use_word_emb: true
word_emb_model: bert-base-uncased
word_emb_dim: 768
word_emb_enc_dim: 256
word_emb_norm_type: batch

graphemes:
- A
- B
- C
- D
- E
- F
- G
- H
- I
- J
- K
- L
- M
- N
- O
- P
- Q
- R
- S
- T
- U
- V
- W
- X
- Y
- Z
- "'"
- ' '

phonemes:
- AA
- AE
- AH
- AO
- AW
- AY
- B
- CH
- D
- DH
- EH
- ER
- EY
- F
- G
- HH
- IH
- IY
- JH
- K
- L
- M
- N
- NG
- OW
- OY
- P
- R
- S
- SH
- T
- TH
- UH
- UW
- V
- W
- Y
- Z
- ZH
- ' '

enc_input_dim: !apply:speechbrain.lobes.models.g2p.model.input_dim
  use_word_emb: !ref <use_word_emb>
  word_emb_enc_dim: !ref <word_emb_enc_dim>
  embedding_dim: !ref <embedding_dim>

phn_char_map: !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
  tokens: !ref <phonemes>

char_phn_map: !apply:speechbrain.lobes.models.g2p.dataio.flip_map
  map_dict: !ref <phn_char_map>

enc: !new:speechbrain.nnet.RNN.LSTM
  input_shape: [null, null, !ref <enc_input_dim>]
  bidirectional: True
  hidden_size: !ref <enc_neurons>
  num_layers: !ref <enc_num_layers>
  dropout: !ref <enc_dropout>

lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <dec_neurons>
  n_neurons: !ref <output_neurons>
  bias: false

ctc_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref 2 * <enc_neurons>
  n_neurons: !ref <output_neurons>

encoder_emb: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: !ref <enc_num_embeddings>
  embedding_dim: !ref <embedding_dim>

emb: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: !ref <output_neurons>
  embedding_dim: !ref <embedding_dim>

dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: !ref <enc_neurons> * 2
  input_size: !ref <embedding_dim>
  rnn_type: gru
  attn_type: content
  dropout: !ref <dec_dropout>
  hidden_size: !ref <dec_neurons>
  attn_dim: !ref <dec_att_neurons>
  num_layers: !ref <dec_num_layers>
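
# word_emb_enc (below) projects the raw BERT word embeddings (word_emb_dim:
# 768) down to word_emb_enc_dim (256). A likely reading of the
# g2p.model.input_dim helper above is that these projected vectors are
# concatenated with the 512-dimensional grapheme embeddings whenever
# use_word_emb is true, making the LSTM encoder input 768-dimensional.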
word_emb_enc: !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
  word_emb_dim: !ref <word_emb_dim>
  word_emb_enc_dim: !ref <word_emb_enc_dim>
  norm_type: batch

word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
    model: bert-base-uncased

log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true

model: !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
  enc: !ref <enc>
  encoder_emb: !ref <encoder_emb>
  emb: !ref <emb>
  dec: !ref <dec>
  lin: !ref <lin>
  out: !ref <log_softmax>
  use_word_emb: !ref <use_word_emb>
  word_emb_enc: !ref <word_emb_enc>

modules:
  model: !ref <model>
  enc: !ref <enc>
  encoder_emb: !ref <encoder_emb>
  emb: !ref <emb>
  dec: !ref <dec>
  lin: !ref <lin>
  ctc_lin: !ref <ctc_lin>
  out: !ref <log_softmax>
  word_emb: !ref <word_emb>
  word_emb_enc: !ref <word_emb_enc>

lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
  embedding_dim: !ref <lm_emb_dim>
  rnn_layers: !ref <lm_layers>
  rnn_neurons: !ref <lm_rnn_size>
  output_neurons: !ref <lm_output_neurons>
  return_hidden: True

ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
  eos_index: !ref <eos_index>
  blank_index: !ref <blank_index>
  ctc_fc: !ref <ctc_lin>
  ctc_window_size: !ref <ctc_window_size>

coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
  vocab_size: !ref <output_neurons>

scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
  full_scorers: [!ref <coverage_scorer>, !ref <ctc_scorer>]
  weights:
    coverage: !ref <beam_search_coverage_penalty>
    ctc: !ref <beam_search_ctc_weight_decode>

beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: !ref <emb>
  decoder: !ref <dec>
  linear: !ref <lin>
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <beam_search_min_decode_ratio>
  max_decode_ratio: !ref <beam_search_max_decode_ratio>
  beam_size: !ref <beam_search_beam_size>
  eos_threshold: !ref <beam_search_eos_threshold>
  using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
  max_attn_shift: !ref <beam_search_max_attn_shift>
  temperature: !ref <beam_search_temperature>
  scorer: !ref <scorer>

beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: !ref <emb>
  decoder: !ref <dec>
  linear: !ref <lin>
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <beam_search_min_decode_ratio>
  max_decode_ratio: !ref <beam_search_max_decode_ratio>
  beam_size: !ref <beam_search_beam_size_valid>
  eos_threshold: !ref <beam_search_eos_threshold>
  using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
  max_attn_shift: !ref <beam_search_max_attn_shift>
  temperature: !ref <beam_search_temperature>
  scorer: !ref <scorer>

homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor

model_output_keys:
- p_seq
- char_lens
- encoder_out

grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder

grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: grapheme_tokenizer
    bos_id: !ref <bos_index>
    eos_id: !ref <eos_index>
    unk_id: !ref <unk_index>
    vocab_size: !ref <char_token_output>
    annotation_train: null
    annotation_read: char
    model_type: !ref <char_token_type>  # ["unigram", "bpe", "char"]
    character_coverage: !ref <character_coverage>
    annotation_format: json
    text_file: grapheme_annotations.txt

phoneme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: phoneme_tokenizer
    bos_id: !ref <bos_index>
    eos_id: !ref <eos_index>
    unk_id: !ref <unk_index>
    vocab_size: !ref <phn_token_output>
    annotation_train: null
    annotation_read: phn
    model_type: !ref <phn_token_type>  # ["unigram", "bpe", "char"]
    character_coverage: !ref <character_coverage>
    annotation_format: json
    text_file: null

out_phoneme_decoder_tok: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
  tokenizer: !ref <phoneme_tokenizer>
  char_map: !ref <char_phn_map>
  token_space_index: !ref <token_space_index>
  wordwise: !ref <phn_token_wordwise>

out_phoneme_decoder_raw: !name:speechbrain.lobes.models.g2p.dataio.text_decode
  encoder: !ref <phoneme_encoder>

out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
  value: false
  choices:
    True: !ref <out_phoneme_decoder_tok>
    False: !ref <out_phoneme_decoder_raw>

encode_pipeline:
  batch: false
  use_padded_data: true
  output_keys:
  - grapheme_list
  - grapheme_encoded_list
  - grapheme_encoded
  - word_emb
  init:
  - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
    encoder: !ref <grapheme_encoder>
    tokens: !ref <graphemes>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
  - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
    encoder: !ref <phoneme_encoder>
    tokens: !ref <phonemes>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
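  # The steps below run in order on each sample: the raw text is cleaned,
  # encoded into grapheme IDs, wrapped with a BOS token (per
  # grapheme_sequence_mode), and, when use_word_emb is true, paired with
  # BERT word embeddings for the encoder.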
  steps:
  - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
    graphemes: !ref <graphemes>
    takes: txt
    provides: txt_cleaned
  - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
    grapheme_encoder: !ref <grapheme_encoder>
    takes: txt_cleaned
    provides:
    - grapheme_list
    - grapheme_encoded_list
    - grapheme_encoded_raw
  - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
    encoder: !ref <grapheme_encoder>
    takes: grapheme_encoded_list
    provides:
    - grapheme_encoded
    - grapheme_len
    - grapheme_encoded_eos
    - grapheme_len_eos
  - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
    word_emb: !ref <word_emb>
    grapheme_encoder: !ref <grapheme_encoder>
    use_word_emb: !ref <use_word_emb>
    takes:
    - txt
    - grapheme_encoded
    - grapheme_len
    provides: word_emb

decode_pipeline:
  batch: true
  output_keys:
  - phonemes
  steps:
  - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
    beam_searcher: !ref <beam_searcher>
    takes:
    - char_lens
    - encoder_out
    provides:
    - hyps
    - scores
  - func: !apply:speechbrain.utils.hparams.choice
      value: false
      choices:
        True: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
          tokenizer: !ref <phoneme_tokenizer>
          char_map: !ref <char_phn_map>
          token_space_index: !ref <token_space_index>
          wordwise: !ref <phn_token_wordwise>
        False: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
          phoneme_encoder: !ref <phoneme_encoder>
    takes:
    - hyps
    provides:
    - phonemes

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    model: !ref <model>
    ctc_lin: !ref <ctc_lin>
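
# A minimal sketch of how a HyperPyYAML file like this one is typically
# loaded (the real entry point lives in the recipe's train script; the
# file name below is illustrative):
#
#   from hyperpyyaml import load_hyperpyyaml
#
#   with open("hparams_g2p_rnn.yaml") as fin:
#       hparams = load_hyperpyyaml(fin)
#
#   # Each top-level key becomes an entry in the resulting dict, with
#   # !new: objects already instantiated: hparams["model"] is the
#   # AttentionSeq2Seq module, hparams["beam_searcher"] the beam search.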