# Generated 2022-08-16 from:
# /netscratch/sagar/thesis/speechbrain/recipes/CommonVoice_de/Tokenizer/hparams/1K_unigram_subword_bpe.yaml
# yamllint disable
# ############################################################################
# Tokenizer: subword BPE with unigram 1K
# Training: German CommonVoice 1,211 hrs
# Authors: Abdel Heba 2021
# ############################################################################
# NOTE(review): header says "1K" but token_output/vocab_size are 5000 — confirm
# which is intended.

token_type: unigram  # ["unigram", "bpe", "char"]
output_folder: results/unigram
train_log: results/unigram/train_log.txt

# Data files
data_folder: ../CommonVoice/
csv_dir: ../cv_de_acc
train_tsv_file: ../CommonVoice//train.tsv
dev_tsv_file: ../CommonVoice//dev.tsv
test_tsv_file: ../CommonVoice//test.tsv
accented_letters: true
language: de
skip_prep: false
# train_splits: ["train-clean-100", "train-clean-360", "train-other-500"]
# dev_splits: ["dev-clean"]
# test_splits: ["test-clean", "test-other"]
train_csv: ../cv_de_acc/train.csv
valid_csv: ../cv_de_acc/dev.csv

# Training parameters
token_output: 5000  # index(blank/eos/bos/unk) = 0
character_coverage: 1.0
csv_read: wrd

# HyperPyYAML partial: keyword arguments must be nested under the !name: tag
# so SentencePiece is constructed with them at load time.
tokenizer: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
  model_dir: results/unigram
  vocab_size: 5000
  annotation_train: ../cv_de_acc/train.csv
  annotation_read: wrd
  model_type: unigram  # ["unigram", "bpe", "char"]
  character_coverage: 1.0
  annotation_list_to_check: [../cv_de_acc/train.csv, ../cv_de_acc/dev.csv]