# ############################################################################
# Tokenizer training hyperparameters (SpeechBrain / HyperPyYAML).
# Trains a character-level SentencePiece model on the MATBN_SEG corpus,
# reading transcriptions from the JSON manifests produced by the prepare step.
# ############################################################################

# Folders
dataset_folder: /home/wayne/CORPUS/MATBN_SEG
prepare_folder: results/prepare_seg
output_folder: results/tokenizer_seg_bpe5k_char

# NOTE(review): presumably consumed by the training recipe to decide whether
# <unk> tokens are kept in the target text — confirm against the caller.
keep_unk: false

# Tokenizer settings (referenced below via !ref so each value is defined once)
token_type: char
token_output: 5000            # vocabulary size handed to SentencePiece
character_coverage: 1.0       # 1.0 = cover every character seen in training
annotation_read: transcription  # JSON field holding the text to tokenize

# Data manifests produced by the preparation step
train_json: !ref <prepare_folder>/train.json
dev_json: !ref <prepare_folder>/dev.json
eval_json: !ref <prepare_folder>/eval.json
test_json: !ref <prepare_folder>/test.json

# SentencePiece trainer; all duplicated literals replaced by !ref references
# to the single definitions above so the two can never drift apart.
tokenizer: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: !ref <output_folder>
    vocab_size: !ref <token_output>
    annotation_train: !ref <train_json>
    annotation_read: !ref <annotation_read>
    model_type: !ref <token_type>
    character_coverage: !ref <character_coverage>
    annotation_list_to_check:
        - !ref <dev_json>
        - !ref <eval_json>
        - !ref <test_json>
    annotation_format: json
    bos_id: 1
    eos_id: 2