File size: 6,959 Bytes
1842ebd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
{
"run_name": "Wav2Vec-fine-tuning-TEDx",
"run_description": "Fine tuning TEDx",
"seed": 42,
// AUDIO PARAMS
"sampling_rate": 16000,
// VOCABULARY PARAMETERS
"vocab":{
"vocab_path": "example/vocab_example_ru.json", // generic vocab for Russian
"blank": "<pad>", // blank token for padding
"silence": "|", // token between words
"unk": "<unk>" // unk token
},
// TRAINING
"batch_size": 16, // Batch size for training.
"mixed_precision": true, // Whether to use automatic mixed FP16/FP32 precision (AMP) during training. NOTE(review): the earlier wording described an NVIDIA apex optimization level ("O1"), but the value here is a boolean - confirm against the config loader.
"early_stop_epochs": 10, // 0 disables early stopping; otherwise, the number of epochs without a decrease in validation loss before training stops.
"preprocess_dataset": false, // If true, the dataset is pre-processed and saved to disk; otherwise the audio files are loaded at each step. Preprocessing makes training faster, but requires much more disk space.
// OPTIMIZER
"epochs": 140, // total number of epochs to train.
"lr": 0.00003, // Initial learning rate.
"gradient_accumulation_steps": 12,
// LOGGING
"logging_steps": 100, // Number of steps to plot.
"load_best_model_at_end": true,
"save_total_limit": 3,
"warmup_ratio": 0.04761904762142857, // 0 disables it. Ratio of total training steps used for a linear warmup from 0 to learning_rate.
"warmup_steps": 0, // 0 disables it. Number of steps used for a linear warmup from 0 to learning_rate.
// DATA LOADING
"num_loader_workers": 4, // Number of training data loader processes. Don't set it too big; 4-8 are good values.
// MODEL
"freeze_feature_extractor": true, // Whether to freeze the feature extractor layers of the model.
"attention_dropout": 0.1, // The dropout ratio for the attention probabilities.
"activation_dropout": 0.1, // The dropout ratio for activations inside the fully connected layer.
"hidden_dropout": 0.1, // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
"feat_proj_dropout": 0.1, // The dropout probability for all 1D convolutional layers in the feature extractor.
"mask_time_prob": 0.05, // Probability of each feature vector along the time axis to be chosen as the start of the vector span to be masked.
"layerdrop": 0.0, // The LayerDrop probability.
"gradient_checkpointing": true, // If True, use gradient checkpointing to save memory at the expense of slower backward pass.
// ToDo: Implement Time mask and Frequency Mask
"audio_augmentation":[
// additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
{
"name": "additive",
"sounds_path":"/raid/datasets/DA/musan/speech/", // download: https://www.openslr.org/17/
"lru_cache_size": 32, // Maximum size of the LRU cache for storing noise files in memory
"min_snr_in_db": 13.0,
"max_snr_in_db": 20.0,
// "sample_rate": 16000,
"p": 0.25
},
{
"name": "additive",
"sounds_path":"/raid/datasets/DA/musan/music/", // download: https://www.openslr.org/17/
"lru_cache_size": 32, // Maximum size of the LRU cache for storing noise files in memory
"min_snr_in_db": 5.0,
"max_snr_in_db": 15.0,
// "sample_rate": 16000,
"p": 0.25
},
{
"name": "additive",
"sounds_path":"/raid/datasets/DA/musan/noise/", // download: https://www.openslr.org/17/
"lru_cache_size": 32, // Maximum size of the LRU cache for storing noise files in memory
"min_snr_in_db": 0.0,
"max_snr_in_db": 15.0,
// "sample_rate": 16000,
"p": 0.25
},
// rir filter proposed by: https://ieeexplore.ieee.org/document/7953152
{
"name": "rir",
"ir_path": "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/", // download: https://www.openslr.org/28/
"lru_cache_size": 128, // Maximum size of the LRU cache for storing noise files in memory
// "sample_rate": 16000,
"p": 0.25
}
,
// {
// "name": "gain",
// "min_gain_in_db": -18.0,
// "max_gain_in_db": 6,
// "p": 0.25 // probability of applying this method; 0 disables it
// },
{
"name": "pitch_shift",
"min_semitones": -4,
"max_semitones": 4,
"p": 0.25 // probability of applying this method; 0 disables it
},
{
"name": "gaussian",
"min_amplitude": 0.0001,
"max_amplitude": 0.001,
"p": 0.25 // probability of applying this method; 0 disables it
}
],
// PATHS
"output_path": "../checkpoints/YourTTS2ASR/Wav2Vec-voxpopuli/one-speaker/just-TTS/RU/140-epoch-high-bs/",
// CACHE
"dataset_cache": "../datasets/ru-cache-high-bs/",
// DATASETS
"datasets":{
"files_path": "/raid/datasets/Mailabs/ru/", // base path for the audio files; it will be joined with the relative file paths from the CSV
"train":
[
// these dicts are passed directly to load_dataset; see the documentation: https://huggingface.co/docs/datasets/package_reference/loading_methods.html#datasets.load_dataset
{
"name": "csv",
"path": "csv",
"data_files": ["/raid/datasets/Mailabs/ru/train_converted.csv"], // csv files
"text_column": "text",
"path_column": "file_path"
}
]
,
"devel":
[
{
"name": "csv",
"path": "csv",
"data_files": ["/raid/datasets/Mailabs/ru/dev_converted.csv"], // csv files
"text_column": "text",
"path_column": "file_path"
}
]
,
"test":
{
"name": "csv",
"path": "csv",
"data_files": ["/raid/datasets/DA/Common_Voice/cv-corpus-7.0-2021-07-21/ru/test_converted.csv"], // csv files
"text_column": "text",
"path_column": "file_path"
}
}//,
// used only for test
// "KenLM":{
// "kenlm_model_path": "../../kenLM/binaries/subtitle/4-gram/lm.binary", // Path for KenLM model
// "lexicon_path": "example/lexicon.lst", // file with all words, used to limit the decoder search
// "beam": 2048,
// "nbest": 1,
// "beam_threshold": 25,
// "lm_weight": 1,
// "word_score": -1,
// "sil_weight": 0
// }
}
|