File size: 7,006 Bytes
b96927a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
{
    "run_name": "Wav2Vec-fine-tuning-TEDx",
    "run_description": "Fine tuning TEDx",
    "seed": 42,
    // AUDIO PARAMS
    "sampling_rate": 16000,

    // VOCABULARY PARAMETERS
    "vocab":{
        "vocab_path": "example/vocab_example.json", // generic vocab for Portuguese
        "blank": "<pad>", // blank token for padding
        "silence": "|", // token between words
        "unk": "<unk>" // unk token
    },

    // TRAINING
    "batch_size": 8,       // Batch size for training.
    "mixed_precision": true,     // Automatic mixed FP16/FP32 precision (AMP). NOTE(review): this comment previously said that only NVIDIA apex optimization level O1 is supported and to use "O1" to activate, but the value set here is a boolean - confirm which form the trainer expects.
    "early_stop_epochs": 10, // If 0, early stopping is disabled; otherwise, the number of epochs without a decrease in validation loss after which training stops.
    "preprocess_dataset": false, // If true, the dataset will be pre-processed and saved to disk; otherwise the audio files will be loaded in each step. Preprocessing makes training faster, but requires much more disk space.

    // OPTIMIZER
    "epochs": 140,                // total number of epochs to train.
    "lr": 0.00003,                  // Initial learning rate.
    "gradient_accumulation_steps": 24, 

    // LOGGING
    "logging_steps": 100,    // Number of steps to plot.
    "load_best_model_at_end": true,
    "save_total_limit": 3,
    "warmup_ratio": 0.06666666667, // 0 disables. Ratio of total training steps used for a linear warmup from 0 to learning_rate.
    "warmup_steps": 0, // 0 disables. Number of steps used for a linear warmup from 0 to learning_rate.

    // DATA LOADING
    "num_loader_workers": 8,        // Number of training data loader processes. Don't set it too big; 4-8 are good values.
    
    // MODEL
    "freeze_feature_extractor": true, // Whether to freeze the feature extractor layers of the model.
    "attention_dropout": 0.1, // The dropout ratio for the attention probabilities.
    "activation_dropout": 0.1, // The dropout ratio for activations inside the fully connected layer.
    "hidden_dropout": 0.1, // The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    "feat_proj_dropout": 0.1, // The dropout probability for all 1D convolutional layers in the feature extractor.
    "mask_time_prob": 0.05, // Probability of each feature vector along the time axis being chosen as the start of the vector span to be masked.
    "layerdrop": 0.0,  // The LayerDrop probability. 
    "gradient_checkpointing": true,  // If True, use gradient checkpointing to save memory at the expense of slower backward pass.

    // ToDo: Implement Time mask and Frequency Mask
    "audio_augmentation":[
        // additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
        {
            "name": "additive",
            "sounds_path":"/raid/datasets/DA/musan/speech/", // download: https://www.openslr.org/17/
            "lru_cache_size": 32, // Maximum size of the LRU cache for storing noise files in memory
            "min_snr_in_db": 13.0,
            "max_snr_in_db": 20.0,
            // "sample_rate": 16000,
            "p": 0.25
        },
        {
            "name": "additive",
            "sounds_path":"/raid/datasets/DA/musan/music/", // download: https://www.openslr.org/17/
            "lru_cache_size": 32, // Maximum size of the LRU cache for storing noise files in memory
            "min_snr_in_db": 5.0,
            "max_snr_in_db": 15.0,
            // "sample_rate": 16000,
            "p": 0.25
        },
        {
            "name": "additive",
            "sounds_path":"/raid/datasets/DA/musan/noise/", // download: https://www.openslr.org/17/
            "lru_cache_size": 32, // Maximum size of the LRU cache for storing noise files in memory
            "min_snr_in_db": 0.0,
            "max_snr_in_db": 15.0,
            // "sample_rate": 16000,
            "p": 0.25
        },
        // rir filter proposed by: https://ieeexplore.ieee.org/document/7953152
        {
            "name": "rir",
            "ir_path": "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/", // download: https://www.openslr.org/28/
            "lru_cache_size": 128, // Maximum size of the LRU cache for storing noise files in memory
            // "sample_rate": 16000,
            "p": 0.25
        }
        , 
        // {
        //     "name": "gain",
        //     "min_gain_in_db": -18.0,
        //     "max_gain_in_db": 6,
        //     "p": 0.25 // probability of applying this method; 0 disables it
        // },
        {
            "name": "pitch_shift",
            "min_semitones": -4,
            "max_semitones": 4,
            "p": 0.25 // probability of applying this method; 0 disables it
        },
        {
            "name": "gaussian",
            "min_amplitude": 0.0001,
            "max_amplitude": 0.001,
            "p": 0.25 // probability of applying this method; 0 disables it
        }
    ],
    // PATHS
    "output_path": "../checkpoints/YourTTS2ASR/Wav2Vec-voxpopuli/one-speaker/just-TTS/PT/140-epoch-high-bs/",
    // CACHE
    "dataset_cache": "../datasets/",

    // DATASETS
    "datasets":{ 
        "files_path": "/raid/datasets/TTS-Portuguese-Corpus/", // Base path for the audio files; it will be joined with the file paths from the CSV. NOTE(review): the original comment was cut off after "CS" - presumably "CSV"; confirm.
        "train":
            [
               // These dicts are passed directly to load_dataset; see the documentation: https://huggingface.co/docs/datasets/package_reference/loading_methods.html#datasets.load_dataset
            {
                "name": "csv",
                "path": "csv",
                
                "data_files": ["/raid/datasets/TTS-Portuguese-Corpus/train_TTS-Portuguese_Corpus_metadata_converted_to_ASR.csv"], // csv files
                "text_column": "text",
                "path_column": "file_path"
            }
            ]
        ,
        "devel":
            [ 
                {
                    "name": "csv",
                    "path": "csv",
                    "data_files": ["/raid/datasets/TTS-Portuguese-Corpus/eval_TTS-Portuguese_Corpus_metadata_converted_to_ASR.csv"], // csv files
                    "text_column": "text",
                    "path_column": "file_path"
                }
            ] 
            ,
        "test":
            {
                "name": "csv",
                "path": "csv",
                "data_files": ["/raid/datasets/Common_Voice/cv-corpus-7.0-2021-07-21/pt/test_converted.csv"], // csv files
                "text_column": "text",
                "path_column": "file_path"
            }
    
    }//,
    // used only at test time
    // "KenLM":{
    // "kenlm_model_path": "../../kenLM/binaries/subtitle/4-gram/lm.binary", // Path to the KenLM model
    // "lexicon_path": "example/lexicon.lst", // file with all words, used to limit the decoder search
    // "beam": 2048, 
    // "nbest": 1,
    // "beam_threshold": 25, 
    // "lm_weight": 1, 
    // "word_score": -1,
    // "sil_weight": 0
    // }



}