// Extracted from revision 0d80816 (presumably a commit hash); original file size: 1,367 bytes.
// VITS experiment configuration for the "hifitts" and "libritts" datasets.
// NOTE(review): settings not listed here are presumably taken from "base_config" — confirm against the config loader.
// This file uses // comments, so it must be read by a comment-aware JSON parser (e.g. JSON5/JSONC).
{
  "base_config": "config/vits.json",
  "model_type": "VITS",
  "dataset": [
    "hifitts",
    "libritts"
  ],
  // One entry per name listed in "dataset" above.
  "dataset_path": {
    // TODO: Fill in your dataset path
    "hifitts": "/mnt/workspace/xueliumeng/data/hifitts/hi_fi_tts_v0",
    "libritts": "/mnt/workspace/xueliumeng/data/libritts/raw/LibriTTS"
  },
  // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
  "log_dir": "/mnt/workspace/xueliumeng/data/vits_on_libritts_hifitts/logs",
  "preprocess": {
    "extract_audio": true,
    "use_phone": true,
    // linguistic features
    "extract_phone": true,
    "phone_extractor": "espeak", // one of: espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)
    // TODO: Fill in the output data path. The default value is "Amphion/data"
    "processed_dir": "/mnt/workspace/xueliumeng/data/vits_on_libritts_hifitts/processed_data",
    "sample_rate": 24000,
    "train_file": "train_all.json",
    "valid_file": "valid.json", // validation set
    "use_spkid": true // true: use speaker IDs for a multi-speaker dataset
  },
  "train": {
    "batch_size": 16,
    "multi_speaker_training": true, // true: train a multi-speaker model; false: train a single-speaker model
    "n_speakers": 2500 // number of speakers; will be set automatically if n_speakers is 0 and multi_speaker_training is true
  }
}