Aki894 commited on
Commit
aa8b339
·
1 Parent(s): 3b6f20f

Upload 3 files

Browse files
Files changed (3) hide show
  1. best_model.pth +3 -0
  2. config.json +105 -0
  3. resample.py +48 -0
best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b67178273ee7ea1cdea4378a3ec9a6e0da93d75238ad641ddb4714e4ef46ea14
3
+ size 114064632
config.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "github_branch":"inside_docker",
3
+ "restore_path":"/opt/mueller/TTS/OutputsMozilla/checkpoints/speaker_encoder/mueller91-September-24-2020_09+13AM-debug/best_model.pth.tar",
4
+ "run_name": "mueller91",
5
+ "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ",
6
+ "audio":{
7
+ // Audio processing parameters
8
+ "num_mels": 80, // size of the mel spec frame.
9
+ "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
10
+ "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
11
+ "win_length": 1024, // stft window length in samples.
12
+ "hop_length": 256, // stft window hop-length in samples.
13
+ "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
14
+ "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
15
+ "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
16
+ "min_level_db": -100, // normalization range
17
+ "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
18
+ "power": 1.5, // value to sharpen wav signals after GL algorithm.
19
+ "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
20
+ // Normalization parameters
21
+ "signal_norm": true, // normalize the spec values in range [0, 1]
22
+ "symmetric_norm": true, // move normalization to range [-1, 1]
23
+ "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
24
+ "clip_norm": true, // clip normalized values into the range.
25
+ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
26
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
27
+ "do_trim_silence": true, // enable trimming of silence from audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
28
+ "trim_db": 60 // threshold for trimming silence. Set this according to your dataset.
29
+ },
30
+ "reinit_layers": [],
31
+ "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
32
+ "grad_clip": 3.0, // upper limit for gradients for clipping.
33
+ "epochs": 1000, // total number of epochs to train.
34
+ "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
35
+ "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
36
+ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
37
+ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
38
+ "steps_plot_stats": 10, // number of steps to plot embeddings.
39
+ "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
40
+ "voice_len": 2.0, // size of the voice
41
+ "num_utters_per_speaker": 10, //
42
+ "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
43
+ "wd": 0.000001, // Weight decay weight.
44
+ "checkpoint": true, // If true, it saves checkpoints per "save_step"
45
+ "save_step": 1000, // Number of training steps between saving training stats and checkpoints.
46
+ "print_step": 20, // Number of steps between training log lines on the console.
47
+ "output_path": "../../OutputsMozilla/checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
48
+ "model": {
49
+ "input_dim": 80,
50
+ "proj_dim": 256,
51
+ "lstm_dim": 768,
52
+ "num_lstm_layers": 3,
53
+ "use_lstm_with_projection": true
54
+ },
55
+ "storage": {
56
+ "sample_from_storage_p": 0.9, // the probability with which we'll sample from the DataSet in-memory storage
57
+ "storage_size": 25, // the size of the in-memory storage with respect to a single batch
58
+ "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
59
+ },
60
+ "datasets":
61
+ [
62
+ {
63
+ "name": "vctk_slim",
64
+ "path": "../../../audio-datasets/en/VCTK-Corpus/",
65
+ "meta_file_train": null,
66
+ "meta_file_val": null
67
+ },
68
+ {
69
+ "name": "libri_tts",
70
+ "path": "../../../audio-datasets/en/LibriTTS/train-clean-100",
71
+ "meta_file_train": null,
72
+ "meta_file_val": null
73
+ },
74
+ {
75
+ "name": "libri_tts",
76
+ "path": "../../../audio-datasets/en/LibriTTS/train-clean-360",
77
+ "meta_file_train": null,
78
+ "meta_file_val": null
79
+ },
80
+ {
81
+ "name": "libri_tts",
82
+ "path": "../../../audio-datasets/en/LibriTTS/train-other-500",
83
+ "meta_file_train": null,
84
+ "meta_file_val": null
85
+ },
86
+ {
87
+ "name": "voxceleb1",
88
+ "path": "../../../audio-datasets/en/voxceleb1/",
89
+ "meta_file_train": null,
90
+ "meta_file_val": null
91
+ },
92
+ {
93
+ "name": "voxceleb2",
94
+ "path": "../../../audio-datasets/en/voxceleb2/",
95
+ "meta_file_train": null,
96
+ "meta_file_val": null
97
+ },
98
+ {
99
+ "name": "common_voice",
100
+ "path": "../../../audio-datasets/en/MozillaCommonVoice",
101
+ "meta_file_train": "train.tsv",
102
+ "meta_file_val": "test.tsv"
103
+ }
104
+ ]
105
+ }
resample.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import librosa
4
+ import numpy as np
5
+ from multiprocessing import Pool, cpu_count
6
+ from scipy.io import wavfile
7
+ from tqdm import tqdm
8
+
9
+
10
def process(item):
    """Trim, resample, peak-normalise and save one speaker utterance.

    Args:
        item: A ``(spkdir, wav_name, args)`` tuple. ``spkdir`` is the speaker
            directory (only its last path component, the speaker name, is
            used), ``wav_name`` is the file name inside that directory, and
            ``args`` is the parsed command-line namespace providing
            ``in_dir``, ``out_dir2`` and ``sr2``.

    Returns:
        None. The result is written as a 16-bit PCM WAV file to
        ``<args.out_dir2>/<speaker>/<wav_name>``. Files that are missing, do
        not end in ``.wav``, or contain no samples are skipped.
    """
    spkdir, wav_name, args = item
    # NOTE(review): the original comment here said speakers 's5', 'p280' and
    # 'p315' are excluded, but no exclusion is implemented in this function.
    speaker = spkdir.replace("\\", "/").split("/")[-1]
    wav_path = os.path.join(args.in_dir, speaker, wav_name)

    # Check the file-name suffix rather than searching the whole path for
    # '.wav': a directory name containing '.wav' must not let other files in.
    if not wav_name.endswith(".wav") or not os.path.exists(wav_path):
        return

    # sr=None keeps the file's native sampling rate.
    wav, sr = librosa.load(wav_path, sr=None)
    if wav.size == 0:
        # An empty signal would make the peak computation below raise and
        # abort the whole pool run; skip it instead.
        print(f"skipping empty audio file: {wav_path}")
        return

    wav, _ = librosa.effects.trim(wav, top_db=20)
    if wav.size == 0:
        print(f"skipping audio file with no samples after trimming: {wav_path}")
        return

    peak = np.abs(wav).max()
    if peak > 1.0:
        wav = 0.98 * wav / peak

    wav2 = librosa.resample(wav, orig_sr=sr, target_sr=args.sr2)

    # Scale to full range. An all-zero signal has no peak to normalise by;
    # dividing would turn it into NaNs and write garbage samples.
    peak2 = np.abs(wav2).max() if wav2.size else 0.0
    if peak2 > 0:
        wav2 = wav2 / peak2

    os.makedirs(os.path.join(args.out_dir2, speaker), exist_ok=True)
    save_path2 = os.path.join(args.out_dir2, speaker, wav_name)
    wavfile.write(
        save_path2,
        args.sr2,
        (wav2 * np.iinfo(np.int16).max).astype(np.int16),
    )
31
+
32
+
33
+
34
if __name__ == "__main__":
    # Resample every speaker sub-directory of --in_dir into --out_dir2,
    # fanning the per-file work out over a process pool.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sr2", type=int, default=16000, help="sampling rate")
    parser.add_argument("--in_dir", type=str, default="./dataset_raw", help="path to source dir")
    parser.add_argument("--out_dir2", type=str, default="./data_svc/waves", help="path to target dir")
    args = parser.parse_args()

    # Leave two cores free on machines with more than four; otherwise use a
    # single worker.
    num_workers = cpu_count() - 2 if cpu_count() > 4 else 1

    # The context manager shuts the workers down when the block exits; each
    # imap_unordered call below is fully consumed before then.
    with Pool(processes=num_workers) as pool:
        for speaker in os.listdir(args.in_dir):
            spk_dir = os.path.join(args.in_dir, speaker)
            if not os.path.isdir(spk_dir):
                continue
            print(spk_dir)
            tasks = [
                (spk_dir, wav_name, args)
                for wav_name in os.listdir(spk_dir)
                if wav_name.endswith(".wav")
            ]
            # Passing total lets tqdm show a real progress bar instead of a
            # bare counter.
            for _ in tqdm(pool.imap_unordered(process, tasks), total=len(tasks)):
                pass