keithhon committed on
Commit
761be36
·
1 Parent(s): 7261cce

Upload synthesizer_preprocess_audio.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. synthesizer_preprocess_audio.py +59 -0
synthesizer_preprocess_audio.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from synthesizer.preprocess import preprocess_dataset
2
+ from synthesizer.hparams import hparams
3
+ from utils.argutils import print_args
4
+ from pathlib import Path
5
+ import argparse
6
+
7
+
8
if __name__ == "__main__":
    # Command-line entry point: parse the options, validate them, then forward the
    # whole argument namespace to preprocess_dataset() as keyword arguments.
    parser = argparse.ArgumentParser(
        description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
                    "and writes them to the disk. Audio files are also saved, to be used by the "
                    "vocoder for training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "datasets_root", type=Path,
        help="Path to the directory containing your LibriSpeech/TTS datasets.")
    # default=SUPPRESS leaves the attribute unset when the flag is omitted, so the
    # real default can be derived from datasets_root after parsing (see below).
    parser.add_argument(
        "-o", "--out_dir", type=Path, default=argparse.SUPPRESS,
        help="Path to the output directory that will contain the mel spectrograms, the audios "
             "and the embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
    parser.add_argument(
        "-n", "--n_processes", type=int, default=None,
        help="Number of processes in parallel.")
    parser.add_argument(
        "-s", "--skip_existing", action="store_true",
        help="Whether to skip existing files with the same name. Useful if the preprocessing "
             "was interrupted.")
    parser.add_argument(
        "--hparams", type=str, default="",
        help="Hyperparameter overrides as a comma-separated list of name-value pairs")
    parser.add_argument(
        "--no_trim", action="store_true",
        help="Preprocess audio without trimming silences (not recommended).")
    parser.add_argument(
        "--no_alignments", action="store_true",
        help="Use this option when dataset does not include alignments "
             "(these are used to split long audio files into sub-utterances.)")
    parser.add_argument(
        "--datasets_name", type=str, default="LibriSpeech",
        help="Name of the dataset directory to process.")
    parser.add_argument(
        "--subfolders", type=str, default="train-clean-100, train-clean-360",
        help="Comma-separated list of subfolders to process inside your dataset directory")
    args = parser.parse_args()

    # Process the arguments: fill in the output directory when -o/--out_dir was not given.
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

    # Validate the input directory explicitly. An assert would be stripped under
    # `python -O` and would not tell the user what is wrong.
    if not args.datasets_root.exists():
        parser.error("datasets_root does not exist: %s" % args.datasets_root)

    # Create directories
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # Verify webrtcvad is available (only needed when silences are trimmed).
    if not args.no_trim:
        try:
            import webrtcvad  # noqa: F401 -- imported only to check availability
        except ImportError as err:
            # Only a failed import means "not installed"; any other error raised while
            # importing the package propagates unchanged instead of being mislabelled.
            raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables "
                "noise removal and is recommended. Please install and try again. If installation fails, "
                "use --no_trim to disable this error message.") from err
    # no_trim is removed on both paths so that it is not forwarded through **vars(args)
    # below (presumably preprocess_dataset does not accept it -- verify against its signature).
    del args.no_trim

    # Preprocess the dataset
    print_args(args, parser)
    # Replace the raw override string with the value returned by hparams.parse().
    args.hparams = hparams.parse(args.hparams)
    preprocess_dataset(**vars(args))