updates to scripts
Browse files
api.py
CHANGED
@@ -202,7 +202,7 @@ class TextToSpeech:
|
|
202 |
'ultra_fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 16, 'cond_free': False},
|
203 |
'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
|
204 |
'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
|
205 |
-
'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations':
|
206 |
}
|
207 |
kwargs.update(presets[preset])
|
208 |
return self.tts(text, voice_samples, **kwargs)
|
|
|
202 |
'ultra_fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 16, 'cond_free': False},
|
203 |
'fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 32},
|
204 |
'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 128},
|
205 |
+
'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
|
206 |
}
|
207 |
kwargs.update(presets[preset])
|
208 |
return self.tts(text, voice_samples, **kwargs)
|
do_tts.py
CHANGED
@@ -11,6 +11,10 @@ if __name__ == '__main__':
|
|
11 |
parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
|
12 |
parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
|
13 |
'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
|
|
|
|
|
|
|
|
|
14 |
parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
|
15 |
args = parser.parse_args()
|
16 |
os.makedirs(args.output_path, exist_ok=True)
|
@@ -25,6 +29,6 @@ if __name__ == '__main__':
|
|
25 |
for cond_path in cond_paths:
|
26 |
c = load_audio(cond_path, 22050)
|
27 |
conds.append(c)
|
28 |
-
gen = tts.tts_with_preset(args.text, conds, preset='standard')
|
29 |
torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
|
30 |
|
|
|
11 |
parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
|
12 |
parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
|
13 |
'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
|
14 |
+
parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
|
15 |
+
parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
|
16 |
+
help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility',
|
17 |
+
default=.5)
|
18 |
parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
|
19 |
args = parser.parse_args()
|
20 |
os.makedirs(args.output_path, exist_ok=True)
|
|
|
29 |
for cond_path in cond_paths:
|
30 |
c = load_audio(cond_path, 22050)
|
31 |
conds.append(c)
|
32 |
+
gen = tts.tts_with_preset(args.text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
|
33 |
torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
|
34 |
|
read.py
CHANGED
@@ -28,11 +28,14 @@ def split_and_recombine_text(texts, desired_length=200, max_len=300):
|
|
28 |
|
29 |
if __name__ == '__main__':
|
30 |
parser = argparse.ArgumentParser()
|
31 |
-
parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/
|
32 |
parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
|
33 |
'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
|
34 |
parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
|
35 |
-
parser.add_argument('--generation_preset', type=str, help='Preset to use for generation.', default='standard')
|
|
|
|
|
|
|
36 |
args = parser.parse_args()
|
37 |
|
38 |
outpath = args.output_path
|
@@ -60,16 +63,11 @@ if __name__ == '__main__':
|
|
60 |
if not cond_paths:
|
61 |
print('Error: no valid voices specified. Try again.')
|
62 |
|
63 |
-
|
|
|
|
|
|
|
64 |
for j, text in enumerate(texts):
|
65 |
-
conds = []
|
66 |
-
for cond_path in cond_paths:
|
67 |
-
c = load_audio(cond_path, 22050)
|
68 |
-
conds.append(c)
|
69 |
-
gen = tts.tts_with_preset(text, conds, preset=args.generation_preset)
|
70 |
torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen.squeeze(0).cpu(), 24000)
|
71 |
|
72 |
-
priors.append(torchaudio.functional.resample(gen, 24000, 22050).squeeze(0))
|
73 |
-
while len(priors) > 2:
|
74 |
-
priors.pop(0)
|
75 |
-
|
|
|
28 |
|
29 |
if __name__ == '__main__':
|
30 |
parser = argparse.ArgumentParser()
|
31 |
+
parser.add_argument('--textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
|
32 |
parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
|
33 |
'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
|
34 |
parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
|
35 |
+
parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
|
36 |
+
parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
|
37 |
+
help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility',
|
38 |
+
default=.5)
|
39 |
args = parser.parse_args()
|
40 |
|
41 |
outpath = args.output_path
|
|
|
63 |
if not cond_paths:
|
64 |
print('Error: no valid voices specified. Try again.')
|
65 |
|
66 |
+
conds = []
|
67 |
+
for cond_path in cond_paths:
|
68 |
+
c = load_audio(cond_path, 22050)
|
69 |
+
conds.append(c)
|
70 |
for j, text in enumerate(texts):
|
71 |
+
gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
|
|
|
|
|
|
|
|
|
72 |
torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen.squeeze(0).cpu(), 24000)
|
73 |
|
|
|
|
|
|
|
|