jbetker committed
Commit 2a5166d
1 Parent(s): 073b27c

add regenerate option

Files changed (2)
  1. README.md +8 -0
  2. read.py +7 -0
README.md CHANGED
@@ -44,10 +44,18 @@ python do_tts.py --text "I'm going to speak this" --voice dotrice --preset fast
 ### read.py

 This script provides tools for reading large amounts of text.
+
 ```shell
 python read.py --textfile <your text to be read> --voice dotrice
 ```

+ This will break up the textfile into sentences, and then convert them to speech one at a time. It will output a series
+ of spoken clips as they are generated. Once all the clips are generated, it will combine them into a single file and
+ output that as well.
+
+ Sometimes Tortoise screws up an output. You can re-generate any bad clips by re-running `read.py` with the --regenerate
+ argument.
+
 ### API

 Tortoise can be used programmatically, like so:
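
Re-generating only the clips that came out badly is then a matter of re-running the same command with the new flag. A sketch, assuming clips 3 and 7 were the bad ones (the clip numbers here are illustrative):

```shell
python read.py --textfile <your text to be read> --voice dotrice --regenerate 3,7
```

The clip numbers correspond to the numbered `.wav` files that read.py writes to the output directory, so bad clips can be identified by their filenames.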
read.py CHANGED
@@ -35,6 +35,7 @@ if __name__ == '__main__':
                         'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard')
+    parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None)
     parser.add_argument('--voice_diversity_intelligibility_slider', type=float,
                         help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 0 means highly diverse voice (not recommended), 1 means maximize intellibility',
                         default=.5)
@@ -43,6 +44,9 @@ if __name__ == '__main__':
     outpath = args.output_path
     voices = get_voices()
     selected_voices = args.voice.split(',')
+    regenerate = args.regenerate
+    if regenerate is not None:
+        regenerate = [int(e) for e in regenerate.split(',')]
     for selected_voice in selected_voices:
         voice_outpath = os.path.join(outpath, selected_voice)
         os.makedirs(voice_outpath, exist_ok=True)
@@ -71,6 +75,9 @@ if __name__ == '__main__':
             conds.append(c)
         all_parts = []
         for j, text in enumerate(texts):
+            if regenerate is not None and j not in regenerate:
+                all_parts.append(load_audio(os.path.join(voice_outpath, f'{j}.wav'), 24000))
+                continue
             gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
             gen = gen.squeeze(0).cpu()
             torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen, 24000)
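
The read.py change amounts to parsing the comma-separated clip list once, then skipping synthesis for any clip not in the list and loading its previously generated .wav instead, so the final combined file still contains every clip. A minimal, self-contained sketch of that skip logic (the helper names below are illustrative, not functions in read.py):

```python
# Sketch of the --regenerate skip logic added in this commit.
# parse_regenerate / should_synthesize are illustrative helpers, not read.py functions.

def parse_regenerate(arg):
    """Turn a comma-separated string like '3,7' into a list of ints, or None if unset."""
    return None if arg is None else [int(e) for e in arg.split(',')]

def should_synthesize(j, regenerate):
    """Re-synthesize clip j only when no list was given, or when j is listed."""
    return regenerate is None or j in regenerate

if __name__ == '__main__':
    assert parse_regenerate(None) is None
    assert parse_regenerate('3,7') == [3, 7]
    assert should_synthesize(2, None)          # no list: everything is (re)generated
    assert not should_synthesize(2, [3, 7])    # not listed: reuse the existing 2.wav
    assert should_synthesize(3, [3, 7])        # listed: re-generate 3.wav
```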