Ahsen Khaliq committed
Commit a32e829
1 Parent(s): 0e4103d

Update demo_cli.py

Files changed (1)
demo_cli.py +37 -37
demo_cli.py CHANGED
@@ -82,45 +82,45 @@ if __name__ == '__main__':
 
 
     ## Run a test
-    print("Testing your configuration with small inputs.")
-    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
-    # sampling rate, which may differ.
-    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
-    # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
-    # The sampling rate is the number of values (samples) recorded per second, it is set to
-    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
-    # to an audio of 1 second.
-    print(" Testing the encoder...")
-    encoder.embed_utterance(np.zeros(encoder.sampling_rate))
+    # print("Testing your configuration with small inputs.")
+    # # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
+    # # sampling rate, which may differ.
+    # # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
+    # # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
+    # # The sampling rate is the number of values (samples) recorded per second, it is set to
+    # # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
+    # # to an audio of 1 second.
+    # print(" Testing the encoder...")
+    # encoder.embed_utterance(np.zeros(encoder.sampling_rate))
 
-    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
-    # returns, but here we're going to make one ourselves just for the sake of showing that it's
-    # possible.
-    embed = np.random.rand(speaker_embedding_size)
-    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
-    # embeddings it will be).
-    embed /= np.linalg.norm(embed)
-    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
-    # illustrate that
-    embeds = [embed, np.zeros(speaker_embedding_size)]
-    texts = ["test 1", "test 2"]
-    print(" Testing the synthesizer... (loading the model will output a lot of text)")
-    mels = synthesizer.synthesize_spectrograms(texts, embeds)
+    # # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
+    # # returns, but here we're going to make one ourselves just for the sake of showing that it's
+    # # possible.
+    # embed = np.random.rand(speaker_embedding_size)
+    # # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
+    # # embeddings it will be).
+    # embed /= np.linalg.norm(embed)
+    # # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
+    # # illustrate that
+    # embeds = [embed, np.zeros(speaker_embedding_size)]
+    # texts = ["test 1", "test 2"]
+    # print(" Testing the synthesizer... (loading the model will output a lot of text)")
+    # mels = synthesizer.synthesize_spectrograms(texts, embeds)
 
-    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
-    # can concatenate the mel spectrograms to a single one.
-    mel = np.concatenate(mels, axis=1)
-    # The vocoder can take a callback function to display the generation. More on that later. For
-    # now we'll simply hide it like this:
-    no_action = lambda *args: None
-    print(" Testing the vocoder...")
-    # For the sake of making this test short, we'll pass a short target length. The target length
-    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
-    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
-    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
-    # that has a detrimental effect on the quality of the audio. The default parameters are
-    # recommended in general.
-    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
+    # # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
+    # # can concatenate the mel spectrograms to a single one.
+    # mel = np.concatenate(mels, axis=1)
+    # # The vocoder can take a callback function to display the generation. More on that later. For
+    # # now we'll simply hide it like this:
+    # no_action = lambda *args: None
+    # print(" Testing the vocoder...")
+    # # For the sake of making this test short, we'll pass a short target length. The target length
+    # # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
+    # # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
+    # # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
+    # # that has a detrimental effect on the quality of the audio. The default parameters are
+    # # recommended in general.
+    # vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
 
     print("All test passed! You can now synthesize speech.\n\n")
 
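
The last removed comment explains the vocoder's target length: at 16000 Hz, a target of 8000 samples means half-second segments are generated in parallel, while the test's target=200 gives 12.5 ms segments and audibly degrades the output. A minimal sketch of that arithmetic (illustrative only; the actual chunking happens inside vocoder.infer_waveform):

# Illustrative arithmetic only -- the real batching lives inside vocoder.infer_waveform.
def chunk_seconds(target, sample_rate=16000):
    """Duration of each parallel-generated segment, in seconds."""
    return target / sample_rate

print(chunk_seconds(8000))  # 0.5    -> the half-second chunks the comment recommends
print(chunk_seconds(200))   # 0.0125 -> the test's very short chunks, hence the poor audio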
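
The commented-out self-test exercises the same three model calls the rest of demo_cli.py relies on for real synthesis. Below is a minimal usage sketch assembled only from the calls visible in this diff; it assumes encoder, synthesizer and vocoder are the objects already loaded earlier in demo_cli.py, and the librosa/soundfile loading and the 16 kHz output rate are assumptions for illustration, not code from this file:

# Hypothetical sketch -- not the actual demo_cli.py loop.
# encoder, synthesizer and vocoder are assumed to be loaded earlier in demo_cli.py.
import numpy as np
import librosa            # assumed available here for loading the reference audio
import soundfile as sf    # assumed available here for writing the result

# Load a reference utterance at the encoder's expected sampling rate.
ref_wav, _ = librosa.load("reference.wav", sr=encoder.sampling_rate)

# Speaker embedding for the reference voice (L2-normalized by the encoder).
embed = encoder.embed_utterance(ref_wav)

# Batched text-to-mel synthesis: one spectrogram per (text, embedding) pair.
mels = synthesizer.synthesize_spectrograms(["Hello world."], [embed])

# Concatenate along the time axis and vocode with the default target/overlap.
mel = np.concatenate(mels, axis=1)
wav = vocoder.infer_waveform(mel)

sf.write("output.wav", wav, 16000)  # output sample rate assumed to be 16 kHz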