ruslanmv committed
Commit 48f2c44
Parent: eb527fd

Update synthesizer/inference.py

Files changed (1):
  synthesizer/inference.py +166 -165
synthesizer/inference.py CHANGED
@@ -1,165 +1,166 @@
 import torch
 from synthesizer import audio
 from synthesizer.hparams import hparams
 from synthesizer.models.tacotron import Tacotron
 from synthesizer.utils.symbols import symbols
 from synthesizer.utils.text import text_to_sequence
 from vocoder.display import simple_table
 from pathlib import Path
 from typing import Union, List
 import numpy as np
 import librosa


 class Synthesizer:
     sample_rate = hparams.sample_rate
     hparams = hparams

     def __init__(self, model_fpath: Path, verbose=True):
         """
         The model isn't instantiated and loaded in memory until needed or until load() is called.

         :param model_fpath: path to the trained model file
         :param verbose: if False, prints less information when using the model
         """
         self.model_fpath = model_fpath
         self.verbose = verbose

         # Check for GPU
         if torch.cuda.is_available():
             self.device = torch.device("cuda")
         else:
             self.device = torch.device("cpu")
         if self.verbose:
             print("Synthesizer using device:", self.device)

         # Tacotron model will be instantiated later on first use.
         self._model = None

     def is_loaded(self):
         """
         Whether the model is loaded in memory.
         """
         return self._model is not None

     def load(self):
         """
         Instantiates and loads the model given the weights file that was passed in the constructor.
         """
         self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
                                num_chars=len(symbols),
                                encoder_dims=hparams.tts_encoder_dims,
                                decoder_dims=hparams.tts_decoder_dims,
                                n_mels=hparams.num_mels,
                                fft_bins=hparams.num_mels,
                                postnet_dims=hparams.tts_postnet_dims,
                                encoder_K=hparams.tts_encoder_K,
                                lstm_dims=hparams.tts_lstm_dims,
                                postnet_K=hparams.tts_postnet_K,
                                num_highways=hparams.tts_num_highways,
                                dropout=hparams.tts_dropout,
                                stop_threshold=hparams.tts_stop_threshold,
                                speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)

         self._model.load(self.model_fpath)
         self._model.eval()

         if self.verbose:
             print("Loaded synthesizer \"%s\" trained to step %d" % (self.model_fpath.name, self._model.state_dict()["step"]))

     def synthesize_spectrograms(self, texts: List[str],
                                 embeddings: Union[np.ndarray, List[np.ndarray]],
                                 return_alignments=False):
         """
         Synthesizes mel spectrograms from texts and speaker embeddings.

         :param texts: a list of N text prompts to be synthesized
         :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
         :param return_alignments: if True, a matrix representing the alignments between the
         characters and each decoder output step will be returned for each spectrogram
         :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
         sequence length of spectrogram i, and possibly the alignments.
         """
         # Load the model on the first request.
         if not self.is_loaded():
             self.load()

         # Preprocess text inputs
         inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
         if not isinstance(embeddings, list):
             embeddings = [embeddings]

         # Batch inputs
         batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
                           for i in range(0, len(inputs), hparams.synthesis_batch_size)]
         batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
                           for i in range(0, len(embeddings), hparams.synthesis_batch_size)]

         specs = []
         for i, batch in enumerate(batched_inputs, 1):
             if self.verbose:
                 print(f"\n| Generating {i}/{len(batched_inputs)}")

             # Pad texts so they are all the same length
             text_lens = [len(text) for text in batch]
             max_text_len = max(text_lens)
             chars = [pad1d(text, max_text_len) for text in batch]
             chars = np.stack(chars)

             # Stack speaker embeddings into 2D array for batch processing
             speaker_embeds = np.stack(batched_embeds[i-1])

             # Convert to tensor
             chars = torch.tensor(chars).long().to(self.device)
             speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

             # Inference
             _, mels, alignments = self._model.generate(chars, speaker_embeddings)
             mels = mels.detach().cpu().numpy()
             for m in mels:
                 # Trim silence from end of each spectrogram
                 while np.max(m[:, -1]) < hparams.tts_stop_threshold:
                     m = m[:, :-1]
                 specs.append(m)

         if self.verbose:
             print("\n\nDone.\n")
         return (specs, alignments) if return_alignments else specs

     @staticmethod
     def load_preprocess_wav(fpath):
         """
         Loads and preprocesses an audio file under the same conditions the audio files were used to
         train the synthesizer.
         """
+        print("Loading fpath and hparams.sample_rate :",str(fpath), hparams.sample_rate)
         wav = librosa.load(str(fpath), hparams.sample_rate)[0]
         if hparams.rescale:
             wav = wav / np.abs(wav).max() * hparams.rescaling_max
         return wav

     @staticmethod
     def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
         """
         Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
         were fed to the synthesizer when training.
         """
         if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
             wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
         else:
             wav = fpath_or_wav

         mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
         return mel_spectrogram

     @staticmethod
     def griffin_lim(mel):
         """
         Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
         with the same parameters present in hparams.py.
         """
         return audio.inv_mel_spectrogram(mel, hparams)


 def pad1d(x, max_len, pad_value=0):
     return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
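The only functional change in this commit is the debug print added at the top of load_preprocess_wav, which logs the file path and hparams.sample_rate just before librosa.load is called. One caveat around that call: since librosa 0.10, every argument of librosa.load after the path is keyword-only, so the positional call librosa.load(str(fpath), hparams.sample_rate) raises a TypeError on recent librosa versions, and that may well be what the new print is chasing. A minimal sketch of the keyword form, assuming a recent librosa is installed (this is a suggestion, not part of the commit):

    # sr is keyword-only in librosa >= 0.10; positional sr raises TypeError there
    wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]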
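For orientation, here is a minimal, hypothetical driver for the class this file defines. The checkpoint path and the speaker embedding below are placeholders (a real embedding would come from this repo's speaker encoder); only Synthesizer, synthesize_spectrograms, and griffin_lim are taken from the file above, and this sketch is not part of the commit.

    import numpy as np
    from pathlib import Path
    from synthesizer.inference import Synthesizer

    # Hypothetical checkpoint path; point this at real trained weights.
    synthesizer = Synthesizer(Path("synthesizer/saved_models/pretrained/pretrained.pt"))

    # Placeholder speaker embedding of shape (256,), matching the (N, 256)
    # shape documented in synthesize_spectrograms; a real one comes from the
    # encoder module, so it is L2-normalized here for the same reason.
    embed = np.random.rand(256).astype(np.float32)
    embed /= np.linalg.norm(embed)

    # One text prompt and one embedding -> a list with one (80, M) mel spectrogram.
    specs = synthesizer.synthesize_spectrograms(["Hello world."], [embed])

    # Rough audio preview via Griffin-Lim; a neural vocoder would sound better.
    wav = Synthesizer.griffin_lim(specs[0])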