# NOTE: the import order below is kept as in the original file; some of these packages
# (matplotlib, PyQt5) may have import-time side effects, so they are not regrouped.
import matplotlib.pyplot as plt
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
from PyQt5.QtCore import Qt, QStringListModel
from PyQt5.QtWidgets import *
from encoder.inference import plot_embedding_as_heatmap
from toolbox.utterance import Utterance
from pathlib import Path
from typing import List, Set
import sounddevice as sd
import soundfile as sf
import numpy as np
# from sklearn.manifold import TSNE         # You can try with TSNE if you like, I prefer UMAP
from time import sleep
import umap
import sys
from warnings import filterwarnings, warn

# Silence every warning raised from here on (this also mutes the `warn` calls made below).
filterwarnings("ignore")


# One RGB color per row, scaled from 0-255 down to the 0-1 range.
# The builtin `float` is used as dtype: the `np.float` alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, where it raises AttributeError. Both designate float64.
colormap = np.array([
    [0, 127, 70],
    [255, 0, 0],
    [255, 217, 38],
    [0, 135, 255],
    [165, 0, 165],
    [255, 167, 255],
    [97, 142, 151],
    [0, 255, 255],
    [255, 96, 38],
    [142, 76, 0],
    [33, 0, 127],
    [0, 0, 0],
    [183, 183, 183],
    [76, 255, 0],
], dtype=float) / 255

# Text placed in the prompt editor when the UI is built.
default_text = (
    "Welcome to the toolbox! To begin, load an utterance from your datasets or record one "
    "yourself.\nOnce its embedding has been created, you can synthesize any text written here.\n"
    "The synthesizer expects to generate "
    "outputs that are somewhere between 5 and 12 seconds.\nTo mark breaks, write a new line. "
    "Each line will be treated separately.\nThen, they are joined together to make the final "
    "spectrogram. Use the vocoder to generate audio.\nThe vocoder generates almost in constant "
    "time, so it will be more time efficient for longer inputs like this one.\nOn the left you "
    "have the embedding projections. Load or record more utterances to see them.\nIf you have "
    "at least 2 or 3 utterances from a same speaker, a cluster should form.\nSynthesized "
    "utterances are of the same color as the speaker whose voice was used, but they're "
    "represented with a cross."
)
class UI(QDialog):
    """
    Main dialog of the toolbox.

    Builds every widget in `__init__` and exposes helpers to draw embeddings, spectrograms and
    UMAP projections, to play/record audio through `sounddevice`, and to fill the dataset,
    speaker, utterance and model combo boxes.
    """
    # Minimum number of utterances needed before the projections are computed
    min_umap_points = 4
    # Number of lines kept in the log window
    max_log_lines = 5
    # Number of entries kept in the utterance history combo box
    max_saved_utterances = 20

    def draw_utterance(self, utterance: Utterance, which):
        """
        Draws both the spectrogram and the embedding of an utterance.

        :param utterance: object providing the `spec`, `embed` and `name` attributes
        :param which: "current" to draw on the top row of plots, anything else for the
        generated row
        """
        self.draw_spec(utterance.spec, which)
        self.draw_embed(utterance.embed, utterance.name, which)

    def draw_embed(self, embed, name, which):
        """
        Draws an embedding as a heatmap, or clears the plot if <embed> is None.

        :param embed: the embedding to draw, or None to only clear the plot
        :param name: title of the figure. Ignored when <embed> is None
        :param which: "current" to draw on the top row of plots, anything else for the
        generated row
        """
        embed_ax, _ = self.current_ax if which == "current" else self.gen_ax
        embed_ax.figure.suptitle("" if embed is None else name)

        ## Embedding
        # Clear the plot. The colorbar of a previous heatmap is not removed by clear(), so it
        # is removed explicitly first.
        if len(embed_ax.images) > 0:
            embed_ax.images[0].colorbar.remove()
        embed_ax.clear()

        # Draw the embed
        if embed is not None:
            plot_embedding_as_heatmap(embed, embed_ax)
            embed_ax.set_title("embedding")
        embed_ax.set_aspect("equal", "datalim")
        embed_ax.set_xticks([])
        embed_ax.set_yticks([])
        embed_ax.figure.canvas.draw()

    def draw_spec(self, spec, which):
        """
        Draws a mel spectrogram, or clears the plot if <spec> is None. When drawing on the
        generated row, the "Vocode only" button is enabled only if there is a spectrogram.

        :param spec: the spectrogram to draw as an image, or None to only clear the plot
        :param which: "current" to draw on the top row of plots, anything else for the
        generated row
        """
        _, spec_ax = self.current_ax if which == "current" else self.gen_ax

        ## Spectrogram
        # Draw the spectrogram
        spec_ax.clear()
        if spec is not None:
            spec_ax.imshow(spec, aspect="auto", interpolation="none")
            spec_ax.set_title("mel spectrogram")

        spec_ax.set_xticks([])
        spec_ax.set_yticks([])
        spec_ax.figure.canvas.draw()
        if which != "current":
            self.vocode_button.setDisabled(spec is None)

    def draw_umap_projections(self, utterances: Set[Utterance]):
        """
        Redraws the 2D projections of the embeddings of the given utterances. If there are
        fewer than `min_umap_points` utterances, a message is displayed instead.

        :param utterances: the utterances to project. Each must provide the `speaker_name`,
        `name` and `embed` attributes
        """
        self.umap_ax.clear()

        speakers = np.unique([u.speaker_name for u in utterances])
        # Colors are reused cyclically when there are more speakers than rows in the colormap,
        # rather than failing with an IndexError.
        colors = {speaker_name: colormap[i % len(colormap)]
                  for i, speaker_name in enumerate(speakers)}
        embeds = [u.embed for u in utterances]

        # Display a message if there aren't enough points
        if len(utterances) < self.min_umap_points:
            self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" %
                              (self.min_umap_points - len(utterances)),
                              horizontalalignment='center', fontsize=15)
            self.umap_ax.set_title("")

        # Compute the projections
        else:
            if not self.umap_hot:
                self.log(
                    "Drawing UMAP projections for the first time, this will take a few seconds.")
                self.umap_hot = True

            # The first positional argument of UMAP is the number of neighbors
            reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embeds)))), metric="cosine")
            projections = reducer.fit_transform(embeds)

            # Only the first point of each speaker gets a label, so that the legend has a
            # single entry per speaker.
            speakers_done = set()
            for projection, utterance in zip(projections, utterances):
                color = colors[utterance.speaker_name]
                mark = "x" if "_gen_" in utterance.name else "o"
                label = None if utterance.speaker_name in speakers_done else utterance.speaker_name
                speakers_done.add(utterance.speaker_name)
                self.umap_ax.scatter(projection[0], projection[1], c=[color], marker=mark,
                                     label=label)
            self.umap_ax.legend(prop={'size': 10})

        # Draw the plot
        self.umap_ax.set_aspect("equal", "datalim")
        self.umap_ax.set_xticks([])
        self.umap_ax.set_yticks([])
        self.umap_ax.figure.canvas.draw()

    def save_audio_file(self, wav, sample_rate):
        """
        Asks the user for a path and writes the waveform to it with `soundfile`. Nothing is
        written if the dialog returns an empty path.

        :param wav: the audio data to write
        :param sample_rate: the sample rate passed to `soundfile.write`
        """
        dialog = QFileDialog()
        dialog.setDefaultSuffix(".wav")
        fpath, _ = dialog.getSaveFileName(
            parent=self,
            caption="Select a path to save the audio file",
            filter="Audio Files (*.flac *.wav)"
        )
        if fpath:
            # Default format is wav
            if Path(fpath).suffix == "":
                fpath += ".wav"
            sf.write(fpath, wav, sample_rate)

    def setup_audio_devices(self, sample_rate):
        """
        Finds the audio devices that accept the given sample rate, selects the first valid
        input device for recording and fills the audio output combo box with the valid
        output devices.

        :param sample_rate: the sample rate the devices are checked against
        """
        input_devices = []
        output_devices = []
        for device in sd.query_devices():
            # Check if valid input
            try:
                sd.check_input_settings(device=device["name"], samplerate=sample_rate)
                input_devices.append(device["name"])
            except Exception:
                # The device is simply not usable as an input, this is not an error
                pass

            # Check if valid output
            try:
                sd.check_output_settings(device=device["name"], samplerate=sample_rate)
                output_devices.append(device["name"])
            except Exception as e:
                # Log a warning only if the device is not an input
                if device["name"] not in input_devices:
                    warn("Unsupported output device %s for the sample rate: %d \nError: %s" %
                         (device["name"], sample_rate, str(e)))

        if len(input_devices) == 0:
            self.log("No audio input device detected. Recording may not work.")
            self.audio_in_device = None
        else:
            self.audio_in_device = input_devices[0]

        if len(output_devices) == 0:
            self.log("No supported output audio devices were found! Audio output may not work.")
            # Clear first so that the placeholder entry is not duplicated on repeated calls
            self.audio_out_devices_cb.clear()
            self.audio_out_devices_cb.addItems(["None"])
            self.audio_out_devices_cb.setDisabled(True)
        else:
            self.audio_out_devices_cb.clear()
            self.audio_out_devices_cb.addItems(output_devices)
            self.audio_out_devices_cb.currentTextChanged.connect(self.set_audio_device)

        self.set_audio_device()

    def set_audio_device(self):
        """
        Sets the default `sounddevice` devices to the selected input device and the output
        device currently selected in the audio output combo box.
        """
        output_device = self.audio_out_devices_cb.currentText()
        if output_device == "None":
            output_device = None

        # If None, sounddevice queries portaudio
        sd.default.device = (self.audio_in_device, output_device)

    def play(self, wav, sample_rate):
        """
        Stops any ongoing playback and plays the waveform. Playback errors are printed and
        logged rather than raised.
        """
        try:
            sd.stop()
            sd.play(wav, sample_rate)
        except Exception as e:
            print(e)
            self.log("Error in audio playback. Try selecting a different audio output device.")
            self.log("Your device must be connected before you start the toolbox.")

    def stop(self):
        """
        Stops any ongoing playback or recording.
        """
        sd.stop()

    def record_one(self, sample_rate, duration):
        """
        Records <duration> seconds of single channel audio while updating the loading bar.
        The record button is disabled for the duration of the recording and is always
        restored afterwards, including when the recording could not be started.

        :param sample_rate: the sample rate of the recording
        :param duration: the duration of the recording, in seconds
        :return: the recorded audio with the dimensions of size 1 squeezed out, or None if the
        recording could not be started
        """
        self.record_button.setText("Recording...")
        self.record_button.setDisabled(True)

        try:
            self.log("Recording %d seconds of audio" % duration)
            sd.stop()
            try:
                wav = sd.rec(duration * sample_rate, sample_rate, 1)
            except Exception as e:
                print(e)
                self.log("Could not record anything. Is your recording device enabled?")
                self.log("Your device must be connected before you start the toolbox.")
                return None

            # The recording runs in the background: poll every 100ms to animate the bar
            for i in np.arange(0, duration, 0.1):
                self.set_loading(i, duration)
                sleep(0.1)
            self.set_loading(duration, duration)
            sd.wait()

            self.log("Done recording.")
            return wav.squeeze()
        finally:
            # Without this, a failed recording would leave the button disabled for good
            self.record_button.setText("Record")
            self.record_button.setDisabled(False)

    @property
    def current_dataset_name(self):
        """
        Text currently selected in the dataset combo box.
        """
        return self.dataset_box.currentText()

    @property
    def current_speaker_name(self):
        """
        Text currently selected in the speaker combo box.
        """
        return self.speaker_box.currentText()

    @property
    def current_utterance_name(self):
        """
        Text currently selected in the utterance combo box.
        """
        return self.utterance_box.currentText()

    def browse_file(self):
        """
        Asks the user to select an audio file.

        :return: the selected file as a Path, or an empty string if the dialog returned no path
        """
        fpath = QFileDialog().getOpenFileName(
            parent=self,
            caption="Select an audio file",
            filter="Audio Files (*.mp3 *.flac *.wav *.m4a)"
        )
        return Path(fpath[0]) if fpath[0] != "" else ""

    @staticmethod
    def repopulate_box(box, items, random=False):
        """
        Resets a box and adds a list of items. Pass a list of (item, data) pairs instead to join
        data to the items. Signals of the box are blocked while it is being modified, and the
        box is disabled if there are no items.

        :param box: the combo box to fill
        :param items: a sized collection of items, or of (item, data) tuples
        :param random: if True, a random item is selected. Otherwise the first one is
        """
        box.blockSignals(True)
        box.clear()
        for item in items:
            item = list(item) if isinstance(item, tuple) else [item]
            box.addItem(str(item[0]), *item[1:])
        if len(items) > 0:
            box.setCurrentIndex(np.random.randint(len(items)) if random else 0)
        box.setDisabled(len(items) == 0)
        box.blockSignals(False)

    def populate_browser(self, datasets_root: Path, recognized_datasets: List, level: int,
                         random=True):
        """
        Fills the dataset, speaker and utterance boxes from the directory tree under
        <datasets_root>. If no dataset is available, the whole browser is disabled and a
        warning is logged and printed to stderr.

        :param datasets_root: the directory containing the datasets, or None
        :param recognized_datasets: the dataset paths, relative to <datasets_root>, that are
        looked for
        :param level: 0 to repopulate the datasets, speakers and utterances, 1 for the speakers
        and utterances, 2 for the utterances only
        :param random: whether to select a random item in each repopulated box rather than
        the first one
        """
        # Select a random dataset
        if level <= 0:
            if datasets_root is not None:
                datasets = [datasets_root.joinpath(d) for d in recognized_datasets]
                datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()]
                self.browser_load_button.setDisabled(len(datasets) == 0)
            if datasets_root is None or len(datasets) == 0:
                if datasets_root is None:
                    msg = "Warning: you did not pass a root directory for datasets as argument"
                else:
                    msg = "Warning: you do not have any of the recognized datasets in %s" % \
                          datasets_root
                self.log(msg)
                # The longer version of the message only goes to stderr
                msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \
                       "can still use the toolbox by recording samples yourself." % \
                       ("\n\t".join(recognized_datasets))
                print(msg, file=sys.stderr)

                self.random_utterance_button.setDisabled(True)
                self.random_speaker_button.setDisabled(True)
                self.random_dataset_button.setDisabled(True)
                self.utterance_box.setDisabled(True)
                self.speaker_box.setDisabled(True)
                self.dataset_box.setDisabled(True)
                self.browser_load_button.setDisabled(True)
                self.auto_next_checkbox.setDisabled(True)
                return
            self.repopulate_box(self.dataset_box, datasets, random)

        # Select a random speaker
        if level <= 1:
            speakers_root = datasets_root.joinpath(self.current_dataset_name)
            # The full directory name is used (not the stem), so that a speaker directory with
            # a dot in its name still maps back to an existing path below.
            speaker_names = [d.name for d in speakers_root.glob("*") if d.is_dir()]
            self.repopulate_box(self.speaker_box, speaker_names, random)

        # Select a random utterance
        if level <= 2:
            utterances_root = datasets_root.joinpath(
                self.current_dataset_name,
                self.current_speaker_name
            )
            utterances = []
            for extension in ['mp3', 'flac', 'wav', 'm4a']:
                utterances.extend(Path(utterances_root).glob("**/*.%s" % extension))
            utterances = [fpath.relative_to(utterances_root) for fpath in utterances]
            self.repopulate_box(self.utterance_box, utterances, random)

    def browser_select_next(self):
        """
        Selects the next item of the utterance box, wrapping around after the last one. Does
        nothing if the box is empty.
        """
        n_utterances = self.utterance_box.count()
        if n_utterances == 0:
            # Nothing to select, and the modulo below would divide by zero
            return
        index = (self.utterance_box.currentIndex() + 1) % n_utterances
        self.utterance_box.setCurrentIndex(index)

    @property
    def current_encoder_fpath(self):
        """
        Data attached to the item currently selected in the encoder box.
        """
        return self.encoder_box.itemData(self.encoder_box.currentIndex())

    @property
    def current_synthesizer_fpath(self):
        """
        Data attached to the item currently selected in the synthesizer box.
        """
        return self.synthesizer_box.itemData(self.synthesizer_box.currentIndex())

    @property
    def current_vocoder_fpath(self):
        """
        Data attached to the item currently selected in the vocoder box. This is None for the
        "Griffin-Lim" entry.
        """
        return self.vocoder_box.itemData(self.vocoder_box.currentIndex())

    def populate_models(self, encoder_models_dir: Path, synthesizer_models_dir: Path,
                        vocoder_models_dir: Path):
        """
        Fills the encoder, synthesizer and vocoder boxes with the .pt files found in the
        given directories. Each item is named after the stem of the file and holds its path
        as data. The vocoder box gets an additional "Griffin-Lim" entry with no path.

        :param encoder_models_dir: directory searched (non-recursively) for encoder models
        :param synthesizer_models_dir: directory searched recursively for synthesizer models
        :param vocoder_models_dir: directory searched recursively for vocoder models
        :raises Exception: if no encoder or no synthesizer model is found
        """
        # Encoder
        encoder_fpaths = list(encoder_models_dir.glob("*.pt"))
        if len(encoder_fpaths) == 0:
            raise Exception("No encoder models found in %s" % encoder_models_dir)
        self.repopulate_box(self.encoder_box, [(f.stem, f) for f in encoder_fpaths])

        # Synthesizer
        synthesizer_fpaths = list(synthesizer_models_dir.glob("**/*.pt"))
        if len(synthesizer_fpaths) == 0:
            raise Exception("No synthesizer models found in %s" % synthesizer_models_dir)
        self.repopulate_box(self.synthesizer_box, [(f.stem, f) for f in synthesizer_fpaths])

        # Vocoder
        vocoder_fpaths = list(vocoder_models_dir.glob("**/*.pt"))
        vocoder_items = [(f.stem, f) for f in vocoder_fpaths] + [("Griffin-Lim", None)]
        self.repopulate_box(self.vocoder_box, vocoder_items)

    @property
    def selected_utterance(self):
        """
        Data attached to the item currently selected in the utterance history box.
        """
        return self.utterance_history.itemData(self.utterance_history.currentIndex())

    def register_utterance(self, utterance: Utterance):
        """
        Inserts an utterance at the top of the history box and selects it, dropping the
        oldest entry if the history exceeds `max_saved_utterances`. Also enables the play,
        generate and synthesize buttons.

        :param utterance: the utterance to register. Its `name` is used as item text
        """
        self.utterance_history.blockSignals(True)
        self.utterance_history.insertItem(0, utterance.name, utterance)
        self.utterance_history.setCurrentIndex(0)
        self.utterance_history.blockSignals(False)

        if len(self.utterance_history) > self.max_saved_utterances:
            self.utterance_history.removeItem(self.max_saved_utterances)

        self.play_button.setDisabled(False)
        self.generate_button.setDisabled(False)
        self.synthesize_button.setDisabled(False)

    def log(self, line, mode="newline"):
        """
        Updates the log window, which shows at most `max_log_lines` lines.

        :param line: the text to log
        :param mode: "newline" to add a new line (dropping the oldest one if needed), "append"
        to add the text to the end of the last line, "overwrite" to replace the last line
        """
        if mode == "newline":
            self.logs.append(line)
            if len(self.logs) > self.max_log_lines:
                del self.logs[0]
        elif mode == "append":
            self.logs[-1] += line
        elif mode == "overwrite":
            self.logs[-1] = line
        log_text = '\n'.join(self.logs)

        self.log_window.setText(log_text)
        # Let the interface refresh right away
        self.app.processEvents()

    def set_loading(self, value, maximum=1):
        """
        Sets the progress of the loading bar. The text of the bar is hidden when <value> is 0.

        :param value: the current progress, between 0 and <maximum>. May be a float
        :param maximum: the value for which the bar is full. May be a float
        """
        # The bar works with integers: both values are scaled by 100 and explicitly converted,
        # as PyQt5 raises a TypeError when given floats on recent python versions.
        # The maximum is set first because a value outside of the current range is ignored.
        self.loading_bar.setMaximum(int(round(maximum * 100)))
        self.loading_bar.setValue(int(round(value * 100)))
        self.loading_bar.setTextVisible(value != 0)
        self.app.processEvents()

    def populate_gen_options(self, seed, trim_silences):
        """
        Initializes the seed and silence trimming widgets.

        :param seed: the seed to display. If None, the seed checkbox is unchecked and the seed
        textbox is set to 0 and disabled
        :param trim_silences: if falsy, the corresponding checkbox is unchecked and disabled
        """
        if seed is not None:
            self.random_seed_checkbox.setChecked(True)
            self.seed_textbox.setText(str(seed))
            self.seed_textbox.setEnabled(True)
        else:
            self.random_seed_checkbox.setChecked(False)
            self.seed_textbox.setText(str(0))
            self.seed_textbox.setEnabled(False)

        if not trim_silences:
            self.trim_silences_checkbox.setChecked(False)
            self.trim_silences_checkbox.setDisabled(True)

    def update_seed_textbox(self):
        """
        Enables the seed textbox if and only if the seed checkbox is checked.
        """
        self.seed_textbox.setEnabled(self.random_seed_checkbox.isChecked())

    def reset_interface(self):
        """
        Clears all the plots, resets the loading bar, disables the buttons that require an
        utterance or a generated output and fills the log window with empty lines.
        """
        self.draw_embed(None, None, "current")
        self.draw_embed(None, None, "generated")
        self.draw_spec(None, "current")
        self.draw_spec(None, "generated")
        self.draw_umap_projections(set())
        self.set_loading(0)
        self.play_button.setDisabled(True)
        self.generate_button.setDisabled(True)
        self.synthesize_button.setDisabled(True)
        self.vocode_button.setDisabled(True)
        self.replay_wav_button.setDisabled(True)
        self.export_wav_button.setDisabled(True)
        # Pushing max_log_lines empty lines flushes whatever was in the log window
        for _ in range(self.max_log_lines):
            self.log("")

    def __init__(self):
        """
        Creates the application and builds all the widgets of the dialog, then resets the
        interface and shows the window. Call start() to run the event loop.
        """
        ## Initialize the application
        # The application object is created before the dialog itself is initialized
        self.app = QApplication(sys.argv)
        super().__init__(None)
        self.setWindowTitle("SV2TTS toolbox")


        ## Main layouts
        # Root
        root_layout = QGridLayout()
        self.setLayout(root_layout)

        # Browser
        browser_layout = QGridLayout()
        root_layout.addLayout(browser_layout, 0, 0, 1, 2)

        # Generation
        gen_layout = QVBoxLayout()
        root_layout.addLayout(gen_layout, 0, 2, 1, 2)

        # Projections
        self.projections_layout = QVBoxLayout()
        root_layout.addLayout(self.projections_layout, 1, 0, 1, 1)

        # Visualizations
        vis_layout = QVBoxLayout()
        root_layout.addLayout(vis_layout, 1, 1, 1, 3)


        ## Projections
        # UMap
        fig, self.umap_ax = plt.subplots(figsize=(3, 3), facecolor="#F0F0F0")
        fig.subplots_adjust(left=0.02, bottom=0.02, right=0.98, top=0.98)
        self.projections_layout.addWidget(FigureCanvas(fig))
        # Becomes True once the projections have been computed for the first time
        self.umap_hot = False
        self.clear_button = QPushButton("Clear")
        self.projections_layout.addWidget(self.clear_button)


        ## Browser
        # Dataset, speaker and utterance selection
        # i is the index of the current row in the browser layout
        i = 0
        self.dataset_box = QComboBox()
        browser_layout.addWidget(QLabel("Dataset"), i, 0)
        browser_layout.addWidget(self.dataset_box, i + 1, 0)
        self.speaker_box = QComboBox()
        browser_layout.addWidget(QLabel("Speaker"), i, 1)
        browser_layout.addWidget(self.speaker_box, i + 1, 1)
        self.utterance_box = QComboBox()
        browser_layout.addWidget(QLabel("Utterance"), i, 2)
        browser_layout.addWidget(self.utterance_box, i + 1, 2)
        self.browser_load_button = QPushButton("Load")
        browser_layout.addWidget(self.browser_load_button, i + 1, 3)
        i += 2

        # Random buttons
        self.random_dataset_button = QPushButton("Random")
        browser_layout.addWidget(self.random_dataset_button, i, 0)
        self.random_speaker_button = QPushButton("Random")
        browser_layout.addWidget(self.random_speaker_button, i, 1)
        self.random_utterance_button = QPushButton("Random")
        browser_layout.addWidget(self.random_utterance_button, i, 2)
        self.auto_next_checkbox = QCheckBox("Auto select next")
        self.auto_next_checkbox.setChecked(True)
        browser_layout.addWidget(self.auto_next_checkbox, i, 3)
        i += 1

        # Utterance box
        browser_layout.addWidget(QLabel("Use embedding from:"), i, 0)
        self.utterance_history = QComboBox()
        browser_layout.addWidget(self.utterance_history, i, 1, 1, 3)
        i += 1

        # Browse, record, play and stop buttons
        self.browser_browse_button = QPushButton("Browse")
        browser_layout.addWidget(self.browser_browse_button, i, 0)
        self.record_button = QPushButton("Record")
        browser_layout.addWidget(self.record_button, i, 1)
        self.play_button = QPushButton("Play")
        browser_layout.addWidget(self.play_button, i, 2)
        self.stop_button = QPushButton("Stop")
        browser_layout.addWidget(self.stop_button, i, 3)
        i += 1

        # Model and audio output selection
        self.encoder_box = QComboBox()
        browser_layout.addWidget(QLabel("Encoder"), i, 0)
        browser_layout.addWidget(self.encoder_box, i + 1, 0)
        self.synthesizer_box = QComboBox()
        browser_layout.addWidget(QLabel("Synthesizer"), i, 1)
        browser_layout.addWidget(self.synthesizer_box, i + 1, 1)
        self.vocoder_box = QComboBox()
        browser_layout.addWidget(QLabel("Vocoder"), i, 2)
        browser_layout.addWidget(self.vocoder_box, i + 1, 2)

        self.audio_out_devices_cb = QComboBox()
        browser_layout.addWidget(QLabel("Audio Output"), i, 3)
        browser_layout.addWidget(self.audio_out_devices_cb, i + 1, 3)
        i += 2

        # Replay & Save Audio
        browser_layout.addWidget(QLabel("Toolbox Output:"), i, 0)
        self.waves_cb = QComboBox()
        self.waves_cb_model = QStringListModel()
        self.waves_cb.setModel(self.waves_cb_model)
        self.waves_cb.setToolTip("Select one of the last generated waves in this section for replaying or exporting")
        browser_layout.addWidget(self.waves_cb, i, 1)
        self.replay_wav_button = QPushButton("Replay")
        self.replay_wav_button.setToolTip("Replay last generated vocoder")
        browser_layout.addWidget(self.replay_wav_button, i, 2)
        self.export_wav_button = QPushButton("Export")
        self.export_wav_button.setToolTip("Save last generated vocoder audio in filesystem as a wav file")
        browser_layout.addWidget(self.export_wav_button, i, 3)
        i += 1


        ## Embed & spectrograms
        vis_layout.addStretch()

        # Each row holds a narrow embedding plot followed by a wide spectrogram plot
        gridspec_kw = {"width_ratios": [1, 4]}
        fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
                                            gridspec_kw=gridspec_kw)
        fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
        vis_layout.addWidget(FigureCanvas(fig))

        fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
                                        gridspec_kw=gridspec_kw)
        fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
        vis_layout.addWidget(FigureCanvas(fig))

        for ax in self.current_ax.tolist() + self.gen_ax.tolist():
            ax.set_facecolor("#F0F0F0")
            for side in ["top", "right", "bottom", "left"]:
                ax.spines[side].set_visible(False)


        ## Generation
        self.text_prompt = QPlainTextEdit(default_text)
        gen_layout.addWidget(self.text_prompt, stretch=1)

        self.generate_button = QPushButton("Synthesize and vocode")
        gen_layout.addWidget(self.generate_button)

        layout = QHBoxLayout()
        self.synthesize_button = QPushButton("Synthesize only")
        layout.addWidget(self.synthesize_button)
        self.vocode_button = QPushButton("Vocode only")
        layout.addWidget(self.vocode_button)
        gen_layout.addLayout(layout)

        layout_seed = QGridLayout()
        self.random_seed_checkbox = QCheckBox("Random seed:")
        self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.")
        layout_seed.addWidget(self.random_seed_checkbox, 0, 0)
        self.seed_textbox = QLineEdit()
        self.seed_textbox.setMaximumWidth(80)
        layout_seed.addWidget(self.seed_textbox, 0, 1)
        self.trim_silences_checkbox = QCheckBox("Enhance vocoder output")
        self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output."
            " This feature requires `webrtcvad` to be installed.")
        layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2)
        gen_layout.addLayout(layout_seed)

        self.loading_bar = QProgressBar()
        gen_layout.addWidget(self.loading_bar)

        self.log_window = QLabel()
        self.log_window.setAlignment(Qt.AlignBottom | Qt.AlignLeft)
        gen_layout.addWidget(self.log_window)
        # Lines currently displayed in the log window
        self.logs = []
        gen_layout.addStretch()


        ## Set the size of the window and of the elements
        max_size = QDesktopWidget().availableGeometry(self).size() * 0.8
        self.resize(max_size)

        ## Finalize the display
        self.reset_interface()
        self.show()

    def start(self):
        """
        Runs the event loop of the application.
        """
        self.app.exec_()