import librosa
import numpy as np
from encoder import inference as encoder
from utils import logmmse
from synthesizer import audio
from pathlib import Path
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin


class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
    pass


pinyin = Pinyin(PinyinConverter()).pinyin
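
# Illustrative example: with Style.TONE3 the tone number is appended to each
# syllable, and NeutralToneWith5Mixin renders the neutral tone as "5" instead
# of leaving it unmarked, e.g.
#     pinyin("你好吗", style=Style.TONE3)  # -> [["ni3"], ["hao3"], ["ma5"]]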


def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                       skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays.
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in _split_on_silences).
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it to the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.
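    #   Concretely (a hedged note, assuming a centered STFT as in librosa's
    #   defaults): the spectrogram has roughly 1 + len(wav) // hop_length
    #   frames, so len(wav) != mel_frames * hop_length in general.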

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Trim silence
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
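
    # Illustrative return value (hypothetical numbers): for a source file
    # "0001.wav", preprocess_speaker_general below builds the basename
    # "0001.wav_00"; assuming a 1 s clip at 16 kHz with hop_size=200, this
    # tuple would be roughly
    #     ("audio-0001.wav_00.npy", "mel-0001.wav_00.npy",
    #      "embed-0001.wav_00.npy", 16000, 81, "ni3 hao3")
    # Note that the embedding file is only named here, not written; it is
    # expected to be produced by a separate embedding pass.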


def _split_on_silences(wav_fpath, words, hparams):
    # Load the audio waveform and trim leading/trailing silence
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    wav = librosa.effects.trim(wav, top_db=40, frame_length=2048, hop_length=512)[0]
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Denoise (we may not need it here): build a noise profile from 0.15 s at
    # each end of the clip, which requires the clip to be longer than 0.4 s
    if len(wav) > hparams.sample_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([wav[:int(hparams.sample_rate * 0.15)],
                                    wav[-int(hparams.sample_rate * 0.15):]])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Convert the transcript to tone-numbered pinyin, dropping entries that
    # strip to the empty string
    resp = pinyin(words, style=Style.TONE3)
    res = " ".join(v[0] for v in resp if v[0].strip())
    return wav, res
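
# Illustrative example: for words = "你好 世界", _split_on_silences would return
# the text "ni3 hao3 shi4 jie4"; the whitespace in the transcript yields
# entries that strip to empty and are filtered out.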


def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
                               dict_info, no_alignments: bool):
    metadata = []
    extensions = ["*.wav", "*.flac", "*.mp3"]
    for extension in extensions:
        wav_fpath_list = speaker_dir.glob(extension)
        # Iterate over each wav
        for wav_fpath in wav_fpath_list:
            # Look up the transcript by file stem first, then by full file name
            words = dict_info.get(wav_fpath.name.split(".")[0])
            words = dict_info.get(wav_fpath.name) if not words else words
            if not words:
                print("No transcript found for %s, skipping" % wav_fpath.name)
                continue
            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
            wav, text = _split_on_silences(wav_fpath, words, hparams)
            metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                               skip_existing, hparams))
    return [m for m in metadata if m is not None]
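

# A minimal usage sketch. The transcript format and the hparams import are
# assumptions about the surrounding project; adjust them to your setup.
if __name__ == "__main__":
    from synthesizer.hparams import hparams  # assumed location of the hparams object

    speaker_dir = Path("datasets/speaker_0001")    # hypothetical path
    out_dir = Path("datasets/SV2TTS/synthesizer")  # hypothetical path
    out_dir.joinpath("mels").mkdir(parents=True, exist_ok=True)
    out_dir.joinpath("audio").mkdir(parents=True, exist_ok=True)

    # Assumed transcript format: one "<utterance_id> <transcript>" pair per line
    dict_info = {}
    with open("datasets/transcript.txt", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if " " not in line:
                continue
            name, text = line.split(" ", 1)
            dict_info[name] = text

    metadata = preprocess_speaker_general(speaker_dir, out_dir, skip_existing=True,
                                          hparams=hparams, dict_info=dict_info,
                                          no_alignments=True)
    print("%d utterances kept" % len(metadata))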