# Extracted from a hosted "Spaces" file view (status: Running; file size: 4,718 bytes; revision 7088d16).
import os
from paddlespeech.cli.tts.infer import TTSExecutor
"""
PaddleSpeech
声码器说明:这里预置了三种声码器【PWGan】【WaveRnn】【HifiGan】,三种声码器的效果和生成时间有比较大的差距,请根据自己的需要进行选择。不过只选择了其中两种(未选用 WaveRnn,因为它太慢了)
| 声码器 | 音频质量 | 生成速度 |
| :----: | :----: | :----: |
| PWGan | 中等 | 中等 |
| WaveRnn | 高 | 非常慢(耐心等待) |
| HifiGan | 低 | 快 |
这些PaddleSpeech中的样例主要按数据集分类,我们主要使用的TTS数据集有:
CSMSC (普通话单发音人)
AISHELL3 (普通话多发音人)
LJSpeech (英文单发音人)
VCTK (英文多发音人)
PaddleSpeech 的 TTS 模型具有以下映射关系:
tts0 - Tacotron2
tts1 - TransformerTTS
tts2 - SpeedySpeech
tts3 - FastSpeech2
voc0 - WaveFlow
voc1 - Parallel WaveGAN
voc2 - MelGAN
voc3 - MultiBand MelGAN
voc4 - Style MelGAN
voc5 - HiFiGAN
vc0 - Tacotron2 Voice Clone with GE2E
vc1 - FastSpeech2 Voice Clone with GE2E
以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表:
- 声学模型
| 模型 | 语言 |
| :--- | :---: |
| speedyspeech_csmsc | zh |
| fastspeech2_csmsc | zh |
| fastspeech2_ljspeech | en |
| fastspeech2_aishell3 | zh |
| fastspeech2_vctk | en |
| fastspeech2_cnndecoder_csmsc | zh |
| fastspeech2_mix | mix |
| tacotron2_csmsc | zh |
| tacotron2_ljspeech | en |
| fastspeech2_male | zh |
| fastspeech2_male | en |
| fastspeech2_male | mix |
| fastspeech2_canton | canton |
- 声码器
| 模型 | 语言 |
| :--- | :---: |
| pwgan_csmsc | zh |
| pwgan_ljspeech | en |
| pwgan_aishell3 | zh |
| pwgan_vctk | en |
| mb_melgan_csmsc | zh |
| style_melgan_csmsc | zh |
| hifigan_csmsc | zh |
| hifigan_ljspeech | en |
| hifigan_aishell3 | zh |
| hifigan_vctk | en |
| wavernn_csmsc | zh |
| pwgan_male | zh |
| hifigan_male | zh |
"""
class PaddleTTS:
    """Small convenience wrapper around PaddleSpeech text-to-speech.

    ``predict`` maps short, case-insensitive model names (for example
    ``'FastSpeech2'`` / ``'PWGan'``) onto full pretrained-model identifiers
    and synthesizes the given text into an audio file.
    """

    def __init__(self) -> None:
        pass

    def predict(self, text, am, voc, spk_id=174, lang='zh', male=False, save_path='output.wav'):
        """Synthesize ``text`` to ``save_path``.

        Args:
            text: Text to synthesize.
            am: Acoustic model family, ``'tacotron2'`` or ``'fastspeech2'``
                (case-insensitive). Not checked and not used when ``male``
                is true; replaced when ``lang`` is ``'mix'`` or ``'canton'``.
            voc: Vocoder family (case-insensitive). Checked against a fixed
                list only when ``male`` is true or ``lang == 'zh'``.
            spk_id: Speaker id. Not passed on when ``male`` is true; forced
                to ``10`` when ``lang == 'canton'``.
            lang: ``'zh'``, ``'en'``, ``'mix'`` or ``'canton'``. Any other
                value leaves the model names without a dataset suffix.
            male: Use the ``fastspeech2_male`` acoustic model with the
                ``<voc>_male`` vocoder through the Python API.
            save_path: Output audio path.

        Returns:
            ``save_path`` when the ``paddlespeech`` command-line run
            succeeds; otherwise whatever the ``TTSExecutor`` call returns.

        Raises:
            AssertionError: If ``am`` or ``voc`` fails the checks above.
        """
        # Imported here so the method is self-contained with respect to the
        # module's import section.
        import subprocess

        use_onnx = True
        voc = voc.lower()
        am = am.lower()

        if male:
            # Explicit raise (not ``assert``) so the check survives
            # ``python -O``; the exception type is kept for existing callers.
            if voc not in ("pwgan", "hifigan"):
                raise AssertionError("male voc must be 'pwgan' or 'hifigan'")
            # A fresh executor per call, as before.
            # NOTE(review): reusing one executor across different am/voc
            # choices has not been verified as safe - confirm before caching.
            self.tts = TTSExecutor()
            return self.tts(
                text=text,
                output=save_path,
                am='fastspeech2_male',
                voc=voc + '_male',
                lang=lang,
                use_onnx=use_onnx,
            )

        if am not in ('tacotron2', 'fastspeech2'):
            raise AssertionError("am must be 'tacotron2' or 'fastspeech2'")

        if lang == 'mix':
            # Mixed Chinese/English synthesis: only fastspeech2 is available.
            am = 'fastspeech2_mix'
            voc += '_csmsc'
        elif lang == 'en':
            # English synthesis.
            am += '_ljspeech'
            voc += '_ljspeech'
        elif lang == 'zh':
            # Mandarin synthesis.
            if voc not in ('wavernn', 'pwgan', 'hifigan', 'style_melgan', 'mb_melgan'):
                raise AssertionError(
                    "voc must be 'wavernn' or 'pwgan' or 'hifigan' or 'style_melgan' or 'mb_melgan'"
                )
            am += '_csmsc'
            voc += '_csmsc'
        elif lang == 'canton':
            # Cantonese uses a fixed model pair and speaker id.
            am = 'fastspeech2_canton'
            voc = 'pwgan_aishell3'
            spk_id = 10

        self.tts = TTSExecutor()
        print("am:", am, "voc:", voc, "lang:", lang, "male:", male, "spk_id:", spk_id)

        # Pass an argument list with no shell: the text reaches the CLI
        # verbatim, so quotes, ``$`` or backticks in it can neither break the
        # command nor inject another one.
        cli_args = [
            'paddlespeech', 'tts',
            '--am', am,
            '--voc', voc,
            '--input', text,
            '--output', save_path,
            '--lang', lang,
            '--spk_id', spk_id,
            '--use_onnx', use_onnx,
        ]
        try:
            # check=True turns a non-zero exit status into an exception, so
            # the fallback below actually runs when the CLI fails.
            subprocess.run([str(arg) for arg in cli_args], check=True)
        except (OSError, subprocess.CalledProcessError):
            # CLI missing or failed: synthesize through the Python API.
            return self.tts(
                text=text,
                output=save_path,
                am=am,
                voc=voc,
                lang=lang,
                spk_id=spk_id,
                use_onnx=use_onnx,
            )
        return save_path
if __name__ == "__main__":
    # Manual smoke test: synthesize a short English sentence to output.wav
    # using the fastspeech2 acoustic model and the pwgan vocoder.
    tts = PaddleTTS()
    tts.predict("Hello world", 'FastSpeech2', 'PWGan', spk_id=174, lang='en', male=False, save_path='output.wav')