Spaces:
Sleeping
Sleeping
File size: 2,776 Bytes
1f3bd14 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
from typing import Optional, Union, Dict
from speakers.common.registry import registry
from speakers.processors import BaseProcessor, ProcessorData
from io import BytesIO
import logging
import numpy as np
import edge_tts
import asyncio
import nest_asyncio
import util
import librosa
# Module-level logger; callers can swap it out via set_edge_to_voice_logger().
logger = logging.getLogger('edge_to_voice')


def set_edge_to_voice_logger(l):
    """Install ``l`` as this module's ``logger``.

    :param l: object to bind to the module-level name ``logger``;
        it is stored as-is, without validation.
    """
    # Rebind the module global directly (equivalent to ``global logger``).
    globals()['logger'] = l
class EdgeProcessorData(ProcessorData):
    """Request payload consumed by the ``edge_to_voice`` processor.

    :param text: text to synthesize
    :param tts_speaker: speaker id
    :param rate: speech rate
    :param volume: speech volume (loudness)
    """

    # Text to synthesize.
    text: str

    # Speaker id (EdgeToVoice uses it as an index into its voice list).
    tts_speaker: int

    # Speech rate, passed through to edge_tts.Communicate(rate=...).
    rate: str

    # Speech volume, passed through to edge_tts.Communicate(volume=...).
    volume: str

    @property
    def type(self) -> str:
        """Type of the Message, used for serialization."""
        return "EDGE"
@registry.register_processor("edge_to_voice")
class EdgeToVoice(BaseProcessor):
    """Text-to-speech processor backed by ``edge_tts``.

    On construction the available voices are fetched once with
    ``edge_tts.list_voices()``. Calling the instance synthesizes
    ``data.text`` with the selected voice, converts the MP3 stream to WAV
    through an ``ffmpeg`` subprocess and decodes it with ``librosa.load``.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): nest_asyncio is applied so run_until_complete() can be
        # used below even when an event loop is already running - confirm
        # against the hosting application.
        nest_asyncio.apply()
        self._tts_speakers_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())  # noqa

    def __call__(
            self,
            data: EdgeProcessorData
    ):
        """Synthesize ``data.text`` and return the decoded audio.

        :param data: request carrying text, speaker index, rate and volume.
        :return: the ``(samples, sample_rate)`` pair produced by
            ``librosa.load``.
        :raises RuntimeError: if ``data.text`` or ``data.tts_speaker`` is
            ``None``, or if the ffmpeg conversion fails.
        """
        if data.text is None:
            raise RuntimeError('Please provide TTS text.')
        if data.tts_speaker is None:
            # The original message here was a copy of the text check above.
            raise RuntimeError('Please provide TTS speaker.')
        # Run the coroutine synchronously on the current event loop.
        tts_np, tts_sr = asyncio.get_event_loop().run_until_complete(self._call_edge_tts(data=data))
        return tts_np, tts_sr

    @property
    def tts_speakers_list(self):
        """Voice list fetched from ``edge_tts.list_voices()`` at construction."""
        return self._tts_speakers_list

    @classmethod
    def from_config(cls, cfg=None):
        """Build the processor from a config object.

        :param cfg: must not be ``None``; it is not otherwise read here.
        :raises RuntimeError: if ``cfg`` is ``None``.
        """
        if cfg is None:
            raise RuntimeError("from_config cfg is None.")
        return cls()

    def match(self, data: ProcessorData):
        """Return whether ``"EDGE"`` is contained in ``data.type``."""
        return "EDGE" in data.type

    async def _call_edge_tts(self, data: EdgeProcessorData):
        """Stream TTS audio from edge-tts, transcode it to WAV and decode it.

        :param data: request whose ``tts_speaker`` indexes the voice list.
        :return: the result of ``librosa.load`` on the WAV bytes.
        :raises RuntimeError: if ffmpeg exits with a non-zero status.
        """
        speaker = self._tts_speakers_list[data.tts_speaker]['ShortName']
        tts_com = edge_tts.Communicate(text=data.text, voice=speaker, rate=data.rate, volume=data.volume)

        # Collect the audio chunks and join once; repeated ``bytes +=``
        # re-copies the whole buffer on every chunk.
        audio_chunks = [
            chunk['data']
            async for chunk in tts_com.stream()
            if chunk['type'] == 'audio'
        ]
        tts_raw = b''.join(audio_chunks)

        # Convert mp3 stream to wav: ffmpeg reads stdin and writes stdout.
        ffmpeg_proc = await asyncio.create_subprocess_exec(
            'ffmpeg',
            '-f', 'mp3',
            '-i', '-',
            '-f', 'wav',
            '-loglevel', 'error',
            '-',
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE
        )
        (tts_wav, _) = await ffmpeg_proc.communicate(tts_raw)

        # Fail here with a clear message instead of letting librosa choke on
        # truncated or empty output. ffmpeg's own diagnostics still go to the
        # inherited stderr.
        if ffmpeg_proc.returncode != 0:
            raise RuntimeError(
                f'ffmpeg failed to convert TTS audio (exit code {ffmpeg_proc.returncode}).'
            )

        return librosa.load(BytesIO(tts_wav))
|