# File size: 2,776 Bytes
# 1f3bd14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from typing import Optional, Union, Dict

from speakers.common.registry import registry
from speakers.processors import BaseProcessor, ProcessorData
from io import BytesIO
import logging
import numpy as np
import edge_tts
import asyncio
import nest_asyncio
import util
import librosa

# Module-wide logger; it can be swapped out at runtime through
# set_edge_to_voice_logger().
logger = logging.getLogger('edge_to_voice')


def set_edge_to_voice_logger(l):  # noqa: E741 - name kept for keyword callers
    """Replace the module-level ``logger`` with the given object.

    :param l: the object to install as this module's ``logger``
    """
    # Rebinding through globals() is equivalent to ``global logger; logger = l``.
    globals()['logger'] = l


class EdgeProcessorData(ProcessorData):
    """Input payload consumed by the ``edge_to_voice`` processor.

    :param text: text to synthesize
    :param tts_speaker: speaker id
    :param rate: speaking rate
    :param volume: loudness / emphasis of the voice
    """

    # Text to synthesize.
    text: str
    # Speaker id; EdgeToVoice uses it as an index into its list of voices.
    tts_speaker: int
    # Speaking rate; handed unchanged to ``edge_tts.Communicate(rate=...)``.
    rate: str
    # Loudness / emphasis; handed unchanged to
    # ``edge_tts.Communicate(volume=...)``.
    volume: str

    @property
    def type(self) -> str:
        """Return the message type tag used for serialization (``"EDGE"``)."""
        return "EDGE"


@registry.register_processor("edge_to_voice")
class EdgeToVoice(BaseProcessor):
    """Text-to-speech processor backed by ``edge_tts``.

    The list of available voices is fetched once at construction time. Calling
    the instance synthesizes ``data.text`` with the selected voice, converts
    the resulting MP3 stream to WAV through an ``ffmpeg`` subprocess and
    decodes it with ``librosa.load``.
    """

    def __init__(self):
        """Fetch and cache the voice list from ``edge_tts.list_voices()``."""
        super().__init__()
        # nest_asyncio patches asyncio so that run_until_complete() can be
        # used below even if an event loop is already running.
        nest_asyncio.apply()
        self._tts_speakers_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())  # noqa

    def __call__(
            self,
            data: EdgeProcessorData
    ):
        """Synthesize ``data.text`` and return the decoded audio.

        :param data: request carrying the text, speaker index, rate and volume
        :return: the ``(samples, sample_rate)`` tuple produced by
            ``librosa.load``
        :raises RuntimeError: if the text or the speaker is missing, if
            edge-tts yields no audio, or if the ffmpeg conversion fails
        :raises IndexError: if ``data.tts_speaker`` is outside the voice list
        """
        if data.text is None:
            raise RuntimeError('Please provide TTS text.')

        if data.tts_speaker is None:
            # The original message wrongly repeated the "text" wording here.
            raise RuntimeError('Please provide TTS speaker.')

        # Run the coroutine synchronously.
        tts_np, tts_sr = asyncio.get_event_loop().run_until_complete(self._call_edge_tts(data=data))

        return tts_np, tts_sr

    @property
    def tts_speakers_list(self):
        """Voice list cached from ``edge_tts.list_voices()`` in ``__init__``."""
        return self._tts_speakers_list

    @classmethod
    def from_config(cls, cfg=None):
        """Build the processor from a configuration object.

        :param cfg: configuration; it must not be ``None``, but no value is
            read from it
        :raises RuntimeError: if ``cfg`` is ``None``
        """
        if cfg is None:
            raise RuntimeError("from_config cfg is None.")

        return cls()

    def match(self, data: ProcessorData):
        """Return whether ``data.type`` contains the ``"EDGE"`` tag."""
        return "EDGE" in data.type

    async def _call_edge_tts(self, data: EdgeProcessorData):
        """Stream speech from edge-tts, convert it to WAV and decode it.

        :param data: request carrying the text, speaker index, rate and volume
        :return: the ``(samples, sample_rate)`` tuple produced by
            ``librosa.load``
        :raises RuntimeError: if no audio is received or ffmpeg fails
        """
        speaker = self._tts_speakers_list[data.tts_speaker]['ShortName']
        tts_com = edge_tts.Communicate(text=data.text, voice=speaker, rate=data.rate, volume=data.volume)

        # Collect the audio chunks and join them once, rather than growing a
        # bytes object with repeated concatenation.
        audio_chunks = []
        async for chunk in tts_com.stream():
            if chunk['type'] == 'audio':
                audio_chunks.append(chunk['data'])
        tts_raw = b''.join(audio_chunks)

        if not tts_raw:
            # Fail early with a clear message instead of piping nothing
            # into ffmpeg.
            raise RuntimeError('edge-tts returned no audio data.')

        # Convert the MP3 stream to WAV (stdin -> stdout). stderr is left
        # attached to the parent process so ffmpeg's own error output stays
        # visible.
        ffmpeg_proc = await asyncio.create_subprocess_exec(
            'ffmpeg',
            '-f', 'mp3',
            '-i', '-',
            '-f', 'wav',
            '-loglevel', 'error',
            '-',
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE
        )
        (tts_wav, _) = await ffmpeg_proc.communicate(tts_raw)

        if ffmpeg_proc.returncode != 0 or not tts_wav:
            # Surface a failed conversion here; otherwise it only shows up
            # as an opaque decoding error inside librosa.
            raise RuntimeError(
                f'ffmpeg failed to convert TTS audio (exit code {ffmpeg_proc.returncode}).'
            )

        return librosa.load(BytesIO(tts_wav))