File size: 10,109 Bytes
17d0a32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a5e8bc
 
 
 
 
 
 
 
 
 
 
 
 
5c0a088
8a5e8bc
 
 
 
 
 
5c0a088
8a5e8bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17d0a32
8a5e8bc
17d0a32
8a5e8bc
 
 
 
17d0a32
 
 
 
 
 
 
 
 
8a5e8bc
 
 
 
 
 
 
17d0a32
8a5e8bc
 
17d0a32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a5e8bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
import time, logging, json, sys, struct
import numpy as np
from scipy.io.wavfile import WAVE_FORMAT

def write_numpy_to_wave(filename, rate, data, add_header=False):
    """
    Write a NumPy array as a WAV file.
    """
    def _array_tofile(fid, data):
        # ravel gives a c-contiguous buffer
        fid.write(data.ravel().view('b').data)

    if hasattr(filename, 'write'):
        fid = filename
    else:
        fid = open(filename, 'wb')

    fs = rate

    try:
        dkind = data.dtype.kind
        if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and
                                                 data.dtype.itemsize == 1)):
            raise ValueError("Unsupported data type '%s'" % data.dtype)

        header_data = b''

        header_data += b'RIFF'
        header_data += b'\x00\x00\x00\x00'
        header_data += b'WAVE'

        # fmt chunk
        header_data += b'fmt '
        if dkind == 'f':
            format_tag = WAVE_FORMAT.IEEE_FLOAT
        else:
            format_tag = WAVE_FORMAT.PCM
        if data.ndim == 1:
            channels = 1
        else:
            channels = data.shape[1]
        bit_depth = data.dtype.itemsize * 8
        bytes_per_second = fs*(bit_depth // 8)*channels
        block_align = channels * (bit_depth // 8)

        fmt_chunk_data = struct.pack('<HHIIHH', format_tag, channels, fs,
                                     bytes_per_second, block_align, bit_depth)
        if not (dkind == 'i' or dkind == 'u'):
            # add cbSize field for non-PCM files
            fmt_chunk_data += b'\x00\x00'

        header_data += struct.pack('<I', len(fmt_chunk_data))
        header_data += fmt_chunk_data

        # fact chunk (non-PCM files)
        if not (dkind == 'i' or dkind == 'u'):
            header_data += b'fact'
            header_data += struct.pack('<II', 4, data.shape[0])

        # check data size (needs to be immediately before the data chunk)
        if ((len(header_data)-4-4) + (4+4+data.nbytes)) > 0xFFFFFFFF:
            raise ValueError("Data exceeds wave file size limit")
        if add_header:
            fid.write(header_data)
            # data chunk
            fid.write(b'data')
            fid.write(struct.pack('<I', data.nbytes))
            if data.dtype.byteorder == '>' or (data.dtype.byteorder == '=' and
                                            sys.byteorder == 'big'):
                data = data.byteswap()
        _array_tofile(fid, data)

        if add_header:
            # Determine file size and place it in correct
            #  position at start of the file.
            size = fid.tell()
            fid.seek(4)
            fid.write(struct.pack('<I', size-8))

    finally:
        if not hasattr(filename, 'write'):
            fid.close()
        else:
            fid.seek(0)

def is_speaker_speaking(vad, data, sample_rate):
    # Function to detect if the speaker is speaking
    # The WebRTC VAD only accepts 16-bit mono PCM audio, 
    # sampled at 8000, 16000, 32000 or 48000 Hz. 
    # A frame must be either 10, 20, or 30 ms in duration:
    frame_duration = 30
    n_bit_each = int(sample_rate * frame_duration / 1000)*2 # x2 because audio is 16 bit (2 bytes)
    res_list = []
    for t in range(len(data)):
        if t!=0 and t % n_bit_each == 0:
            res_list.append(vad.is_speech(data[t-n_bit_each:t], sample_rate))
    
    info = ''.join(['^' if r else '.' for r in res_list])
    info = info[:10]
    if any(res_list):
        return True, info
    else:
        return False, info


class AliyunASR():

    def test_on_sentence_begin(self, message, *args):
        # print("test_on_sentence_begin:{}".format(message))
        pass

    def test_on_sentence_end(self, message, *args):
        # print("test_on_sentence_end:{}".format(message))
        message = json.loads(message)
        self.parsed_sentence = message['payload']['result']
        self.event_on_entence_end.set()
        # print(self.parsed_sentence)

    def test_on_start(self, message, *args):
        # print("test_on_start:{}".format(message))
        pass

    def test_on_error(self, message, *args):
        logging.error("on_error args=>{}".format(args))
        pass

    def test_on_close(self, *args):
        self.aliyun_service_ok = False
        pass

    def test_on_result_chg(self, message, *args):
        # print("test_on_chg:{}".format(message))
        message = json.loads(message)
        self.parsed_text = message['payload']['result']
        self.event_on_result_chg.set()

    def test_on_completed(self, message, *args):
        # print("on_completed:args=>{} message=>{}".format(args, message))
        pass

    def audio_convertion_thread(self, uuid):
        # 在一个异步线程中采集音频
        import nls  # pip install git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git
        import tempfile
        from scipy import io
        from toolbox import get_conf
        from .audio_io import change_sample_rate
        from .audio_io import RealtimeAudioDistribution
        NEW_SAMPLERATE = 16000
        rad = RealtimeAudioDistribution()
        rad.clean_up()
        temp_folder = tempfile.gettempdir()
        TOKEN, APPKEY = get_conf('ALIYUN_TOKEN', 'ALIYUN_APPKEY')
        if len(TOKEN) == 0:
            TOKEN = self.get_token()
        self.aliyun_service_ok = True
        URL="wss://nls-gateway.aliyuncs.com/ws/v1"
        sr = nls.NlsSpeechTranscriber(
                    url=URL,
                    token=TOKEN,
                    appkey=APPKEY,
                    on_sentence_begin=self.test_on_sentence_begin,
                    on_sentence_end=self.test_on_sentence_end,
                    on_start=self.test_on_start,
                    on_result_changed=self.test_on_result_chg,
                    on_completed=self.test_on_completed,
                    on_error=self.test_on_error,
                    on_close=self.test_on_close,
                    callback_args=[uuid.hex]
                )
        timeout_limit_second = 20
        r = sr.start(aformat="pcm",
                timeout=timeout_limit_second,
                enable_intermediate_result=True,
                enable_punctuation_prediction=True,
                enable_inverse_text_normalization=True)

        import webrtcvad
        vad = webrtcvad.Vad()
        vad.set_mode(1)

        is_previous_frame_transmitted = False   # 上一帧是否有人说话
        previous_frame_data = None
        echo_cnt = 0        # 在没有声音之后,继续向服务器发送n次音频数据
        echo_cnt_max = 4   # 在没有声音之后,继续向服务器发送n次音频数据
        keep_alive_last_send_time = time.time()
        while not self.stop:
            # time.sleep(self.capture_interval)
            audio = rad.read(uuid.hex) 
            if audio is not None:
                # convert to pcm file
                temp_file = f'{temp_folder}/{uuid.hex}.pcm' # 
                dsdata = change_sample_rate(audio, rad.rate, NEW_SAMPLERATE) # 48000 --> 16000
                write_numpy_to_wave(temp_file, NEW_SAMPLERATE, dsdata)
                # read pcm binary
                with open(temp_file, "rb") as f: data = f.read()
                is_speaking, info = is_speaker_speaking(vad, data, NEW_SAMPLERATE)

                if is_speaking or echo_cnt > 0:
                    # 如果话筒激活 / 如果处于回声收尾阶段
                    echo_cnt -= 1
                    if not is_previous_frame_transmitted:   # 上一帧没有人声,但是我们把上一帧同样加上
                        if previous_frame_data is not None: data = previous_frame_data + data
                    if is_speaking:
                        echo_cnt = echo_cnt_max
                    slices = zip(*(iter(data),) * 640)      # 640个字节为一组
                    for i in slices: sr.send_audio(bytes(i))
                    keep_alive_last_send_time = time.time()
                    is_previous_frame_transmitted = True
                else:
                    is_previous_frame_transmitted = False
                    echo_cnt = 0
                    # 保持链接激活,即使没有声音,也根据时间间隔,发送一些音频片段给服务器
                    if time.time() - keep_alive_last_send_time > timeout_limit_second/2:
                        slices = zip(*(iter(data),) * 640)    # 640个字节为一组
                        for i in slices: sr.send_audio(bytes(i))
                        keep_alive_last_send_time = time.time()
                        is_previous_frame_transmitted = True
                self.audio_shape = info
            else:
                time.sleep(0.1)

            if not self.aliyun_service_ok:
                self.stop = True
                self.stop_msg = 'Aliyun音频服务异常,请检查ALIYUN_TOKEN和ALIYUN_APPKEY是否过期。'
        r = sr.stop()

    def get_token(self):
        from toolbox import get_conf
        import json
        from aliyunsdkcore.request import CommonRequest
        from aliyunsdkcore.client import AcsClient
        AccessKey_ID, AccessKey_secret = get_conf('ALIYUN_ACCESSKEY', 'ALIYUN_SECRET')

        # 创建AcsClient实例
        client = AcsClient(
            AccessKey_ID,
            AccessKey_secret,
            "cn-shanghai"
        )

        # 创建request,并设置参数。
        request = CommonRequest()
        request.set_method('POST')
        request.set_domain('nls-meta.cn-shanghai.aliyuncs.com')
        request.set_version('2019-02-28')
        request.set_action_name('CreateToken')

        try:
            response = client.do_action_with_exception(request)
            print(response)
            jss = json.loads(response)
            if 'Token' in jss and 'Id' in jss['Token']:
                token = jss['Token']['Id']
                expireTime = jss['Token']['ExpireTime']
                print("token = " + token)
                print("expireTime = " + str(expireTime))
        except Exception as e:
            print(e)

        return token