File size: 18,949 Bytes
2f2406a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
import json
import os
import time
import azure.cognitiveservices.speech as speechsdk
import datetime
import zipfile
import io
import copy
import re
from urllib.request import urlopen
from pathlib import Path

from . import azure_batch
from . import utils 
from .utils import parseBool
# Module-level settings. The values below are hard-coded in this module.

# Azure credentials come from the SPEECH_KEY / SPEECH_REGION environment
# variables; os.environ.get yields None for a variable that is not set.
AZURE_SPEECH_KEY = os.environ.get('SPEECH_KEY')
AZURE_SPEECH_REGION = os.environ.get('SPEECH_REGION')

# Values written (with an "ms" suffix) into the Sentenceboundary-exact and
# Comma-exact <mstts:silence> tags. Setting either to the string 'default'
# makes the synthesis functions omit the corresponding tag.
azure_sentence_pause = 80
azure_comma_pause = 50
# When True: the batch path keeps existing files in workingFolder and saves the
# downloaded result zip there; the per-line path writes extra _p1/_p2 copies.
debug_mode = False
# synthesize_dictionary only synthesizes when this equals 'azure'.
tts_service = 'azure'


# ======================================== Pronunciation Correction Functions ================================================
# All override files live in the 'SSML_Customization' folder that sits beside
# the directory containing this module (i.e. under this file's grandparent).
# Every file below is read once, when this module is imported.
BASE_DIR = Path(__file__).resolve().parent.parent / 'SSML_Customization'

# Rows used by add_interpretas_tags; each row is indexed by the column names
# 'Text', 'interpret-as Type', 'Case Sensitive (True/False)', 'Format (Optional)'.
interpretAsOverrideFile = os.path.join(BASE_DIR, 'interpret-as.csv')
interpretAsEntries = utils.csv_to_dict(interpretAsOverrideFile)

# Rows used by add_alias_tags; each row is indexed by 'Original Text', 'Alias'
# and 'Case Sensitive (True/False)'.
aliasOverrideFile = os.path.join(BASE_DIR, 'aliases.csv')
aliasEntries = utils.csv_to_dict(aliasOverrideFile)

# URLs used by add_interpretas_tags; iterated as individual strings.
# NOTE(review): presumably one URL per line of the text file - confirm in utils.txt_to_list.
urlListFile = os.path.join(BASE_DIR, 'url_list.txt')
urlList = utils.txt_to_list(urlListFile)

# Rows used by add_phoneme_tags; each row is indexed by 'Text',
# 'Phonetic Pronunciation', 'Phonetic Alphabet' and 'Case Sensitive (True/False)'.
phonemeFile = os.path.join(BASE_DIR, 'Phoneme_Pronunciation.csv')
phonemeEntries = utils.csv_to_dict(phonemeFile)

def add_all_pronunciation_overrides(text):
    """Run every pronunciation-override pass over ``text`` and return the result.

    The passes run in a fixed order - interpret-as (say-as) tags, then alias
    substitutions, then phoneme tags - and each pass receives the output of
    the previous one.
    """
    for applyPass in (add_interpretas_tags, add_alias_tags, add_phoneme_tags):
        text = applyPass(text)
    return text

def add_interpretas_tags(text):
    """Wrap configured words and URL punctuation in SSML ``<say-as>`` tags.

    Two module-level tables drive the tagging:

    * ``interpretAsEntries`` (from interpret-as.csv): every match of an
      entry's text is wrapped in ``<say-as interpret-as="...">``, with a
      ``format`` attribute added when the entry supplies one.
    * ``urlList`` (from url_list.txt): each listed URL found in the text has
      its dots, slashes, colons and top-level-domain suffix wrapped in
      ``<say-as interpret-as="characters">``.

    Args:
        text: The text to annotate.

    Returns:
        The text with the say-as tags inserted.
    """
    # Add interpret-as tags from interpret-as.csv
    for entryDict in interpretAsEntries:
        entryText = entryDict['Text']
        entryInterpretAsType = entryDict['interpret-as Type']
        entryFormat = entryDict['Format (Optional)']

        # A blank case-sensitivity cell means "not case sensitive", matching
        # how add_alias_tags and add_phoneme_tags treat the same column.
        caseSensitiveValue = entryDict['Case Sensitive (True/False)']
        if caseSensitiveValue == "":
            isCaseSensitive = False
        else:
            isCaseSensitive = parseBool(caseSensitiveValue)

        # Create say-as tag
        if entryFormat == "":
            sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}">'
        else:
            sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}" format="{entryFormat}">'

        # Find the word, with optional punctuation after, and optional quotes before or after.
        # NOTE: entryText is inserted into the pattern as-is, so any regex
        # metacharacters in the CSV are interpreted as regex syntax.
        findWordRegex = rf'(\b["\']?{entryText}[.,!?]?["\']?\b)'
        regexFlags = 0 if isCaseSensitive else re.IGNORECASE
        # The replacement uses a group reference, so the regex must stay wrapped in parentheses
        text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text, flags=regexFlags)

    # Add interpret-as tags from url_list.txt
    # This pattern matches the top level domain extension together with the punctuation
    # before/after it, and any periods, slashes or colons. It does not depend on the
    # URL being processed, so it is compiled once instead of once per URL.
    punctuationRegex = re.compile(r'((?:\.[a-z]{2,6}(?:\/|$|\s))|(?:[\.\/:]+))')
    for url in urlList:
        # Put the say-as characters tag around all matches inside the URL
        taggedURL = punctuationRegex.sub(r'<say-as interpret-as="characters">\1</say-as>', url)
        # Replace any instances of the URL with the tagged version
        text = text.replace(url, taggedURL)

    return text

def add_alias_tags(text):
    """Replace every aliased word in ``text`` with its configured alias.

    Entries come from the module-level ``aliasEntries`` table (aliases.csv).
    A blank case-sensitivity cell is treated as case-insensitive. The whole
    regex match - including any optional quote/punctuation characters it
    captured - is replaced by the alias text.
    """
    for row in aliasEntries:
        originalText = row['Original Text']
        aliasText = row['Alias']
        caseCell = row['Case Sensitive (True/False)']
        caseSensitive = parseBool(caseCell) if caseCell != "" else False

        # The word, with optional punctuation after, and optional quotes before or after
        wordPattern = rf'\b["\'()]?{originalText}[.,!?()]?["\']?\b'
        matchFlags = 0 if caseSensitive else re.IGNORECASE
        text = re.sub(wordPattern, rf'{aliasText}', text, flags=matchFlags)
    return text


# Uses the phoneme pronunciation file to add phoneme tags to the text
def add_phoneme_tags(text):
    """Wrap configured words in SSML ``<phoneme>`` tags.

    Entries come from the module-level ``phonemeEntries`` table
    (Phoneme_Pronunciation.csv). Each match of an entry's text is wrapped in
    ``<phoneme alphabet="..." ph="...">`` using the entry's own phonetic
    alphabet and pronunciation. A blank case-sensitivity cell is treated as
    case-insensitive.

    Args:
        text: The text to annotate.

    Returns:
        The text with the phoneme tags inserted.
    """
    for entryDict in phonemeEntries:
        # Get entry info
        entryText = entryDict['Text']
        entryPhoneme = entryDict['Phonetic Pronunciation']
        entryAlphabet = entryDict['Phonetic Alphabet']

        caseSensitiveValue = entryDict['Case Sensitive (True/False)']
        if caseSensitiveValue == "":
            isCaseSensitive = False
        else:
            isCaseSensitive = parseBool(caseSensitiveValue)

        # Find the word, with optional punctuation after, and optional quotes before or after
        findWordRegex = rf'(\b["\'()]?{entryText}[.,!?()]?["\']?\b)'
        regexFlags = 0 if isCaseSensitive else re.IGNORECASE
        # The entry's alphabet is used regardless of case sensitivity; the
        # case-sensitive path previously forced "ipa" and ignored the CSV column.
        phonemeTag = rf'<phoneme alphabet="{entryAlphabet}" ph="{entryPhoneme}">\1</phoneme>'
        text = re.sub(findWordRegex, phonemeTag, text, flags=regexFlags)
    return text

# ================================================== Azure Functions ========================================================= 

def synthesize_text_azure(text, duration, voiceName, languageCode):
    """Synthesize one piece of text with Azure TTS and return its audio stream.

    The text is first run through the user's pronunciation overrides, then
    wrapped in SSML that requests the given clip duration, zero leading and
    trailing silence, and (unless set to 'default') the module's sentence and
    comma pause lengths.

    Args:
        text: Text to speak.
        duration: Desired clip length; written into the SSML with an "ms" suffix.
        voiceName: Azure voice name.
        languageCode: Value used for the SSML ``xml:lang`` attribute.

    Returns:
        A ``speechsdk.AudioDataStream`` built from the synthesis result.
    """
    # Tag for desired duration of clip
    clipLengthTag = f'<mstts:audioduration value="{str(duration)}ms"/>'

    # Sentence / comma pause tags are left out entirely when set to 'default'
    sentenceGapTag = (
        ''
        if azure_sentence_pause == 'default'
        else f'<mstts:silence type="Sentenceboundary-exact" value="{str(azure_sentence_pause)}ms"/>'
    )
    commaGapTag = (
        ''
        if azure_comma_pause == 'default'
        else f'<mstts:silence type="Comma-exact" value="{str(azure_comma_pause)}ms"/>'
    )

    # Tags that set leading and trailing silence times to zero
    leadingTag = '<mstts:silence  type="Leading-exact" value="0ms"/>'
    trailingTag = '<mstts:silence  type="Tailing-exact" value="0ms"/>'

    # Apply pronunciation customization set by user
    spokenText = add_all_pronunciation_overrides(text)

    # Assemble the SSML document for Azure TTS
    ssml = "".join([
        f"<speak version='1.0' xml:lang='{languageCode}' xmlns='http://www.w3.org/2001/10/synthesis' ",
        "xmlns:mstts='http://www.w3.org/2001/mstts'>",
        f"<voice name='{voiceName}'>",
        sentenceGapTag,
        commaGapTag,
        clipLengthTag,
        leadingTag,
        trailingTag,
        f"{spokenText}",
        "</voice></speak>",
    ])

    # For Azure voices, see: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=stt-tts
    # For audio outputs, see: https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechsynthesisoutputformat?view=azure-python
    speechConfig = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
    speechConfig.speech_synthesis_voice_name = voiceName
    speechConfig.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)

    ttsClient = speechsdk.SpeechSynthesizer(speech_config=speechConfig, audio_config=None)
    # Block until the asynchronous SSML synthesis has finished
    synthesisResult = ttsClient.speak_ssml_async(ssml).get()

    return speechsdk.AudioDataStream(synthesisResult)

def format_percentage_change(speedFactor):
    """Convert a speed multiplier into a rate string for Azure TTS.

    Returns 'default' when the factor is exactly 1.0. Otherwise returns the
    relative change as a percentage rounded to 5 decimal places, e.g.
    1.5 -> '+50.0%' and 0.5 -> '-50.0%'.
    """
    if speedFactor == 1.0:
        return 'default'

    # Relative change versus normal speed, expressed in percent
    percentChange = round((speedFactor - 1.0) * 100, 5)
    # Increases need an explicit plus sign; a decrease already carries its minus sign
    signPrefix = '+' if speedFactor >= 1.0 else ''
    return signPrefix + str(percentChange) + '%'

def synthesize_text_azure_batch(subsDict, langDict, skipSynthesize=False, secondPass=False):
    """Synthesize all subtitle entries through the Azure batch synthesis API.

    The entries are converted to SSML and packed, in dictionary order, into
    one or more request payloads that stay under the batch size limits. Each
    payload is submitted as its own job. When a job succeeds its result zip
    is downloaded, the audio files are extracted into ``workingFolder`` as
    ``<key>.mp3`` and the path is stored under ``'TTS_FilePath'`` in the
    matching entry. Audio files are matched to entries by sorting the zip's
    file names and pairing them with the keys of that payload in order.

    Args:
        subsDict: Mapping of subtitle key to an entry dict. Each entry must
            provide ``'translated_text'`` and ``'duration_ms_buffered'``.
            Entries are updated in place.
        langDict: Must provide ``'languageCode'`` and ``'voiceName'``.
        skipSynthesize: Accepted for signature compatibility; not used here.
        secondPass: Only selects the file name of the debug copy of the
            downloaded result zip.

    Returns:
        The same ``subsDict``. Entries belonging to a job that failed or could
        not be submitted are left without a new ``'TTS_FilePath'``.

    Raises:
        ValueError: If a single entry is too large to fit into one request.
    """
    # Azure TTS Batch requests require the payload to be under 500 kilobytes and the
    # number of inputs to be below 1000. Leave some room for anything unexpected.
    maxPayloadBytes = 495000
    maxPayloadInputs = 995

    def build_ssml(entry):
        """Return the SSML document for one subtitle entry."""
        # Tag for desired duration of clip
        durationTag = f'<mstts:audioduration value="{str(entry["duration_ms_buffered"])}ms"/>'

        # String for sentence pauses, if not default
        if azure_sentence_pause != 'default':
            sentencePauseTag = f'<mstts:silence type="Sentenceboundary-exact" value="{str(azure_sentence_pause)}ms"/>'
        else:
            sentencePauseTag = ''

        # String for comma pauses, if not default
        if azure_comma_pause != 'default':
            commaPauseTag = f'<mstts:silence type="Comma-exact" value="{str(azure_comma_pause)}ms"/>'
        else:
            commaPauseTag = ''

        # Tags to set leading and trailing silence times to zero
        leadSilenceTag = '<mstts:silence  type="Leading-exact" value="0ms"/>'
        tailSilenceTag = '<mstts:silence  type="Tailing-exact" value="0ms"/>'

        # Process text using pronunciation customization set by user
        text = add_all_pronunciation_overrides(entry['translated_text'])

        return f"<speak version='1.0' xml:lang='{langDict['languageCode']}' xmlns='http://www.w3.org/2001/10/synthesis' " \
            "xmlns:mstts='http://www.w3.org/2001/mstts'>" \
            f"<voice name='{langDict['voiceName']}'>{sentencePauseTag}{commaPauseTag}{durationTag}{leadSilenceTag}{tailSilenceTag}" \
            f"{text}</voice></speak>"

    def build_payload(ssmlInputs):
        """Return the request payload dict wrapping the given list of SSML inputs."""
        now = datetime.datetime.now()
        return {
            'displayName': langDict['languageCode'] + '-' + now.strftime("%Y-%m-%d %H:%M:%S"),
            'description': 'Batch synthesis of ' + langDict['languageCode'] + ' subtitles',
            "textType": "SSML",
            # To use custom voice, see original example code script linked from azure_batch.py
            "inputs": ssmlInputs,
            "properties": {
                "outputFormat": "audio-48khz-192kbitrate-mono-mp3",
                "wordBoundaryEnabled": False,
                "sentenceBoundaryEnabled": False,
                "concatenateResult": False,
                "decompressOutputFiles": False
            },
        }

    def create_request_payload(pendingKeys):
        """Pack as many leading keys as fit into one payload; return (payload, includedKeys)."""
        ssmlJson = []
        includedKeys = []
        for key in pendingKeys:
            ssmlJson.append({"text": build_ssml(subsDict[key])})

            # The payload is sent as JSON, so measure the serialized form; measuring
            # anything else would give an inaccurate size.
            payloadSizeInBytes = len(json.dumps(build_payload(ssmlJson)).encode('utf-8'))
            if payloadSizeInBytes > maxPayloadBytes or len(ssmlJson) > maxPayloadInputs:
                # This entry does not fit: drop it again and leave it for the next payload
                ssmlJson.pop()
                break
            includedKeys.append(key)

        if not includedKeys:
            # Without this guard a single oversized entry could never be placed in any payload
            raise ValueError(
                f'Subtitle entry {pendingKeys[0]!r} is too large to fit in a single Azure batch synthesis request.'
            )
        return build_payload(ssmlJson), includedKeys
    # ------------------------- End create_request_payload() -----------------------------------

    # Create payloads, split into multiple if necessary. Each payload remembers which
    # subtitle keys it contains so that results can be mapped back reliably.
    payloadList = []
    pendingKeys = list(subsDict.keys())
    while pendingKeys:
        payloadToAppend, includedKeys = create_request_payload(pendingKeys)
        payloadList.append((payloadToAppend, includedKeys))
        pendingKeys = pendingKeys[len(includedKeys):]

    # Tell user if request will be broken up into multiple payloads
    if len(payloadList) > 1:
        print(f'Payload will be broken up into {len(payloadList)} requests (due to Azure size limitations).')

    # Make sure workingFolder exists, then clear it out (kept as-is in debug mode)
    os.makedirs('workingFolder', exist_ok=True)
    if not debug_mode:
        for filename in os.listdir('workingFolder'):
            os.remove(os.path.join('workingFolder', filename))

    # Loop through payloads and submit to Azure
    for payload, payloadKeys in payloadList:
        job_id = azure_batch.submit_synthesis(payload)
        if job_id is None:
            print('ERROR: Batch synthesis job could not be submitted!')
            continue

        # Wait for job to finish
        resultDownloadLink = None
        while True: # Must use break to exit loop
            response = azure_batch.get_synthesis(job_id)
            jobInfo = response.json()
            status = jobInfo['status']
            if status == 'Succeeded':
                print('Batch synthesis job succeeded')
                resultDownloadLink = jobInfo['outputs']['result']
                break
            elif status == 'Failed':
                print('ERROR: Batch synthesis job failed!')
                print("Reason:" + response.reason)
                break
            else:
                print(f'Waiting for Azure batch synthesis job to finish. Status: [{status}]')
                time.sleep(5)

        if resultDownloadLink is None:
            # Nothing to download; the keys of this payload are simply skipped so
            # later payloads still map onto their own entries.
            continue

        # Download the resulting zip file once and close the connection afterwards
        with urlopen(resultDownloadLink) as urlResponse:
            zipBytes = urlResponse.read()

        # If debug mode, also save the zip file to disk
        if debug_mode:
            zipName = 'azureBatchPass2.zip' if secondPass else 'azureBatch.zip'
            with open(os.path.join('workingFolder', zipName), 'wb') as f:
                f.write(zipBytes)

        # Process zip file
        with zipfile.ZipFile(io.BytesIO(zipBytes)) as zipdata:
            # Only the audio files are needed (summary.json and any other json files are
            # skipped); sort them so the file names are in alphanumeric order.
            audioFiles = sorted(
                (info for info in zipdata.infolist() if "json" not in info.filename),
                key=lambda info: info.filename,
            )
            if len(audioFiles) != len(payloadKeys):
                print(f'WARNING: Expected {len(payloadKeys)} audio files from Azure but received {len(audioFiles)}.')

            for currentFileNum, zipEntry in zip(payloadKeys, audioFiles):
                # Rename the member so it is extracted under the subtitle's key
                zipEntry.filename = str(currentFileNum) + '.mp3'
                # Add file path to subsDict
                subsDict[currentFileNum]['TTS_FilePath'] = os.path.join('workingFolder', str(currentFileNum)) + '.mp3'
                zipdata.extract(zipEntry, 'workingFolder')

    return subsDict


def synthesize_dictionary_batch(subsDict, langDict, skipSynthesize=False, secondPass=False):
    """Batch-synthesize ``subsDict`` via Azure unless synthesis is skipped.

    When ``skipSynthesize`` is true the dictionary is returned untouched;
    otherwise the result of ``synthesize_text_azure_batch`` is returned.
    """
    if skipSynthesize:
        return subsDict
    return synthesize_text_azure_batch(subsDict, langDict, skipSynthesize, secondPass)

def synthesize_dictionary(subsDict, langDict, outputFolder, skipSynthesize=False, secondPass=False):
    """Synthesize each subtitle entry individually and record its audio file path.

    For every entry the target path ``<outputFolder>/workingFolder/<key>.mp3``
    is stored under ``'TTS_FilePath'``. Unless ``skipSynthesize`` is set, the
    entry's ``'translated_text'`` is synthesized (only when the module's
    ``tts_service`` is "azure") and saved to that path. Progress is printed
    on a single, repeatedly overwritten console line.

    Args:
        subsDict: Mapping of subtitle key to entry dict; updated in place.
        langDict: Must provide ``'voiceName'`` and ``'languageCode'`` when
            synthesizing.
        outputFolder: Folder under which ``workingFolder`` is located.
        skipSynthesize: When true, only the file paths are recorded.
        secondPass: Changes the progress label and, in debug mode, the
            suffix of the extra debug copy (``_p2`` instead of ``_p1``).

    Returns:
        The same ``subsDict``.
    """
    workingFolder = os.path.join(outputFolder, 'workingFolder')

    # Prepare output location once, up front. exist_ok replaces the old
    # exists()-then-makedirs() check that was repeated for every entry.
    if not skipSynthesize:
        try:
            os.makedirs(workingFolder, exist_ok=True)
        except OSError:
            print("Error creating directory")

    progressLabel = "Synthesizing TTS Line (2nd Pass)" if secondPass else "Synthesizing TTS Line"
    totalLines = len(subsDict)

    # enumerate supplies the running position directly, instead of searching
    # the key list on every iteration.
    for lineNumber, (key, value) in enumerate(subsDict.items(), start=1):
        # TTS each subtitle text, write to file, write filename into dictionary
        filePath = os.path.join(workingFolder, f'{str(key)}.mp3')
        filePathStem = os.path.join(workingFolder, f'{str(key)}')

        # If Azure TTS, use Azure API
        if not skipSynthesize and tts_service == "azure":
            # Audio variable is an AudioDataStream object
            audio = synthesize_text_azure(
                value['translated_text'],
                value['duration_ms_buffered'],
                langDict['voiceName'],
                langDict['languageCode'],
            )
            # Save to file using save_to_wav_file method of audio object
            audio.save_to_wav_file(filePath)

            # If debug mode, keep an extra copy labelled with the pass number
            if debug_mode:
                audio.save_to_wav_file(filePathStem + ("_p2.mp3" if secondPass else "_p1.mp3"))

        value['TTS_FilePath'] = filePath

        # Print progress and overwrite line next time
        print(f" {progressLabel}: {lineNumber} of {totalLines}", end="\r")
    print("                                               ") # Clear the line
    return subsDict