#%%
import azure.cognitiveservices.speech as speechsdk
import re
import os
import hashlib
import random
from dotenv import load_dotenv

load_dotenv(".env")
print(os.environ.get('SPEECH_KEY'))
print(os.environ.get('SPEECH_REGION'))

speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'),
                                       region=os.environ.get('SPEECH_REGION'))

# Make sure the cache directory for synthesized audio exists.
os.makedirs("wavs", exist_ok=True)


def do_cleanup(dir='wavs', num_files=100):
    # Keep at most `num_files` files in `dir`, deleting the oldest first.
    files = sorted(os.listdir(dir), key=lambda f: os.path.getmtime(f"{dir}/{f}"))
    if len(files) > num_files:
        for file in files[:len(files) - num_files]:
            os.remove(f"{dir}/{file}")


def add_sukun(text):
    # Define Arabic letters and sukun
    arabic_letters = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهوي'
    shadda = 'ّ'
    arabic_letters += shadda
    sukun = 'ْ'
    punctuation = '.,;!?،؛؟'

    def process_word(word):
        # If the last character is punctuation, process the letter before it
        if word[-1] in punctuation:
            if len(word) > 1 and word[-2] in arabic_letters and word[-2] != sukun:
                return word[:-2] + word[-2] + sukun + word[-1]
            return word
        # If the last character is an Arabic letter and does not have a sukun, add one
        elif word[-1] in arabic_letters and word[-1] != sukun:
            return word + sukun
        return word

    # Use regex to split text into words and punctuation
    words = re.findall(r'\S+|[.,;!?،؛؟]', text)
    processed_text = ' '.join(process_word(word) for word in words)
    return processed_text


def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural'):
    # Wrap the text in a minimal SSML document for the requested voice.
    # The xml:lang on <speak> is nominal here; the multilingual voice handles the language.
    return (
        "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>"
        f"<voice name='{voice}'>{text}</voice>"
        "</speak>"
    )


def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
    input_text = add_sukun(input_text)
    hash = hashlib.md5(input_text.encode()).hexdigest()
    # Return the cached file if this exact text has already been synthesized.
    if os.path.exists(f"wavs/{hash}.wav"):
        return f"wavs/{hash}.wav"
    audio_config = speechsdk.audio.AudioOutputConfig(filename=f"wavs/{hash}.wav")
    # speech_config.speech_synthesis_voice_name = voice
    # speech_config.speech_synthesis_language = "ar-EG"
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
    )
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    if use_ssml:
        # print("Using SSML")
        ssml = get_ssml(input_text, voice=voice)
        result = speech_synthesizer.speak_ssml_async(ssml).get()
    else:
        # print("Using text")
        result = speech_synthesizer.speak_text_async(input_text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}]".format(input_text))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    # Randomly, about every 50 calls, clean up the wavs folder.
    if random.randint(1, 50) == 1:
        do_cleanup()
    return f"wavs/{hash}.wav"
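
#%%
# Quick smoke test of the pipeline above: adds sukun to a short Arabic sentence,
# synthesizes it, and prints the cached wav path. The sample sentence and voice
# below are illustrative choices, not part of the pipeline itself; running this
# cell requires SPEECH_KEY and SPEECH_REGION to be set in .env.
sample_text = "مرحبا بكم"
wav_path = get_audio(sample_text, voice='de-DE-SeraphinaMultilingualNeural', use_ssml=True)
print(f"Audio written to {wav_path}")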