# Spaces:
# Running
# Running
#%%
import hashlib
import os
import random
import re
import xml.sax.saxutils

import azure.cognitiveservices.speech as speechsdk
# Shared Azure Speech configuration, built once at import time. The
# subscription key and region are read from the SPEECH_KEY and SPEECH_REGION
# environment variables (each is None when the variable is unset).
speech_config = speechsdk.SpeechConfig(
    subscription=os.getenv('SPEECH_KEY'),
    region=os.getenv('SPEECH_REGION'),
)
def do_cleanup(dir='wavs', num_files=100):
    """Trim *dir* so that at most *num_files* regular files remain.

    The oldest files (by modification time) are deleted first, so the most
    recently written files survive. Sub-directories are ignored, and a
    missing directory is treated as already clean.

    Args:
        dir: Directory to trim.
        num_files: Maximum number of files to keep.
    """
    if not os.path.isdir(dir):
        return

    candidates = [os.path.join(dir, name) for name in os.listdir(dir)]
    files = [path for path in candidates if os.path.isfile(path)]

    excess = len(files) - num_files
    if excess <= 0:
        return

    # os.listdir() returns entries in arbitrary order, so sort explicitly to
    # make sure it is the oldest files that get removed.
    files.sort(key=os.path.getmtime)
    for path in files[:excess]:
        os.remove(path)
def add_sukun(text):
    """Append a sukun to every word of *text* that ends in a bare Arabic letter.

    The text is split on whitespace. For each word, any run of trailing
    punctuation is set aside; if the remaining part ends in an Arabic letter
    (or a shadda), a sukun is inserted after it and the punctuation is put
    back. Words that end in anything else (Latin text, digits, an existing
    diacritic such as a sukun) are left unchanged.

    Args:
        text: Input string.

    Returns:
        The processed words joined by single spaces. Runs of whitespace are
        therefore collapsed and leading/trailing whitespace is dropped.
    """
    # Define Arabic letters and sukun
    arabic_letters = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهوي'
    shadda = 'ّ'
    # A trailing shadda is treated like a letter: it also receives a sukun.
    arabic_letters += shadda
    sukun = 'ْ'
    punctuation = '.,;!?،؛؟'

    def process_word(word):
        # Separate the word from ALL trailing punctuation ("...", "!!", "؟!")
        # so the sukun lands on the last letter rather than being skipped.
        core = word.rstrip(punctuation)
        trailing = word[len(core):]
        # The sukun itself is not in arabic_letters, so a word that already
        # ends in one is never given a second.
        if core and core[-1] in arabic_letters:
            return core + sukun + trailing
        return word

    # Punctuation is non-whitespace, so it always stays attached to its word.
    words = re.findall(r'\S+', text)
    return ' '.join(process_word(word) for word in words)
def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural'):
    """Wrap *text* in an SSML document spoken by *voice* in ar-SA.

    Both values are XML-escaped before interpolation, so characters such as
    ``&``, ``<`` or ``"`` in the input cannot produce malformed SSML or
    inject markup. *text* is therefore always treated as plain text, never as
    SSML markup.

    Args:
        text: Plain text to be spoken.
        voice: Name placed in the ``<voice name="...">`` attribute.

    Returns:
        The SSML document as a single-line string.
    """
    safe_text = xml.sax.saxutils.escape(text)
    # The attribute value is double-quoted, so double quotes must be escaped too.
    safe_voice = xml.sax.saxutils.escape(voice, {'"': '&quot;'})
    return (
        '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="ar-SA">'
        f'<voice name="{safe_voice}"><lang xml:lang="ar-SA">{safe_text}</lang></voice></speak>'
    )
def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
    """Synthesize *input_text* to a WAV file and return the file's path.

    The text is first passed through ``add_sukun``. Results are cached on
    disk under ``wavs/``: the file name is the MD5 of the synthesis mode, the
    voice (SSML mode only) and the processed text, so a repeated request
    returns the existing file without calling the speech service.

    Args:
        input_text: Text to speak.
        voice: Voice name used in the generated SSML. It is not applied when
            ``use_ssml`` is false.
        use_ssml: When true, synthesize from SSML built by ``get_ssml``;
            otherwise synthesize the processed text directly.

    Returns:
        The path ``wavs/<md5>.wav``. The path is returned even when the
        service reports that synthesis was canceled; the outcome is only
        printed.
    """
    input_text = add_sukun(input_text)

    # The voice only affects the audio on the SSML path, so it is part of the
    # cache key there. Without it, a request for one voice would be answered
    # with audio cached for another.
    key_parts = ("ssml", voice, input_text) if use_ssml else ("text", input_text)
    digest = hashlib.md5("\0".join(key_parts).encode()).hexdigest()
    wav_path = f"wavs/{digest}.wav"
    if os.path.exists(wav_path):
        return wav_path

    # Make sure the output directory exists before the SDK writes to it.
    os.makedirs("wavs", exist_ok=True)

    # randomly every 50 calls, clean up the wavs folder. This runs before
    # synthesis so the cleanup can never delete the file about to be returned.
    if random.randint(1, 50) == 1:
        do_cleanup()

    audio_config = speechsdk.audio.AudioOutputConfig(filename=wav_path)
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
    )
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    if use_ssml:
        ssml = get_ssml(input_text, voice=voice)
        result = speech_synthesizer.speak_ssml_async(ssml).get()
    else:
        result = speech_synthesizer.speak_text_async(input_text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Speech synthesized for text [{input_text}]")
    elif result.reason == speechsdk.ResultReason.Canceled:
        # NOTE(review): a canceled synthesis may leave a file at wav_path that
        # later calls would treat as a cache hit - confirm against SDK behavior.
        cancellation_details = result.cancellation_details
        print(f"Speech synthesis canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")

    return wav_path