Spaces:

guymorlan
/

levanti_en_ar

Running

File size: 3,459 Bytes

e35836c

#%%
import azure.cognitiveservices.speech as speechsdk
import re
import os
import hashlib
import random
from dotenv import load_dotenv
load_dotenv(".env")

# Startup diagnostics: confirm the Azure credentials were loaded from the
# environment. Only the *presence* of the key is reported -- echoing the
# secret itself would leak it into stdout / hosted application logs.
print("SPEECH_KEY is set" if os.environ.get('SPEECH_KEY') else "SPEECH_KEY is missing")
print(os.environ.get('SPEECH_REGION'))
# Module-wide synthesis configuration shared by every get_audio() call.
speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'),
                                       region=os.environ.get('SPEECH_REGION'))

def do_cleanup(dir='wavs', num_files=100):
    """Trim a cache directory so that at most ``num_files`` files remain.

    The oldest files (by modification time) are deleted first, so recently
    written entries survive. ``os.listdir`` returns names in arbitrary
    order, so without the explicit sort the files removed would be
    arbitrary too.

    Args:
        dir: Directory to trim. Only regular files directly inside it are
            counted and removed; sub-directories are left alone.
        num_files: Maximum number of files to keep.

    Raises:
        FileNotFoundError: If ``dir`` does not exist.
    """
    candidates = [os.path.join(dir, name) for name in os.listdir(dir)]
    files = [path for path in candidates if os.path.isfile(path)]

    excess = len(files) - num_files
    if excess <= 0:
        return

    # Oldest first; the path is a tie-breaker so the result is deterministic
    # when several files share the same timestamp.
    files.sort(key=lambda path: (os.path.getmtime(path), path))
    for path in files[:excess]:
        os.remove(path)

def add_sukun(text):
    """Append a sukun to the final letter of every Arabic word in ``text``.

    The text is split on whitespace. For each token, any run of trailing
    punctuation is set aside, and if the remaining part ends in an Arabic
    letter (or a shadda) a sukun is inserted right after it, before the
    punctuation. Tokens that end in anything else -- an existing sukun,
    another diacritic, a Latin letter, a digit -- are left unchanged.

    Args:
        text: Input string, possibly mixing Arabic and other scripts.

    Returns:
        The processed tokens joined by single spaces (so runs of whitespace
        in the input are collapsed and leading/trailing whitespace dropped).
    """
    # Arabic letters that may receive a sukun; shadda is included so that a
    # doubled final consonant is also marked.
    arabic_letters = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهوي'
    shadda = '\u0651'  # ARABIC SHADDA
    arabic_letters += shadda
    sukun = '\u0652'  # ARABIC SUKUN
    punctuation = '.,;!?،؛؟'

    def process_word(word):
        # Strip the *whole* trailing punctuation run so that "word!!" or
        # "word..." are handled like "word!" / "word.".
        core = word.rstrip(punctuation)
        if not core or core[-1] not in arabic_letters:
            return word
        return core + sukun + word[len(core):]

    # Every token is a maximal run of non-whitespace characters; punctuation
    # stays attached to its word and is dealt with in process_word.
    words = re.findall(r'\S+', text)
    return ' '.join(process_word(word) for word in words)

def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural'):
    """Wrap ``text`` in an SSML document that reads it as ``ar-SA``.

    Args:
        text: Plain text to be spoken. XML special characters (``&``, ``<``,
            ``>``) are escaped so they cannot break the SSML document.
        voice: Name placed in the ``<voice name="...">`` attribute.

    Returns:
        The SSML document as a single-line string.
    """
    from xml.sax.saxutils import escape

    safe_text = escape(text)
    # Attribute value is double-quoted, so the double quote must be escaped too.
    safe_voice = escape(voice, {'"': '&quot;'})
    return f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="ar-SA"><voice name="{safe_voice}"><lang xml:lang="ar-SA">{safe_text}</lang></voice></speak>'


def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
    """Synthesize ``input_text`` to a WAV file under ``wavs/`` and return its path.

    The text is first passed through ``add_sukun``. Results are cached on
    disk: the file name is the MD5 of the processed text (plus the voice
    when SSML is used), and an existing file is returned without calling
    the speech service again.

    Args:
        input_text: Text to speak.
        voice: Voice name used in the SSML document. It has no effect when
            ``use_ssml`` is false, because plain-text synthesis uses
            whatever voice ``speech_config`` selects.
        use_ssml: If true, synthesize via ``get_ssml``/``speak_ssml_async``;
            otherwise via ``speak_text_async``.

    Returns:
        The path ``wavs/<md5>.wav``. If synthesis did not complete, the
        incomplete file is removed (so it is not served as a cache hit
        later) and the path is still returned; the failure is reported on
        stdout.
    """
    input_text = add_sukun(input_text)

    # The voice only influences the audio in SSML mode, so only then is it
    # part of the cache key. add_sukun() joins tokens with single spaces, so
    # the processed text never contains "\n" and the two key shapes cannot
    # collide.
    cache_key = f"{voice}\n{input_text}" if use_ssml else input_text
    digest = hashlib.md5(cache_key.encode()).hexdigest()
    wav_path = f"wavs/{digest}.wav"

    if os.path.exists(wav_path):
        return wav_path

    # The output directory must exist before the SDK can write into it.
    os.makedirs("wavs", exist_ok=True)

    # randomly every 50 calls, clean up the wavs folder. This runs *before*
    # synthesis so the file produced by this call can never be the one that
    # gets deleted.
    if random.randint(1, 50) == 1:
        do_cleanup()

    audio_config = speechsdk.audio.AudioOutputConfig(filename=wav_path)
    # speech_config.speech_synthesis_voice_name=voice
    # speech_config.speech_synthesis_language = "ar-EG"
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
    )

    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                                     audio_config=audio_config)
    if use_ssml:
        # print("Using SSML")
        ssml = get_ssml(input_text, voice=voice)
        result = speech_synthesizer.speak_ssml_async(ssml).get()
    else:
        # print("Using text")
        result = speech_synthesizer.speak_text_async(input_text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}]".format(input_text))
        return wav_path

    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    # Synthesis did not complete: drop whatever was written so the next call
    # retries instead of returning a broken cached file.
    # NOTE(review): the references are released first on the assumption that
    # the SDK closes the output file when they go away -- confirm.
    del speech_synthesizer, audio_config
    try:
        os.remove(wav_path)
    except OSError:
        # Best effort: the file may never have been created, or may still be
        # held open; neither case should mask the failure reported above.
        pass

    return wav_path