File size: 2,768 Bytes
455c922 23e6066 57d76c0 4328747 23e6066 4328747 57d76c0 086c039 60a4fae 086c039 21c7503 299936d 21c7503 9a43531 086c039 ab9af27 086c039 60af963 086c039 00c4b3b 086c039 f139b27 cddc3e0 16fcae1 cddc3e0 f139b27 e6757ef 23e6066 f572cd2 23e6066 4328747 455c922 54c9038 4328747 7ad56ab 42acf0b 52dc6f4 42acf0b 455c922 908a29b fdf341a 21e7589 a3419d8 908a29b cddc3e0 fdf341a e6f3aba fdf341a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
from functools import lru_cache

import gradio as gr
import numpy as np
import soundfile as sf
from arabic_pronounce import phonetise
from espnet2.bin.tts_inference import Text2Speech
from scipy.io import wavfile
# User-facing strings rendered by Gradio (description is Markdown).
# Fixes vs. original: stray leading space in title, "dicritized" -> "diacritized",
# "to synthesis" -> "to synthesize", duplicated "project project", and a broken
# Markdown link with empty link text.
title = "Tunisian Text To Speech"
description = """
This is a demo for our Tunisian TTS system. You can write your diacritized Tunisian text to synthesize the corresponding speech.
This project was developed with the purpose of bridging the gap between high-resource and low-resource languages.
If you need help, feel free to drop an email here :
fethi.bougares@elyadata.com
rami.kammoun@algobrain.ai
imen.laouirine@elyadata.com
Authors :
* [Imen Laouirine](https://www.linkedin.com/in/imen-laouirine-9a557b209)
* [Rami Kammoun](https://www.linkedin.com/in/rami-kammoun/)
* [Fethi Bougares](https://www.linkedin.com/in/fethi-bougares/)
More implementation details could be found [here](https://github.com/elyadata/TunArTTS/tree/develop)
More in-depth details and insights are available in a released preprint. Please find the paper [here](paper_link).
If you use or refer to this model, please cite :
"""
# Example prompts shown in the Gradio UI (diacritized Tunisian Arabic).
# NOTE(review): each example string was rejoined onto a single line — the
# original had string literals split mid-word across physical lines (an
# extraction/paste artifact that is not valid Python). Character content
# is preserved as-is; verify rendering against the original corpus.
examples = [
    ["ูููููููููู ููุดูุงุนูุฑู ูููููููุฉ ูููููููููู ููุดูุงุนูุฑู ูููููููุฉ"],
    ["ุณูุงุฑููู ููููู ููุฏููู ุดูููุนูุฉ"],
    ["ุตูุงูู ููููุงุฑู ูููู ุฑูููุถูุงููุ ููุงูู ุงูุนููุฏู ุขุดู ููุงุฒูุงูููุ"],
    ["ุถูุญููููููู ุชูููุฏู ุนูููู ุทููููู"],
    ["ุนูุงุฑููู ููุฎููููู ููููุตูููุญู ููููุงูู"],
]
def text_to_phoneme(tun_text):
    """Convert diacritized Tunisian text to a space-separated phoneme string.

    Each space-delimited word is phonetised independently and the first
    pronunciation variant returned by `phonetise` is used. The result is
    wrapped in "sil" (silence) markers, e.g. "sil p h o n e m e s sil".

    Replaces the original's quadratic `+=` string building over
    `range(len(...))` with a single `str.join`; output is identical
    (split(" ") is kept deliberately to preserve behavior on runs of spaces).
    """
    phonemes = " ".join(phonetise(word)[0] for word in tun_text.split(" "))
    return f"sil {phonemes} sil"
@lru_cache(maxsize=1)
def _load_tts():
    """Load the acoustic model + Parallel WaveGAN vocoder exactly once.

    Model loading is expensive (reads two checkpoints from disk); the
    original reloaded both on every synthesis request.
    """
    return Text2Speech.from_pretrained(
        model_file="exp/tts_train_conformer_fastspeech2_raw_phn_none/train.loss.ave_5best.pth",
        vocoder_file="train_tun_parallel_wavegan.v3/checkpoint-560000steps.pkl",
    )


def generate_tts(input_text):
    """Synthesize `input_text` and write the waveform to 'output.wav'.

    Bug fix: `text_to_phoneme` already wraps its result in "sil ... sil",
    but the original wrapped it a second time (f"sil {text} sil"), so the
    model received doubled silence markers at both ends. The phonemized
    text is now passed through unmodified.
    """
    phonemized_text = text_to_phoneme(input_text)
    tts = _load_tts()
    wav = tts(phonemized_text)["wav"]
    # 22050 Hz — presumably the vocoder's training sample rate; TODO confirm
    sf.write('output.wav', wav.numpy(), samplerate=22050)
def generate_audio(inputs):
    """Gradio callback: synthesize `inputs` and return (sample_rate, samples).

    Delegates synthesis to `generate_tts`, which writes 'output.wav',
    then reads that file back in the (rate, ndarray) form Gradio's
    audio output component expects.
    """
    generate_tts(inputs)
    sample_rate, samples = wavfile.read("output.wav")
    return sample_rate, samples
# Build the Gradio UI: a single text box in, synthesized audio out.
demo = gr.Interface(
    fn=generate_audio,
    inputs=gr.Text(label="Input Text"),
    outputs="audio",
    title=title,
    description=description,
    examples=examples,
)

if __name__ == "__main__":
    demo.launch()