File size: 4,185 Bytes
1547a56
 
 
006edc3
1547a56
 
 
 
 
 
 
 
 
 
 
 
4e8e8e3
 
c331458
 
1547a56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
006edc3
 
1547a56
 
 
 
 
 
 
 
c331458
 
1547a56
0c28d14
1547a56
ea38bcc
 
c331458
 
ea38bcc
 
 
 
c331458
 
 
 
 
 
27f7d09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c331458
 
 
a4f8653
c331458
 
 
1547a56
c331458
1547a56
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import torch
import gradio as gr
import numpy as np
import os.path as op
import pyarabic.araby as araby

from artst.tasks.artst import ArTSTTask
from transformers import SpeechT5HifiGan
from artst.models.artst import ArTSTTransformerModel
from fairseq.tasks.hubert_pretraining import LabelEncoder
from fairseq.data.audio.speech_to_text_dataset import get_features_or_waveform 

# Device used for the vocoder below: GPU when one is available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted
# source. map_location="cpu" lets a checkpoint that was saved with CUDA tensors
# load on a CPU-only host; the acoustic model is never moved to `device` in
# this file, so CPU tensors are what load_state_dict needs anyway.
checkpoint = torch.load('ckpts/clartts_tts.pt', map_location="cpu")

# Override the configuration stored in the checkpoint so the task is set up
# for text-to-speech ('t2s') and reads its tokenizer/data from the local
# utils/ directory.
checkpoint['cfg']['task'].t5_task = 't2s'
checkpoint['cfg']['task'].bpe_tokenizer = "utils/arabic.model"
checkpoint['cfg']['task'].data = "utils/"
# NOTE(review): mask_prob is forced to 0.5 on both configs; the reason is not
# visible here -- presumably required by the model/task constructors. Confirm.
checkpoint['cfg']['model'].mask_prob = 0.5
checkpoint['cfg']['task'].mask_prob = 0.5
task = ArTSTTask.setup_task(checkpoint['cfg']['task'])

# Default speaker-embedding file used by inference().
emb_path = 'embs/clartts.npy'
model = ArTSTTransformerModel.build_model(checkpoint['cfg']['model'], task)
model.load_state_dict(checkpoint['model'])

# Replace the tokenizer path in the task config with the tokenizer object built
# by the task, and expose that object as `tokenizer`.
checkpoint['cfg']['task'].bpe_tokenizer = task.build_bpe(checkpoint['cfg']['model'])
tokenizer = checkpoint['cfg']['task'].bpe_tokenizer

# Encodes tokenized text using the task's 'text' dictionary.
processor = LabelEncoder(task.dicts['text'])

# Pretrained HiFi-GAN vocoder, placed on `device`; it is applied to the
# acoustic model's output in inference().
vocoder = SpeechT5HifiGan.from_pretrained('microsoft/speecht5_hifigan').to(device)

def get_embs(emb_path):
    """Load the speaker embedding stored at ``emb_path``.

    The array returned by ``get_features_or_waveform`` is converted to a
    float32 torch tensor and given a new leading axis of size 1.
    """
    raw_embedding = get_features_or_waveform(emb_path)
    as_tensor = torch.from_numpy(raw_embedding).float()
    return as_tensor.unsqueeze(0)

def process_text(text):
    """Turn raw input text into the token tensor fed to the model.

    Diacritics are stripped, the result is encoded with the module-level
    ``tokenizer``, passed through ``processor``, and reshaped to a single row
    (shape ``(1, -1)``).
    """
    undiacritized = araby.strip_diacritics(text)
    pieces = tokenizer.encode(undiacritized)
    encoded = processor(pieces)
    return encoded.reshape(1, -1)

# Kept so that code referring to this module-level name keeps working.
# inference() builds a fresh dict per call instead of writing here, so that
# concurrent requests cannot overwrite each other's inputs.
net_input = {}

def inference(text, spkr=emb_path):
    """Synthesize speech for ``text`` and return it in Gradio's numpy format.

    Args:
        text: Input text. ``None``, empty, or whitespace-only text yields an
            empty waveform.
        spkr: Path to the speaker-embedding file passed to ``get_embs``.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is an ``int16`` numpy array.
    """
    sample_rate = 16000
    if not text or not text.strip():
        return (sample_rate, np.zeros(0).astype(np.int16))

    # Per-call input dict: no state is shared between requests.
    request_input = {
        'src_tokens': process_text(text),
        'spkembs': get_embs(spkr),
    }

    # Synthesis only; gradients are never needed here.
    with torch.no_grad():
        outs, _, _ = task.generate_speech(
            [model],
            request_input,
        )
        gen_audio = vocoder(outs.to(device))

    # Scale the vocoder's float output to 16-bit PCM.
    speech = (gen_audio.cpu().numpy() * 32767).astype(np.int16)
    return (sample_rate, speech)

# Input widget: a right-to-left text box (at most two visible lines).
text_box = gr.Textbox(
    label="Arabic Text",
    max_lines=2,
    rtl=True,
)
# Output widget: audio player fed with the numpy result of inference().
out = gr.Audio(
    type="numpy",
    label="Synthesized Audio",
)
# Static text shown in the demo UI.
title = "ArTST: Arabic Speech Synthesis"

# Implicit string concatenation is used instead of a backslash continuation
# inside the literal, which embedded the next line's indentation in the text.
description = (
    "ArTST: Arabic text and speech transformer based on the T5 transformer. "
    "This space demonstrates the TTS checkpoint finetuned on "
    "the Classical Arabic Text-To-Speech (CLARTTS) dataset. "
    "The model is pre-trained on the MGB-2 dataset."
)

# Example sentences offered in the UI. These were stored as mojibake (UTF-8
# Arabic decoded with the wrong code page) and are restored to Arabic here.
examples = [
    "لأن فراق المألوف في العادة ومجانبة ما صار متفقا عليه بالمواضعة",
    "ومن لطيف حكمته أن جعل لكل عبادة حالتين",
    "فمن لهم عدل الإنسان مع من فوقه",
]

# HTML fragment passed to gr.Interface as `article`: reference links (paper,
# GitHub, weights), the BibTeX entry for the ArTST paper, and acknowledgements.
# This is runtime text shown to users; edit it as content, not as code.
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2310.16621">ArTST paper</a> |
<a href="https://github.com/mbzuai-nlp/ArTST">GitHub</a> |
<a href="https://huggingface.co/MBZUAI/ArTST">Weights and Tokenizer</a></p>
<pre>
@inproceedings{toyin-etal-2023-artst,
    title = "{A}r{TST}: {A}rabic Text and Speech Transformer",
    author = "Toyin, Hawau  and
      Djanibekov, Amirbek  and
      Kulkarni, Ajinkya  and
      Aldarmaki, Hanan",
    editor = "Sawaf, Hassan  and
      El-Beltagy, Samhaa  and
      Zaghouani, Wajdi  and
      Magdy, Walid  and
      Abdelali, Ahmed  and
      Tomeh, Nadi  and
      Abu Farha, Ibrahim  and
      Habash, Nizar  and
      Khalifa, Salam  and
      Keleg, Amr  and
      Haddad, Hatem  and
      Zitouni, Imed  and
      Mrini, Khalil  and
      Almatham, Rawan",
    booktitle = "Proceedings of ArabicNLP 2023",
    month = dec,
    year = "2023",
    address = "Singapore (Hybrid)",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.arabicnlp-1.5",
    pages = "41--51"
}
</pre>
<p>Speaker embeddings were generated from <a href="http://www.festvox.org/cmu_arctic/">CMU ARCTIC</a>.</p>
<p>ArTST is based on <a href="https://arxiv.org/abs/2110.07205">SpeechT5 architecture</a>.</p>
</div>
"""

# Wire the synthesis function and the UI pieces into a single Gradio app.
demo = gr.Interface(
    fn=inference,
    inputs=text_box,
    outputs=out,
    title=title,
    description=description,
    examples=examples,
    article=article,
)

# Start the web server (with a public share link) only when run as a script.
if __name__ == "__main__":
    demo.launch(share=True)