Spaces:
Daextream
/
Runtime error

File size: 6,627 Bytes
567073a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# Based on example code of https://huggingface.co/facebook/m2m100_1.2B
# and https://github.com/wannaphong/ttsmms
# See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md

import gradio as gr
import os
import re
import soundfile as sf

import json
import nltk
from underthesea import sent_tokenize as vie_sent_tokenize  # Vietnamese NLP toolkit
from underthesea import text_normalize as vie_text_normalize
from nltk import sent_tokenize as nltk_sent_tokenize
from ttsmms import download
from ttsmms import TTS

from collections import OrderedDict
import uuid
import datetime
import shutil
from num2words import num2words


# Markdown description rendered above the Gradio interface.
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
Please note that for some languages, it may not pronounce all words correctly (yet).
"""

# Punkt backs nltk_sent_tokenize used in prepare_sentences below.
nltk.download("punkt")

# Warm the cache for the most frequently requested languages so their first
# request does not pay the model-download cost.
tts_models = {}
for _warm_code in ("eng", "vie", "mya"):
    tts_models[_warm_code] = download(_warm_code, "./data")

# Kept as individual names for backward compatibility with the original script.
eng_path = tts_models["eng"]
vie_path = tts_models["vie"]
mya_path = tts_models["mya"]

# Map of dropdown label "Language name (iso)" -> MMS iso code, in file order.
lang_codes = OrderedDict()

with open("lang_code.txt", "r") as file:
    for line in file:
        line = line.strip()
        # Skip separator rules and blank lines: both would make the
        # tab-split below raise ValueError.
        if not line or line.startswith("----"):
            continue
        iso, lang = line.split("\t", 1)
        lang_codes[lang + " (" + iso + ")"] = iso

# Display names for the language dropdown (built once the map is populated;
# the original also built this list *before* loading, which was dead code).
language_names = list(lang_codes.keys())

# iso code -> list of num2words locale codes, used to spell digits out as
# words before synthesis.  OrderedDict preserves the JSON's priority order.
with open("num2words_lang_map.json") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)


def convert_numbers_to_words_num2words(text, lang):
    """Spell out every run of digits in *text* as words via num2words.

    *lang* is an MMS iso code; ``num2words_lang_map[lang][0]`` is the
    matching num2words locale.

    A single-pass ``re.sub`` with a callback replaces each match exactly
    once, in place.  The previous replace-longest-first loop used
    ``str.replace``, which rewrites *every* occurrence of the digit string
    and can re-match digits inside text produced by an earlier
    replacement, corrupting the output.

    Raises KeyError if *lang* has no num2words mapping (callers gate on
    ``lang in num2words_lang_map``).
    """
    locale = num2words_lang_map[lang][0]  # hoisted: loop-invariant lookup

    def _spell(match):
        return num2words(int(match.group()), lang=locale)

    return re.sub(r"\d+", _spell, text)


def convert_mya_numbers_to_words(text):
    """Replace Myanmar digit runs in *text* with their spelled-out words."""
    # Imported lazily: mm_num2word is only needed for Burmese input.
    from mm_num2word import mm_num2word, extract_num

    # Longest first, so a shorter number is never substituted inside a
    # longer one that contains it.
    ordered = sorted(extract_num(text), key=len, reverse=True)
    print(ordered)

    for num in ordered:
        text = text.replace(num, mm_num2word(num))
    return text


def prepare_sentences(text, lang="mya"):
    """Normalize *text* and split it into a flat list of sentences.

    Burmese ("mya"): digits are spelled out and the native section marks
    U+104A / U+104B are mapped to "," / ".".  Any language present in
    num2words_lang_map gets its digits spelled out.  Vietnamese uses
    underthesea for sentence splitting and normalization; every other
    language goes through NLTK's sentence tokenizer.
    """
    if lang.lower() == "mya":
        text = convert_mya_numbers_to_words(text)
        text = text.replace("\u104A", ",").replace("\u104B", ".")

    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
    print("Processed text", text)

    # Lower-casing empirically fixes unclear pronunciation of the first
    # Vietnamese word; applied uniformly to all languages.
    text = text.lower()

    paragraphs = [part for part in text.split("\n") if part.strip()]

    if lang.lower() != "vie":
        return [
            candidate
            for part in paragraphs
            for candidate in nltk_sent_tokenize(part)
            if candidate.strip()
        ]

    sentences = []
    for part in paragraphs:
        for candidate in vie_sent_tokenize(part):
            if candidate.strip():
                sentences.append(vie_text_normalize(candidate))
    return sentences


def list_dir(lang):
    """Debug helper: log the WAV-file count and the last one (sorted) in CWD.

    Purely informational — prints to stdout and returns None.  Guarded
    against a directory with no WAV files: the original indexed
    ``sorted_list[-1]`` and raised IndexError in that case.
    """
    current_dir = os.getcwd()
    print(current_dir)

    # Only the generated .wav outputs are of interest here.
    wav_files = [file for file in os.listdir(current_dir) if file.endswith(".wav")]
    print("Total wav files:", len(wav_files))

    if wav_files:
        # max() of strings == last element of the sorted list, without sorting.
        print(lang, max(wav_files))
    else:
        print(lang, "no wav files found")


def combine_wav(source_dir, stamp, lang):
    """Concatenate every WAV in *source_dir* into ``"<stamp>_<lang>.wav"``.

    Files are joined in alphabetical order, which restores sentence order
    because synthesis names them with a zero-padded index.  The scratch
    directory is removed afterwards.

    Returns the path of the combined file.
    Raises ValueError when *source_dir* contains no WAV files — the
    original fell through to ``sf.write`` with ``sr`` unbound and died
    with an opaque UnboundLocalError.
    """
    wav_files = sorted(
        file for file in os.listdir(source_dir) if file.endswith(".wav")
    )
    if not wav_files:
        shutil.rmtree(source_dir)
        raise ValueError(f"no .wav files to combine in {source_dir}")

    combined_data = []
    sample_rate = None
    for file in wav_files:
        data, sample_rate = sf.read(os.path.join(source_dir, file))
        combined_data.extend(data)
    # NOTE(review): assumes all chunks share one sample rate — true for a
    # single TTS model; as before, the last file's rate is used.

    combined_file_path = f"{stamp}_{lang}.wav"
    sf.write(combined_file_path, combined_data, sample_rate)

    shutil.rmtree(source_dir)
    list_dir(lang)

    # The returned path is what the Gradio audio output component plays.
    return combined_file_path


def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize *Input_Text* and return the path of one combined WAV file.

    *lang_name* is a dropdown label like "Burmese (mya)"; an unknown
    label silently falls back to Burmese.  Each sentence is rendered to
    its own file inside a per-request scratch directory, then stitched
    together by combine_wav.
    """
    # Unknown / malformed label -> default to Burmese.
    lang_code = lang_codes.get(lang_name, "mya")

    model_path = download(lang_code, "./data")
    synthesizer = TTS(model_path)

    sentences = prepare_sentences(Input_Text, lang_code)

    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")

    # Per-request scratch directory; mix in a UUID on the (unlikely)
    # chance two requests land on the same microsecond timestamp.
    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        user_dir = f"u_{uuid.uuid4()}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)

    # Zero-padded index keeps alphabetical order == sentence order.
    for index, sentence in enumerate(sentences):
        synthesizer.synthesis(sentence, wav_path=f"{user_dir}/s_{index:010d}.wav")

    return combine_wav(user_dir, timestamp, lang_code)


# Build and launch the Gradio UI.
text_input = gr.Textbox(
    lines=5,
    placeholder="Enter text (unlimited sentences)",
    label="Input text (unlimited sentences)",
)
language_dropdown = gr.Dropdown(
    choices=language_names,
    label="Select language 1,000+",
    value="Burmese (mya)",
)

iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[text_input, language_dropdown],
    outputs="audio",
)

iface.launch()