|
|
|
|
|
|
|
|
|
import gradio as gr |
|
import os |
|
import re |
|
import soundfile as sf |
|
|
|
import json |
|
import nltk |
|
from underthesea import sent_tokenize as vie_sent_tokenize |
|
from underthesea import text_normalize as vie_text_normalize |
|
from nltk import sent_tokenize as nltk_sent_tokenize |
|
from ttsmms import download |
|
from ttsmms import TTS |
|
|
|
from collections import OrderedDict |
|
import uuid |
|
import datetime |
|
import shutil |
|
from num2words import num2words |
|
|
|
|
|
# Markdown blurb rendered at the top of the Gradio interface.
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.

Please note that for some languages, it may not pronounce all words correctly (yet).

"""

# Punkt sentence-tokenizer models are required by nltk_sent_tokenize below.
nltk.download("punkt")
|
|
|
|
|
# Cache of iso code -> downloaded model path. Pre-fetch the most common
# languages so their first request does not pay the download cost.
tts_models = {}
eng_path = download("eng", "./data")
tts_models["eng"] = eng_path
vie_path = download("vie", "./data")
tts_models["vie"] = vie_path
mya_path = download("mya", "./data")
tts_models["mya"] = mya_path

# Dropdown label "Language name (iso)" -> iso code, loaded from the
# tab-separated language list shipped with the app.
lang_codes = OrderedDict()
with open("lang_code.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # Skip separator rules, blank lines, and malformed rows with no tab
        # (these previously raised ValueError in line.split).
        if not line or line.startswith("----") or "\t" not in line:
            continue
        iso, lang = line.split("\t", 1)
        lang_codes[lang + " (" + iso + ")"] = iso

language_names = list(lang_codes.keys())


# Map of MMS iso code -> list of num2words language codes; ordered so the
# preferred num2words code is first.
with open("num2words_lang_map.json", encoding="utf-8") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)
|
|
|
|
|
def convert_numbers_to_words_num2words(text, lang):
    """Spell out every run of digits in *text* using num2words.

    Args:
        text: input string, possibly containing integers.
        lang: MMS iso code; the matching num2words language code is taken
            from ``num2words_lang_map`` (callers must check membership first).

    Returns:
        The text with each digit run replaced by its spelled-out form.
    """
    n2w_lang = num2words_lang_map[lang][0]
    # Single-pass regex substitution: each match is converted independently,
    # which avoids the ordering pitfalls of repeated str.replace calls.
    return re.sub(r"\d+", lambda m: num2words(int(m.group()), lang=n2w_lang), text)
|
|
|
|
|
def convert_mya_numbers_to_words(text):
    """Replace Burmese digit sequences in *text* with spelled-out words."""
    # Imported lazily: only needed for Burmese requests.
    from mm_num2word import mm_num2word, extract_num

    # Longest sequences first so a short number never clobbers part of a
    # longer one that contains it.
    ordered = sorted(extract_num(text), key=len, reverse=True)
    print(ordered)

    for num in ordered:
        text = text.replace(num, mm_num2word(num))
    return text
|
|
|
|
|
def prepare_sentences(text, lang="mya"):
    """Normalize *text* for TTS and split it into a flat list of sentences.

    Numbers are spelled out where a converter exists, the text is
    lowercased, and sentence tokenization is language-dependent
    (underthesea for Vietnamese, NLTK otherwise).
    """
    lang_lc = lang.lower()

    if lang_lc == "mya":
        text = convert_mya_numbers_to_words(text)
        # Map Burmese section marks (၊ ။) onto comma / full stop so the
        # generic tokenizer can find sentence boundaries.
        text = text.replace("\u104A", ",").replace("\u104B", ".")

    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
        print("Processed text", text)

    text = text.lower()

    # Drop empty paragraphs before sentence splitting.
    paragraphs = [p for p in text.split("\n") if p.strip()]

    sentences = []
    if lang_lc == "vie":
        for paragraph in paragraphs:
            for raw in vie_sent_tokenize(paragraph):
                if raw.strip():
                    sentences.append(vie_text_normalize(raw))
    else:
        for paragraph in paragraphs:
            for raw in nltk_sent_tokenize(paragraph):
                if raw.strip():
                    sentences.append(raw)
    return sentences
|
|
|
|
|
def list_dir(lang):
    """Debug helper: log the wav files present in the working directory.

    Prints the cwd, the wav-file count, and the lexicographically last wav
    (the most recent, given the timestamped naming). Returns None.
    """
    current_dir = os.getcwd()
    print(current_dir)

    wav_files = [file for file in os.listdir(current_dir) if file.endswith(".wav")]
    print("Total wav files:", len(wav_files))

    # Guard the empty case: previously sorted_list[-1] raised IndexError
    # when no combined wav had been produced yet.
    if not wav_files:
        return

    print(lang, max(wav_files))
|
|
|
|
|
def combine_wav(source_dir, stamp, lang):
    """Concatenate every wav file in *source_dir* into a single output file.

    Args:
        source_dir: directory of per-sentence wav chunks; deleted afterwards.
        stamp: timestamp string used in the output filename.
        lang: iso code used in the output filename.

    Returns:
        Path of the combined wav file ("{stamp}_{lang}.wav").

    Raises:
        ValueError: if the directory contains no wav files (previously this
            hit an unbound ``sr`` and raised a confusing NameError).
    """
    # Sort so chunks are concatenated in synthesis order (zero-padded names).
    wav_files = sorted(
        file for file in os.listdir(source_dir) if file.endswith(".wav")
    )
    if not wav_files:
        raise ValueError(f"no wav files found in {source_dir}")

    combined_data = []
    sr = None  # sample rate of the last chunk; all chunks share one model/rate
    for file in wav_files:
        data, sr = sf.read(os.path.join(source_dir, file))
        combined_data.extend(data)

    combined_file_path = f"{stamp}_{lang}.wav"
    sf.write(combined_file_path, combined_data, sr)

    # The per-sentence chunks are no longer needed once merged.
    shutil.rmtree(source_dir)
    list_dir(lang)

    return combined_file_path
|
|
|
|
|
def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize speech for *Input_Text* and return the combined wav path.

    Args:
        Input_Text: text to speak (any number of sentences).
        lang_name: dropdown label of the form "Language (iso)"; unknown
            labels fall back to Burmese.

    Returns:
        Path of the combined wav file, suitable for a Gradio audio output.
    """
    try:
        lang_code = lang_codes[lang_name]
    except KeyError:
        lang_code = "mya"

    # Reuse the module-level cache instead of re-downloading the model on
    # every request (the cache was previously populated but never read).
    model_path = tts_models.get(lang_code)
    if model_path is None:
        model_path = download(lang_code, "./data")
        tts_models[lang_code] = model_path
    tts = TTS(model_path)

    sentences = prepare_sentences(Input_Text, lang_code)

    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")

    # Per-request scratch directory; add a uuid on the (unlikely) timestamp
    # collision between concurrent requests.
    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        user_dir = f"u_{uuid.uuid4()}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)

    # Zero-padded names keep chunks in synthesis order for combine_wav's sort.
    for i, sentence in enumerate(sentences):
        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")

    return combine_wav(user_dir, timestamp, lang_code)
|
|
|
|
|
|
|
# Build the web UI: one text box, one language dropdown, audio output.
text_input = gr.Textbox(
    lines=5,
    placeholder="Enter text (unlimited sentences)",
    label="Input text (unlimited sentences)",
)
language_input = gr.Dropdown(
    choices=language_names,
    label="Select language 1,000+",
    value="Burmese (mya)",
)

iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[text_input, language_input],
    outputs="audio",
)

iface.launch()
|
|