# mmstts / app.py — Hugging Face Space entry point
# Based on example code of https://huggingface.co/facebook/m2m100_1.2B
# and https://github.com/wannaphong/ttsmms
# See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md
import gradio as gr
import os
import re
import soundfile as sf
import json
import nltk
from underthesea import sent_tokenize as vie_sent_tokenize # Vietnamese NLP toolkit
from underthesea import text_normalize as vie_text_normalize
from nltk import sent_tokenize as nltk_sent_tokenize
from ttsmms import download
from ttsmms import TTS
from collections import OrderedDict
import uuid
import datetime
import shutil
from num2words import num2words
# Markdown description shown at the top of the Gradio UI.
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
Please note that for some languages, it may not pronounce all words correctly (yet).
"""
# Sentence tokenizer model used by nltk_sent_tokenize in prepare_sentences.
nltk.download("punkt")
# Pre-download a few frequently requested languages so their first TTS
# request does not pay the model-download cost.
tts_models = {}
eng_path = download("eng", "./data")
tts_models["eng"] = eng_path
vie_path = download("vie", "./data")
tts_models["vie"] = vie_path
mya_path = download("mya", "./data")
tts_models["mya"] = mya_path
# Map "Language name (iso)" -> iso code, preserving file order so the
# dropdown lists languages in the same order as lang_code.txt.
lang_codes = OrderedDict()
with open("lang_code.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # Skip separator rows and blank lines (a blank line would
        # otherwise crash the tab split below with ValueError).
        if not line or line.startswith("----"):
            continue
        iso, lang = line.split("\t", 1)
        lang_codes[lang + " (" + iso + ")"] = iso
language_names = list(lang_codes.keys())
# Languages supported by num2words; values are lists whose first entry is
# the num2words language code (see convert_numbers_to_words_num2words).
with open("num2words_lang_map.json", encoding="utf-8") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)
def convert_numbers_to_words_num2words(text, lang):
    """Replace every run of digits in *text* with its spelled-out form.

    Uses a single `re.sub` pass with a replacement callback so each number
    is converted exactly where it occurs.  The previous implementation
    collected numbers with findall and called str.replace per number,
    which could corrupt the text when one number was a substring of
    another occurrence (replace substitutes ALL occurrences, including
    digits inside longer numbers or inside earlier replacements).

    Args:
        text: Input string possibly containing integer digit runs.
        lang: Key into num2words_lang_map; its first entry is the
            num2words language code.

    Returns:
        The text with each digit run replaced by words.
    """
    num2words_lang = num2words_lang_map[lang][0]

    def _spell_out(match):
        return num2words(int(match.group(0)), lang=num2words_lang)

    # Debug: log which numbers were found before conversion.
    print(re.findall(r"\d+", text))
    return re.sub(r"\d+", _spell_out, text)
def convert_mya_numbers_to_words(text):
    """Spell out Burmese numerals in *text* via the mm_num2word helper."""
    # Local import: mm_num2word is a project module only needed for Burmese.
    from mm_num2word import mm_num2word, extract_num

    found = extract_num(text)
    # Longest matches first, so a short number that is a substring of a
    # longer one is not replaced prematurely.
    found = sorted(found, key=len, reverse=True)
    print(found)
    for num in found:
        text = text.replace(num, mm_num2word(num))
    return text
def prepare_sentences(text, lang="mya"):
    """Normalize *text* and split it into a flat list of sentences.

    Burmese input gets numeral-to-word conversion and native punctuation
    mapped to "," / "."; languages supported by num2words get digit runs
    spelled out.  Vietnamese uses underthesea's tokenizer and normalizer,
    every other language falls back to NLTK sentence tokenization.
    """
    lang_lc = lang.lower()

    # Language-specific pre-processing.
    if lang_lc == "mya":
        text = convert_mya_numbers_to_words(text)
        # Map Burmese section/sentence marks to Western punctuation.
        text = text.replace("\u104A", ",").replace("\u104B", ".")
    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
        print("Processed text", text)

    # Not sure why this can fix unclear pronunciation for the first word of vie
    text = text.lower()

    # Split on newlines, dropping empty paragraphs.
    paragraphs = [block for block in text.split("\n") if block.strip()]

    if lang_lc == "vie":
        sentences = []
        for block in paragraphs:
            for raw in vie_sent_tokenize(block):
                if raw.strip():
                    sentences.append(vie_text_normalize(raw))
    else:
        sentences = [
            raw
            for block in paragraphs
            for raw in nltk_sent_tokenize(block)
            if raw.strip()
        ]
    return sentences
def list_dir(lang):
    """Debug helper: log the newest .wav file in the current directory.

    Args:
        lang: Language code, printed alongside the newest file name.

    Returns:
        None.  This function only prints diagnostics.
    """
    current_dir = os.getcwd()
    print(current_dir)
    # Only the generated .wav outputs are of interest.
    wav_files = [file for file in os.listdir(current_dir) if file.endswith(".wav")]
    print("Total wav files:", len(wav_files))
    if not wav_files:
        # The original code crashed with IndexError (sorted(...)[-1]) here.
        print(lang, "no wav files found")
        return
    # File names start with a timestamp, so lexicographic max == newest.
    print(lang, max(wav_files))
def combine_wav(source_dir, stamp, lang):
    """Concatenate every .wav chunk in *source_dir* into one output file.

    The per-sentence chunk directory is removed after merging.

    Args:
        source_dir: Directory containing per-sentence wav chunks.
        stamp: Timestamp string used to build the output file name.
        lang: Language code appended to the output file name.

    Returns:
        Path of the combined wav file.

    Raises:
        ValueError: If source_dir contains no .wav files.  (The original
            code reached sf.write with the sample rate never assigned and
            crashed with a confusing NameError instead.)
    """
    # Sort so chunks s_0000000000.wav, s_0000000001.wav, ... stay in
    # synthesis order.
    wav_files = sorted(
        file for file in os.listdir(source_dir) if file.endswith(".wav")
    )
    if not wav_files:
        shutil.rmtree(source_dir)
        raise ValueError(f"No .wav files found in {source_dir}")

    combined_data = []
    sr = None  # sample rate; assumes all chunks share one rate — TODO confirm
    for file in wav_files:
        data, sr = sf.read(os.path.join(source_dir, file))
        combined_data.extend(data)

    combined_file_path = f"{stamp}_{lang}.wav"
    sf.write(combined_file_path, combined_data, sr)
    # Chunks are merged; drop the working directory.
    shutil.rmtree(source_dir)
    list_dir(lang)
    return combined_file_path
def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize speech for the given text in the selected language.

    Downloads (or reuses) the MMS model for the language, renders each
    sentence to its own wav chunk inside a unique per-request directory,
    then merges the chunks into a single file whose path is returned for
    the Gradio audio output.  Unknown language names fall back to Burmese.
    """
    # Dropdown labels map to ISO codes; default to "mya" on unknown names.
    lang_code = lang_codes.get(lang_name, "mya")

    model_path = download(lang_code, "./data")
    tts = TTS(model_path)
    sentences = prepare_sentences(Input_Text, lang_code)

    # Unique per-request working directory based on a microsecond timestamp.
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        # Extremely unlikely timestamp collision: mix in a random UUID.
        user_dir = f"u_{uuid.uuid4()}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)

    # Zero-padded index keeps chunk files sorted in synthesis order.
    for idx, sentence in enumerate(sentences):
        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{idx:010d}.wav")

    return combine_wav(user_dir, timestamp, lang_code)
# common_languages = ["eng", "mya", "vie"] # List of common language codes
# Gradio UI wiring: one text box plus a language dropdown feeding mms_tts,
# whose returned wav path is rendered as an audio player.
iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter text (unlimited sentences)", label="Input text (unlimited sentences)"),
        gr.Dropdown(
            choices=language_names,
            label="Select language 1,000+",
            # Default choice; mms_tts also falls back to "mya" for
            # unrecognized names.
            value="Burmese (mya)",
        ),
    ],
    outputs="audio",
)
# outputs=[
#     "audio",
#     gr.File(label="Download", type="file", download_to="done.wav")
# ])
iface.launch()