# Based on example code of https://huggingface.co/facebook/m2m100_1.2B
# and https://github.com/wannaphong/ttsmms
# See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md
import gradio as gr
import os
import re
import soundfile as sf
import json
import nltk
from underthesea import sent_tokenize as vie_sent_tokenize  # Vietnamese NLP toolkit
from underthesea import text_normalize as vie_text_normalize
from nltk import sent_tokenize as nltk_sent_tokenize
from ttsmms import download
from ttsmms import TTS
from collections import OrderedDict
import uuid
import datetime
import shutil
from num2words import num2words

this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper. Please note that for some languages, it may not pronounce all words correctly (yet). 
"""

# NOTE(review): newer NLTK releases also need the "punkt_tab" resource for
# sent_tokenize — confirm against the pinned nltk version.
nltk.download("punkt")

# Pre-download some languages so the first request for them is fast.
tts_models = {}
eng_path = download("eng", "./data")
tts_models["eng"] = eng_path
vie_path = download("vie", "./data")
tts_models["vie"] = vie_path
mya_path = download("mya", "./data")
tts_models["mya"] = mya_path

# Map "Language name (iso)" dropdown labels -> ISO codes, preserving file order.
# Lines starting with "----" are section separators in lang_code.txt.
lang_codes = OrderedDict()
with open("lang_code.txt", "r") as file:
    for line in file:
        line = line.strip()
        if line.startswith("----"):
            continue
        iso, lang = line.split("\t", 1)
        lang_codes[lang + " (" + iso + ")"] = iso
language_names = list(lang_codes.keys())

# Load num2words_lang_map: ISO code -> list of num2words locale identifiers.
with open("num2words_lang_map.json") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)


def convert_numbers_to_words_num2words(text, lang):
    """Spell out every run of ASCII digits in *text* using num2words.

    Uses the first num2words locale listed for *lang* in num2words_lang_map.
    A single-pass ``re.sub`` with a callback replaces each digit run exactly
    once, which avoids the corruption possible with repeated global
    ``str.replace`` calls (e.g. when a spelled-out form itself contains digits).

    :param text: input text possibly containing digit runs
    :param lang: ISO language code (must be a key of num2words_lang_map)
    :return: text with each digit run replaced by its spelled-out form
    """
    locale = num2words_lang_map[lang][0]
    return re.sub(r"\d+", lambda m: num2words(int(m.group()), lang=locale), text)


def convert_mya_numbers_to_words(text):
    """Spell out Burmese digit runs using the project-local mm_num2word helper."""
    from mm_num2word import mm_num2word, extract_num

    numbers = extract_num(text)
    # Longest first so multi-digit runs are replaced before their substrings.
    sorted_numbers = sorted(numbers, key=len, reverse=True)
    print(sorted_numbers)
    for n in sorted_numbers:
        text = text.replace(n, mm_num2word(n))
    return text


def prepare_sentences(text, lang="mya"):
    """Split *text* into a flat list of normalized sentences for synthesis.

    Language-specific pre-processing:
    - mya: digits spelled out, Burmese punctuation (U+104A/U+104B) mapped to
      ","/"." so the sentence tokenizer can split on them.
    - languages in num2words_lang_map: digits spelled out via num2words.
    - vie: underthesea sentence tokenizer + text normalization.
    - everything else: NLTK sentence tokenizer, paragraph by paragraph.

    :param text: raw input text (may contain multiple paragraphs)
    :param lang: ISO language code
    :return: list of non-empty sentence strings
    """
    sentences = []
    # Pre-process the text for some languages.
    if lang.lower() == "mya":
        text = convert_mya_numbers_to_words(text)
        text = text.replace("\u104A", ",").replace("\u104B", ".")
    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
        print("Processed text", text)
    # Not sure why this can fix unclear pronunciation for the first word of vie
    text = text.lower()
    paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]
    if lang.lower() == "vie":
        for paragraph in paragraphs:
            sentences_raw = vie_sent_tokenize(paragraph)
            sentences.extend(
                [
                    vie_text_normalize(sentence)
                    for sentence in sentences_raw
                    if sentence.strip()
                ]
            )
    else:
        sentences = [
            sentence
            for paragraph in paragraphs
            for sentence in nltk_sent_tokenize(paragraph)
            if sentence.strip()
        ]
    return sentences


def list_dir(lang):
    """Debug helper: print the cwd, the WAV-file count, and the newest WAV name."""
    current_dir = os.getcwd()
    print(current_dir)
    files = os.listdir(current_dir)
    wav_files = [file for file in files if file.endswith(".wav")]
    print("Total wav files:", len(wav_files))
    # Guard: indexing [-1] on an empty list would raise IndexError.
    if wav_files:
        sorted_list = sorted(wav_files)
        print(lang, sorted_list[-1])


def combine_wav(source_dir, stamp, lang):
    """Concatenate every WAV file in *source_dir* into one output file.

    Files are combined in alphabetical order (the synthesis step names them
    with zero-padded indices, so this preserves sentence order). The source
    directory is deleted afterwards.

    :param source_dir: directory holding the per-sentence WAV files
    :param stamp: timestamp used in the output filename
    :param lang: ISO language code used in the output filename
    :return: path of the combined WAV file
    :raises ValueError: if *source_dir* contains no WAV files (previously this
        crashed with a NameError on the unbound sample rate)
    """
    wav_files = [file for file in os.listdir(source_dir) if file.endswith(".wav")]
    wav_files.sort()
    if not wav_files:
        shutil.rmtree(source_dir)
        raise ValueError(f"No WAV files found in {source_dir}; nothing to combine.")
    combined_data = []
    sr = None
    for file in wav_files:
        file_path = os.path.join(source_dir, file)
        # NOTE(review): assumes every chunk shares one sample rate (same TTS
        # model produced them); the last file's rate is used for the output.
        data, sr = sf.read(file_path)
        combined_data.extend(data)
    combined_file_path = f"{stamp}_{lang}.wav"
    sf.write(combined_file_path, combined_data, sr)
    shutil.rmtree(source_dir)
    list_dir(lang)
    return combined_file_path


def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize *Input_Text* in the selected language; return the WAV path.

    Downloads (or reuses from ./data) the MMS model for the language, splits
    the text into sentences, synthesizes each sentence into its own WAV file
    inside a per-request directory, then concatenates them into one file.

    :param Input_Text: text to speak (any number of sentences/paragraphs)
    :param lang_name: dropdown label, e.g. "Burmese (mya)"; unknown labels
        fall back to Burmese
    :return: path of the combined WAV file
    """
    try:
        lang_code = lang_codes[lang_name]
    except KeyError:
        lang_code = "mya"
    user_model = download(lang_code, "./data")
    tts = TTS(user_model)
    sentences = prepare_sentences(Input_Text, lang_code)

    # Per-request working directory: a microsecond timestamp, plus a random
    # UUID on the (unlikely) collision, keeps concurrent requests separate.
    current_datetime = datetime.datetime.now()
    timestamp = current_datetime.strftime("%Y%m%d%H%M%S%f")
    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        session_id = str(uuid.uuid4())  # Generate a random session ID
        user_dir = f"u_{session_id}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)

    for i, sentence in enumerate(sentences):
        # Zero-padded index keeps alphabetical order == sentence order.
        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")
    combined_file_path = combine_wav(user_dir, timestamp, lang_code)
    return combined_file_path


iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[
        gr.Textbox(
            lines=5,
            placeholder="Enter text (unlimited sentences)",
            label="Input text (unlimited sentences)",
        ),
        gr.Dropdown(
            choices=language_names,
            label="Select language 1,000+",
            value="Burmese (mya)",
        ),
    ],
    outputs="audio",
)

iface.launch()