Spaces:
Daextream
/
Runtime error

File size: 6,627 Bytes
567073a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# Based on example code of https://huggingface.co/facebook/m2m100_1.2B
# and https://github.com/wannaphong/ttsmms
# See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md

import gradio as gr
import os
import re
import soundfile as sf

import json
import nltk
from underthesea import sent_tokenize as vie_sent_tokenize  # Vietnamese NLP toolkit
from underthesea import text_normalize as vie_text_normalize
from nltk import sent_tokenize as nltk_sent_tokenize
from ttsmms import download
from ttsmms import TTS

from collections import OrderedDict
import uuid
import datetime
import shutil
from num2words import num2words


# Markdown description rendered above the Gradio interface.
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
Please note that for some languages, it may not pronounce all words correctly (yet).
"""

# Punkt backs nltk_sent_tokenize used in prepare_sentences below.
nltk.download("punkt")

# Warm the cache for the most frequently requested languages so their first
# request does not pay the model-download cost.
tts_models = {}
for _warm_code in ("eng", "vie", "mya"):
    tts_models[_warm_code] = download(_warm_code, "./data")

# Kept as individual names for backward compatibility with the original script.
eng_path = tts_models["eng"]
vie_path = tts_models["vie"]
mya_path = tts_models["mya"]

# Map of dropdown label "Language name (iso)" -> MMS iso code, in file order.
lang_codes = OrderedDict()

with open("lang_code.txt", "r") as file:
    for line in file:
        line = line.strip()
        # Skip separator rules and blank lines: both would make the
        # tab-split below raise ValueError.
        if not line or line.startswith("----"):
            continue
        iso, lang = line.split("\t", 1)
        lang_codes[lang + " (" + iso + ")"] = iso

# Display names for the language dropdown (built once the map is populated;
# the original also built this list *before* loading, which was dead code).
language_names = list(lang_codes.keys())

# iso code -> list of num2words locale codes, used to spell digits out as
# words before synthesis.  OrderedDict preserves the JSON's priority order.
with open("num2words_lang_map.json") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)


def convert_numbers_to_words_num2words(text, lang):
    """Spell out every run of digits in *text* as words via num2words.

    *lang* is an MMS iso code; ``num2words_lang_map[lang][0]`` is the
    matching num2words locale.

    A single-pass ``re.sub`` with a callback replaces each match exactly
    once, in place.  The previous replace-longest-first loop used
    ``str.replace``, which rewrites *every* occurrence of the digit string
    and can re-match digits inside text produced by an earlier
    replacement, corrupting the output.

    Raises KeyError if *lang* has no num2words mapping (callers gate on
    ``lang in num2words_lang_map``).
    """
    locale = num2words_lang_map[lang][0]  # hoisted: loop-invariant lookup

    def _spell(match):
        return num2words(int(match.group()), lang=locale)

    return re.sub(r"\d+", _spell, text)


def convert_mya_numbers_to_words(text):
    """Replace Myanmar digit runs in *text* with their spelled-out words."""
    # Imported lazily: mm_num2word is only needed for Burmese input.
    from mm_num2word import mm_num2word, extract_num

    # Longest first, so a shorter number is never substituted inside a
    # longer one that contains it.
    ordered = sorted(extract_num(text), key=len, reverse=True)
    print(ordered)

    for num in ordered:
        text = text.replace(num, mm_num2word(num))
    return text


def prepare_sentences(text, lang="mya"):
    """Normalize *text* and split it into a flat list of sentences.

    Burmese ("mya"): digits are spelled out and the native section marks
    U+104A / U+104B are mapped to "," / ".".  Any language present in
    num2words_lang_map gets its digits spelled out.  Vietnamese uses
    underthesea for sentence splitting and normalization; every other
    language goes through NLTK's sentence tokenizer.
    """
    if lang.lower() == "mya":
        text = convert_mya_numbers_to_words(text)
        text = text.replace("\u104A", ",").replace("\u104B", ".")

    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
    print("Processed text", text)

    # Lower-casing empirically fixes unclear pronunciation of the first
    # Vietnamese word; applied uniformly to all languages.
    text = text.lower()

    paragraphs = [part for part in text.split("\n") if part.strip()]

    if lang.lower() != "vie":
        return [
            candidate
            for part in paragraphs
            for candidate in nltk_sent_tokenize(part)
            if candidate.strip()
        ]

    sentences = []
    for part in paragraphs:
        for candidate in vie_sent_tokenize(part):
            if candidate.strip():
                sentences.append(vie_text_normalize(candidate))
    return sentences


def list_dir(lang):
    """Debug helper: log the WAV-file count and the last one (sorted) in CWD.

    Purely informational — prints to stdout and returns None.  Guarded
    against a directory with no WAV files: the original indexed
    ``sorted_list[-1]`` and raised IndexError in that case.
    """
    current_dir = os.getcwd()
    print(current_dir)

    # Only the generated .wav outputs are of interest here.
    wav_files = [file for file in os.listdir(current_dir) if file.endswith(".wav")]
    print("Total wav files:", len(wav_files))

    if wav_files:
        # max() of strings == last element of the sorted list, without sorting.
        print(lang, max(wav_files))
    else:
        print(lang, "no wav files found")


def combine_wav(source_dir, stamp, lang):
    """Concatenate every WAV in *source_dir* into ``"<stamp>_<lang>.wav"``.

    Files are joined in alphabetical order, which restores sentence order
    because synthesis names them with a zero-padded index.  The scratch
    directory is removed afterwards.

    Returns the path of the combined file.
    Raises ValueError when *source_dir* contains no WAV files — the
    original fell through to ``sf.write`` with ``sr`` unbound and died
    with an opaque UnboundLocalError.
    """
    wav_files = sorted(
        file for file in os.listdir(source_dir) if file.endswith(".wav")
    )
    if not wav_files:
        shutil.rmtree(source_dir)
        raise ValueError(f"no .wav files to combine in {source_dir}")

    combined_data = []
    sample_rate = None
    for file in wav_files:
        data, sample_rate = sf.read(os.path.join(source_dir, file))
        combined_data.extend(data)
    # NOTE(review): assumes all chunks share one sample rate — true for a
    # single TTS model; as before, the last file's rate is used.

    combined_file_path = f"{stamp}_{lang}.wav"
    sf.write(combined_file_path, combined_data, sample_rate)

    shutil.rmtree(source_dir)
    list_dir(lang)

    # The returned path is what the Gradio audio output component plays.
    return combined_file_path


def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize *Input_Text* and return the path of one combined WAV file.

    *lang_name* is a dropdown label like "Burmese (mya)"; an unknown
    label silently falls back to Burmese.  Each sentence is rendered to
    its own file inside a per-request scratch directory, then stitched
    together by combine_wav.
    """
    # Unknown / malformed label -> default to Burmese.
    lang_code = lang_codes.get(lang_name, "mya")

    model_path = download(lang_code, "./data")
    synthesizer = TTS(model_path)

    sentences = prepare_sentences(Input_Text, lang_code)

    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")

    # Per-request scratch directory; mix in a UUID on the (unlikely)
    # chance two requests land on the same microsecond timestamp.
    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        user_dir = f"u_{uuid.uuid4()}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)

    # Zero-padded index keeps alphabetical order == sentence order.
    for index, sentence in enumerate(sentences):
        synthesizer.synthesis(sentence, wav_path=f"{user_dir}/s_{index:010d}.wav")

    return combine_wav(user_dir, timestamp, lang_code)


# Build and launch the Gradio UI.
text_input = gr.Textbox(
    lines=5,
    placeholder="Enter text (unlimited sentences)",
    label="Input text (unlimited sentences)",
)
language_dropdown = gr.Dropdown(
    choices=language_names,
    label="Select language 1,000+",
    value="Burmese (mya)",
)

iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[text_input, language_dropdown],
    outputs="audio",
)

iface.launch()