"""Gradio demo application for vietTTS, a Vietnamese text-to-speech system.

When executed, the script downloads the model checkpoint and resource files
into the current working directory and then launches a Gradio interface that
converts the submitted text to audio.
"""

import logging
import os
import subprocess

import gradio as gr
import numpy as np

from vietTTS import nat_normalize_text
from vietTTS.hifigan.mel2wave import mel2wave
from vietTTS.nat.text2mel import text2mel

logger = logging.getLogger(__name__)

# Local file names shared by the download step and the inference step.
DURATION_CKPT = "duration_latest_ckpt.pickle"
ACOUSTIC_CKPT = "acoustic_latest_ckpt.pickle"
VOCODER_CKPT = "hk_hifi.pickle"
VOCODER_CONFIG = "config.json"
LEXICON = "lexicon.txt"

# Sample rate (Hz) reported to Gradio together with the waveform.
SAMPLE_RATE = 16_000

# (Google Drive file id, local file name) pairs fetched with `gdown`.
_GDRIVE_ASSETS = (
    ("16UhN8QBxG1YYwUh8smdEeVnKo9qZhvZj", DURATION_CKPT),
    ("1-8Ig65S3irNHSzcskT37SLgeyuUhjKdj", ACOUSTIC_CKPT),
    ("19cRNDC6IrHFAAE4U9I7K0mzLMgPsi5zb", VOCODER_CKPT),
)

# (URL, local file name) pairs fetched with `wget`.
_HTTP_ASSETS = (
    (
        "https://raw.githubusercontent.com/NTT123/vietTTS/master/assets/hifigan/config.json",
        VOCODER_CONFIG,
    ),
    (
        "https://raw.githubusercontent.com/NTT123/vietTTS/master/assets/infore/lexicon.txt",
        LEXICON,
    ),
)


def _run_download(command, filename):
    """Run one download command and make sure *filename* exists afterwards.

    Args:
        command: Argument list handed to ``subprocess.run`` (no shell).
        filename: Path the command is expected to produce.

    Raises:
        RuntimeError: If *filename* does not exist once the command returns.
    """
    try:
        returncode = subprocess.run(command, check=False).returncode
    except OSError as err:
        # Typically the downloader executable is not installed.
        logger.warning("could not run %s: %s", command[0], err)
        returncode = -1

    if not os.path.exists(filename):
        raise RuntimeError(
            f"failed to download {filename!r}: "
            f"`{' '.join(command)}` exited with status {returncode}"
        )
    if returncode != 0:
        # Best effort: a copy from an earlier run is still on disk, keep going.
        logger.warning(
            "`%s` exited with status %d; using existing %s",
            " ".join(command),
            returncode,
            filename,
        )


def download_assets():
    """Download the checkpoints, vocoder config and lexicon into the cwd.

    Raises:
        RuntimeError: If any required file is missing after its download
            command has run.
    """
    for file_id, filename in _GDRIVE_ASSETS:
        _run_download(["gdown", "--id", file_id, "-O", filename], filename)
    for url, filename in _HTTP_ASSETS:
        # NOTE: without -O, wget saves under the URL's base name and, when that
        # file already exists, writes a numbered copy instead of replacing it.
        _run_download(["wget", url], filename)


def text_to_speech(text):
    """Synthesize *text* and return the waveform as int16 PCM samples.

    Args:
        text: Raw input text; it is normalized with ``nat_normalize_text``.

    Returns:
        A NumPy ``int16`` array holding the waveform scaled by 2**15.
    """
    text = nat_normalize_text(text)
    mel = text2mel(
        text,
        LEXICON,
        0.2,  # NOTE(review): presumably a silence duration - confirm in vietTTS
        ACOUSTIC_CKPT,
        DURATION_CKPT,
    )
    wave = mel2wave(mel, VOCODER_CONFIG, VOCODER_CKPT)
    # Clip before casting: a sample at or above +1.0 scales to >= 32768, which
    # does not fit in int16 and would otherwise wrap around.
    pcm = np.clip(np.asarray(wave) * (2**15), -(2**15), 2**15 - 1)
    return pcm.astype(np.int16)


def speak(text):
    """Gradio callback: return ``(sample_rate, samples)`` for *text*."""
    y = text_to_speech(text)
    return SAMPLE_RATE, y


download_assets()

title = "vietTTS"
description = "A vietnamese text-to-speech demo."

gr.Interface(
    fn=speak,
    inputs="text",
    outputs="audio",
    title=title,
    description=description,
    theme="default",
    allow_screenshot=False,
    allow_flagging="never",
).launch(debug=False)