"""Tacotron2 + HiFi-GAN text-to-speech Gradio demo.

On first run this script clones the model repos, downloads pretrained
checkpoints, loads both models onto CPU, and serves a simple
text-box -> audio Gradio interface.
"""

import json
import os
import sys
import urllib.request

import gdown
import gradio as gr
import numpy as np
import torch
from scipy.io.wavfile import write as write_wav

# Clone the model repos (Tacotron2 fork + HiFi-GAN) and put them on the
# import path; the imports below come from these checkouts, so they must
# happen before the `from model import ...` lines.
if not os.path.exists("TTS-TT2"):
    os.system("git clone --recursive https://github.com/justinjohn0306/TTS-TT2.git")
if not os.path.exists("hifi-gan"):
    os.system("git clone --recursive https://github.com/justinjohn0306/hifi-gan.git")
sys.path.append("TTS-TT2")
sys.path.append("hifi-gan")

# Repo-local imports; must come after the sys.path setup above.
from model import Tacotron2                             # noqa: E402  (TTS-TT2)
from hparams import create_hparams                      # noqa: E402  (TTS-TT2)
from text import text_to_sequence                       # noqa: E402  (TTS-TT2)
from env import AttrDict                                # noqa: E402  (hifi-gan)
from meldataset import mel_spectrogram, MAX_WAV_VALUE   # noqa: E402  (hifi-gan)
from models import Generator                            # noqa: E402  (hifi-gan)
from denoiser import Denoiser                           # noqa: E402  (hifi-gan)

# Checkpoint locations: Google Drive id for Tacotron2, GitHub release URL
# for the HiFi-GAN generator, and the local paths they are saved to.
TACOTRON2_ID = "1--eW5nk5ijbpgBqEt1TdBPr9nopcjuHE"
TACOTRON2_PATH = "tacotron2_statedict.pth"
HIFIGAN_CONFIG = "hifi-gan/config_v1.json"
HIFIGAN_MODEL_PATH = "hifigan_generator.pth"
HIFIGAN_URL = "https://github.com/justinjohn0306/tacotron2/releases/download/assets/g_02500000"


def download_models():
    """Download the pretrained Tacotron2 and HiFi-GAN checkpoints if missing."""
    if not os.path.exists(TACOTRON2_PATH):
        print("Downloading Tacotron2 model...")
        gdown.download(id=TACOTRON2_ID, output=TACOTRON2_PATH, quiet=False)
    if not os.path.exists(HIFIGAN_MODEL_PATH):
        print("Downloading HiFi-GAN model...")
        # Stdlib download instead of shelling out to wget: wget is not
        # available on all platforms, and os.system ignores failures.
        urllib.request.urlretrieve(HIFIGAN_URL, HIFIGAN_MODEL_PATH)


def load_tacotron2():
    """Build a Tacotron2 model, load the checkpoint, and return it in eval mode (CPU)."""
    hparams = create_hparams()
    hparams.sampling_rate = 22050  # must match the rate returned by synthesize()
    model = Tacotron2(hparams)
    checkpoint = torch.load(TACOTRON2_PATH, map_location='cpu')
    # Some checkpoints wrap the weights under a 'state_dict' key.
    if 'state_dict' in checkpoint:
        checkpoint = checkpoint['state_dict']
    # strict=False tolerates minor key mismatches between repo versions.
    model.load_state_dict(checkpoint, strict=False)
    model.eval()
    return model


def load_hifigan():
    """Build the HiFi-GAN generator from its JSON config and checkpoint (CPU, eval)."""
    with open(HIFIGAN_CONFIG) as f:
        config = json.load(f)
    h = AttrDict(config)
    torch.manual_seed(h.seed)
    model = Generator(h).to('cpu')
    checkpoint = torch.load(HIFIGAN_MODEL_PATH, map_location='cpu')
    # Release checkpoints store weights under 'generator'; raw state dicts don't.
    if 'generator' in checkpoint:
        model.load_state_dict(checkpoint['generator'])
    else:
        model.load_state_dict(checkpoint)
    model.eval()
    # Weight norm is only needed for training; removing it speeds up inference.
    model.remove_weight_norm()
    return model


def synthesize(text):
    """Convert *text* to speech.

    Args:
        text: input string to synthesize.

    Returns:
        A ``(sample_rate, audio)`` tuple — 22050 Hz and an int16 numpy
        array — the format expected by ``gr.Audio``.
    """
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).long()
    with torch.no_grad():
        # Text -> mel spectrogram -> waveform; no autograd needed at inference.
        mel_outputs, _, _, _ = tacotron2.inference(sequence)
        audio = hifigan(mel_outputs)
    audio = audio.squeeze().cpu().numpy()
    # Clip before the int16 cast: samples slightly outside [-1, 1] would
    # otherwise wrap around on overflow and produce loud clicks.
    audio = np.clip(audio * MAX_WAV_VALUE, -MAX_WAV_VALUE, MAX_WAV_VALUE - 1)
    return 22050, audio.astype(np.int16)


# One-time setup at import: fetch checkpoints and load both models globally
# so synthesize() can reuse them across requests.
download_models()
tacotron2 = load_tacotron2()
hifigan = load_hifigan()

# Gradio interface
iface = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(label="Enter text"),
    outputs=gr.Audio(label="Generated Speech"),
    title="Tacotron2 Speech Synthesis",
    description="This app converts text to speech using a trained Tacotron2 model and HiFi-GAN vocoder.",
)

iface.launch()