Spaces:

Harshitsoni294
/

Text-to-speech

Sleeping

App Files Files

xet

Community

Harshitsoni294 committed on Apr 19

Commit

b5684d8

1 Parent(s): 0f72062

Initial

Browse files

Files changed (3) hide show

.gitmodules +3 -0
app.py +96 -0
requirements.txt +20 -0

.gitmodules ADDED Viewed

	@@ -0,0 +1,3 @@

+[submodule "TTS-TT2"]
+	path = TTS-TT2
+	url = https://github.com/justinjohn0306/TTS-TT2.git

app.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import os
+import sys
+import torch
+import numpy as np
+import gradio as gr
+import json
+import gdown
+from scipy.io.wavfile import write as write_wav
+# Add model repos to path
+if not os.path.exists("TTS-TT2"):
+    os.system("git clone --recursive https://github.com/justinjohn0306/TTS-TT2.git")
+if not os.path.exists("hifi-gan"):
+    os.system("git clone --recursive https://github.com/justinjohn0306/hifi-gan.git")
+sys.path.append("TTS-TT2")
+sys.path.append("hifi-gan")
+from model import Tacotron2
+from hparams import create_hparams
+from text import text_to_sequence
+from env import AttrDict
+from meldataset import mel_spectrogram, MAX_WAV_VALUE
+from models import Generator
+from denoiser import Denoiser
+# Model paths
+# File ID handed to gdown.download() to fetch the Tacotron2 checkpoint.
+TACOTRON2_ID = "1--eW5nk5ijbpgBqEt1TdBPr9nopcjuHE"
+# Local filename the Tacotron2 checkpoint is saved to and later loaded from.
+TACOTRON2_PATH = "tacotron2_statedict.pth"
+# HiFi-GAN JSON config, read from inside the cloned hifi-gan checkout.
+HIFIGAN_CONFIG = "hifi-gan/config_v1.json"
+# Local filename the HiFi-GAN generator weights are saved to and loaded from.
+HIFIGAN_MODEL_PATH = "hifigan_generator.pth"
+# URL the HiFi-GAN generator weights are downloaded from.
+HIFIGAN_URL = "https://github.com/justinjohn0306/tacotron2/releases/download/assets/g_02500000"
+def download_models():
+    if not os.path.exists(TACOTRON2_PATH):
+        print("Downloading Tacotron2 model...")
+        gdown.download(id=TACOTRON2_ID, output=TACOTRON2_PATH, quiet=False)
+    if not os.path.exists(HIFIGAN_MODEL_PATH):
+        print("Downloading HiFi-GAN model...")
+        os.system(f"wget -O {HIFIGAN_MODEL_PATH} {HIFIGAN_URL}")
+def load_tacotron2():
+    hparams = create_hparams()
+    hparams.sampling_rate = 22050
+    model = Tacotron2(hparams)
+    checkpoint = torch.load(TACOTRON2_PATH, map_location='cpu')
+    if 'state_dict' in checkpoint:
+        checkpoint = checkpoint['state_dict']
+    model.load_state_dict(checkpoint, strict=False)
+    model.eval()
+    return model
+def load_hifigan():
+    with open(HIFIGAN_CONFIG) as f:
+        config = json.load(f)
+    h = AttrDict(config)
+    torch.manual_seed(h.seed)
+    model = Generator(h).to('cpu')
+    checkpoint = torch.load(HIFIGAN_MODEL_PATH, map_location='cpu')
+    if 'generator' in checkpoint:
+        model.load_state_dict(checkpoint['generator'])
+    else:
+        model.load_state_dict(checkpoint)
+    model.eval()
+    model.remove_weight_norm()
+    return model
+def synthesize(text):
+    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
+    sequence = torch.from_numpy(sequence).long()
+    with torch.no_grad():
+        mel_outputs, _, _, _ = tacotron2.inference(sequence)
+        audio = hifigan(mel_outputs)
+        audio = audio.squeeze().cpu().numpy()
+        audio = audio * MAX_WAV_VALUE
+        audio = audio.astype(np.int16)
+    return 22050, audio
+# Run setup
+# Order matters: the weights must be on disk before either loader runs, and
+# both models must exist as module globals before synthesize() is called.
+download_models()
+tacotron2 = load_tacotron2()
+hifigan = load_hifigan()
+# Gradio Interface
+# One text box in, one audio player out; synthesize() returns the
+# (sample_rate, samples) tuple that is handed to the gr.Audio output.
+iface = gr.Interface(
+    fn=synthesize,
+    inputs=gr.Textbox(label="Enter text"),
+    outputs=gr.Audio(label="Generated Speech"),
+    title="Tacotron2 Speech Synthesis",
+    description="This app converts text to speech using a trained Tacotron2 model and HiFi-GAN vocoder."
+)
+# Starts the Gradio app at import time (there is no __main__ guard).
+iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+torch
+torchaudio
+tensorflow
+numpy
+scipy
+gdown
+tqdm
+unidecode
+resampy
+librosa
+matplotlib
+inflect
+numba
+gradio
+Jinja2
+phonemizer
+pyyaml
+webrtcvad
+requests
+soundfile