Harshitsoni294 committed on
Commit
b5684d8
·
1 Parent(s): 0f72062
Files changed (3) hide show
  1. .gitmodules +3 -0
  2. app.py +96 -0
  3. requirements.txt +20 -0
.gitmodules ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [submodule "TTS-TT2"]
2
+ path = TTS-TT2
3
+ url = https://github.com/justinjohn0306/TTS-TT2.git
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ import numpy as np
5
+ import gradio as gr
6
+ import json
7
+ import gdown
8
+ from scipy.io.wavfile import write as write_wav
9
+
10
# Add model repos to path
import subprocess

# (checkout directory, git URL) for each upstream repo whose modules are
# imported further down. Order matters: it is the order the directories are
# appended to sys.path.
_MODEL_REPOS = (
    ("TTS-TT2", "https://github.com/justinjohn0306/TTS-TT2.git"),
    ("hifi-gan", "https://github.com/justinjohn0306/hifi-gan.git"),
)

for _repo_dir, _repo_url in _MODEL_REPOS:
    # Treat an empty directory as "not cloned": a submodule that was declared
    # but never initialised leaves exactly that behind, and the imports below
    # would then fail even though the path exists.
    if not os.path.isdir(_repo_dir) or not os.listdir(_repo_dir):
        # Argument list (no shell) and check=True, so a failed clone stops
        # here with a clear error instead of surfacing later as ImportError.
        subprocess.run(
            ["git", "clone", "--recursive", _repo_url, _repo_dir],
            check=True,
        )
    sys.path.append(_repo_dir)
18
+
19
+ from model import Tacotron2
20
+ from hparams import create_hparams
21
+ from text import text_to_sequence
22
+ from env import AttrDict
23
+ from meldataset import mel_spectrogram, MAX_WAV_VALUE
24
+ from models import Generator
25
+ from denoiser import Denoiser
26
+
27
# Model paths
# File id passed to gdown.download(id=...) to fetch the Tacotron2 checkpoint.
TACOTRON2_ID = "1--eW5nk5ijbpgBqEt1TdBPr9nopcjuHE"
# Local file the Tacotron2 checkpoint is downloaded to and later loaded from.
TACOTRON2_PATH = "tacotron2_statedict.pth"
# JSON config read by load_hifigan(); located inside the cloned hifi-gan repo.
HIFIGAN_CONFIG = "hifi-gan/config_v1.json"
# Local file the HiFi-GAN generator checkpoint is downloaded to and loaded from.
HIFIGAN_MODEL_PATH = "hifigan_generator.pth"
# Download URL for the HiFi-GAN generator checkpoint.
HIFIGAN_URL = "https://github.com/justinjohn0306/tacotron2/releases/download/assets/g_02500000"
33
+
34
def download_models():
    """Download the Tacotron2 and HiFi-GAN checkpoints that are not on disk yet.

    Each checkpoint is skipped when its target file already exists.

    Raises:
        RuntimeError: if gdown reports that the Tacotron2 download failed.
        OSError: if the HiFi-GAN download fails (urllib.error.URLError is a
            subclass of OSError).
    """
    # Local import so this function does not depend on the module header.
    import urllib.request

    if not os.path.exists(TACOTRON2_PATH):
        print("Downloading Tacotron2 model...")
        fetched = gdown.download(id=TACOTRON2_ID, output=TACOTRON2_PATH, quiet=False)
        # Some gdown releases return None on failure instead of raising.
        if fetched is None:
            raise RuntimeError(
                f"Could not download the Tacotron2 checkpoint (id={TACOTRON2_ID})"
            )

    if not os.path.exists(HIFIGAN_MODEL_PATH):
        print("Downloading HiFi-GAN model...")
        # Download to a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that the
        # exists-check above would accept on the next start.
        partial_path = HIFIGAN_MODEL_PATH + ".part"
        try:
            urllib.request.urlretrieve(HIFIGAN_URL, partial_path)
            os.replace(partial_path, HIFIGAN_MODEL_PATH)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
42
+
43
def load_tacotron2():
    """Build a Tacotron2 model on CPU and load weights from TACOTRON2_PATH.

    The checkpoint may be either a bare state dict or a dict holding the
    state dict under the 'state_dict' key. Weights are loaded non-strictly;
    any keys that do not line up are printed so a mismatched checkpoint does
    not go unnoticed.

    Returns:
        The Tacotron2 model, switched to eval mode.
    """
    hparams = create_hparams()
    hparams.sampling_rate = 22050
    model = Tacotron2(hparams)

    # NOTE(review): torch.load unpickles the file, and this checkpoint is
    # downloaded at runtime. Consider weights_only=True once the checkpoint
    # format is confirmed to be compatible with it.
    checkpoint = torch.load(TACOTRON2_PATH, map_location='cpu')
    if 'state_dict' in checkpoint:
        checkpoint = checkpoint['state_dict']

    # strict=False tolerates mismatched keys; report them instead of
    # dropping them silently.
    incompatible = model.load_state_dict(checkpoint, strict=False)
    if incompatible.missing_keys:
        print(f"Tacotron2 checkpoint is missing keys: {incompatible.missing_keys}")
    if incompatible.unexpected_keys:
        print(f"Tacotron2 checkpoint has unexpected keys: {incompatible.unexpected_keys}")

    model.eval()
    return model
53
+
54
def load_hifigan():
    """Build the HiFi-GAN generator on CPU and load weights from HIFIGAN_MODEL_PATH.

    The generator is configured from the JSON file at HIFIGAN_CONFIG and the
    torch RNG is seeded with the config's `seed` before construction. The
    checkpoint may be a bare state dict or a dict holding it under the
    'generator' key.

    Returns:
        The generator in eval mode with weight normalisation removed.
    """
    with open(HIFIGAN_CONFIG) as config_file:
        raw_settings = json.load(config_file)
    settings = AttrDict(raw_settings)

    torch.manual_seed(settings.seed)
    vocoder = Generator(settings).to('cpu')

    loaded = torch.load(HIFIGAN_MODEL_PATH, map_location='cpu')
    # Unwrap the generator weights when the checkpoint nests them.
    weights = loaded['generator'] if 'generator' in loaded else loaded
    vocoder.load_state_dict(weights)

    vocoder.eval()
    vocoder.remove_weight_norm()
    return vocoder
68
+
69
def synthesize(text):
    """Convert `text` to speech with the module-level Tacotron2 and HiFi-GAN models.

    Args:
        text: The text to speak.

    Returns:
        A `(sample_rate, samples)` tuple: 22050 and a NumPy int16 array.

    Raises:
        gr.Error: if `text` is empty or only whitespace.
    """
    # Reject empty input up front; otherwise a zero-length sequence would be
    # handed to the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).long()

    with torch.no_grad():
        # NOTE(review): only the first value returned by inference() is used.
        # If the second value is the post-net mel (as in common Tacotron2
        # implementations), that one may be the better vocoder input; confirm.
        mel_outputs, _, _, _ = tacotron2.inference(sequence)
        audio = hifigan(mel_outputs)

    audio = audio.squeeze().cpu().numpy() * MAX_WAV_VALUE
    # Clip before the cast: values outside the int16 range would otherwise
    # wrap around and produce loud clicks.
    int16_range = np.iinfo(np.int16)
    audio = np.clip(audio, int16_range.min, int16_range.max).astype(np.int16)

    return 22050, audio
81
+
82
# Run setup: fetch the checkpoints, then load both models into the
# module-level names that synthesize() reads.
download_models()
tacotron2 = load_tacotron2()
hifigan = load_hifigan()

# Gradio Interface: one text box in, one audio player out.
_interface_options = dict(
    fn=synthesize,
    inputs=gr.Textbox(label="Enter text"),
    outputs=gr.Audio(label="Generated Speech"),
    title="Tacotron2 Speech Synthesis",
    description="This app converts text to speech using a trained Tacotron2 model and HiFi-GAN vocoder.",
)
iface = gr.Interface(**_interface_options)

iface.launch()
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ tensorflow
4
+ numpy
5
+ scipy
6
+ gdown
7
+ tqdm
8
+ unidecode
9
+ resampy
10
+ librosa
11
+ matplotlib
12
+ inflect
13
+ numba
14
+ gradio
15
+ Jinja2
16
+ phonemizer
17
+ pyyaml
18
+ webrtcvad
19
+ requests
20
+ soundfile