Create app.py
app.py
ADDED
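# Gradio demo app for a Japanese VITS text-to-speech model.
# Note: several of the imports below (matplotlib, json, math, nn, F, DataLoader,
# the data_utils loaders, numpy, torchtext) are unused in this script and appear
# to be carried over from the upstream VITS inference notebook.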
import matplotlib.pyplot as plt

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate

import sys
from subprocess import call

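# Small helper: echo a shell command, run it, and exit cleanly on Ctrl+C.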
def run_cmd(command):
    try:
        print(command)
        call(command, shell=True)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)

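# Build the monotonic_align Cython extension in place and install espeak
# before the model modules are imported.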
current = os.getcwd()
print(current)
full = current + "/monotonic_align"
print(full)
os.chdir(full)
print(os.getcwd())
run_cmd("python3 setup.py build_ext --inplace")
run_cmd("apt-get install espeak -y")
os.chdir("..")
print(os.getcwd())

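# Repository-local modules; these are imported only after the monotonic_align
# extension has been built above, since the model code depends on it.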
from models import SynthesizerTrn
from text.symbols import symbols
from text.cleaners import japanese_phrase_cleaners
from text import cleaned_text_to_sequence

from scipy.io.wavfile import write
import gradio as gr
import scipy.io.wavfile
import numpy as np
import torchtext

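# Convert already-cleaned (phonemized) text into a LongTensor of symbol IDs,
# optionally interspersing blank tokens when hps.data.add_blank is set.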
def get_text(text, hps):
    text_norm = cleaned_text_to_sequence(text)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

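# Load hyperparameters, build the VITS synthesizer in eval mode, and restore
# the generator weights from the bundled checkpoint.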
hps = utils.get_hparams_from_file("./configs/ATR.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()

_ = utils.load_checkpoint("./logs/ATR/G_74000.pth", net_g, None)

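# Inference entry point for Gradio: clean the Japanese input text, synthesize
# audio with the generator, write it to out.wav, and return that file path.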
def jtts(text):
    stn_tst = get_text(japanese_phrase_cleaners(text), hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.float().numpy()
    scipy.io.wavfile.write("out.wav", hps.data.sampling_rate, audio)
    return "./out.wav"

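# Gradio UI. Note: gr.inputs / gr.outputs is the legacy Gradio component API;
# on newer Gradio releases these would typically be gr.Textbox and gr.Audio
# passed directly to gr.Interface.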
inputs = gr.inputs.Textbox(lines=5, label="Input Text")
outputs = gr.outputs.Audio(label="Output Audio")

title = "VITS"
description = "Demo for VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. To use it, simply add your text, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.06103'>Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech</a> | <a href='https://github.com/jaywalnut310/vits'>GitHub Repo</a></p>"

examples = [
    ["吾輩は猫である。名前はまだない"],
    ["試験勉強頑張ってくださいね"]]

gr.Interface(jtts, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()