hhim8826 committed on
Commit cdee0f1
1 Parent(s): 9d83034

Create app.py

Files changed (1): app.py (+86, -0)
app.py ADDED
@@ -0,0 +1,86 @@
import os
import sys
from subprocess import call

import torch

import commons
import utils


def run_cmd(command):
    # Echo and run a shell command; exit cleanly on Ctrl-C.
    try:
        print(command)
        call(command, shell=True)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)


# Build the monotonic_align Cython extension in place and install espeak
# (needed by the text cleaners) before importing the model code.
current = os.getcwd()
print(current)
full = current + "/monotonic_align"
print(full)
os.chdir(full)
print(os.getcwd())
run_cmd("python3 setup.py build_ext --inplace")
run_cmd("apt-get install espeak -y")
os.chdir("..")
print(os.getcwd())

from models import SynthesizerTrn
from text.symbols import symbols
from text.cleaners import japanese_phrase_cleaners
from text import cleaned_text_to_sequence

import scipy.io.wavfile
import gradio as gr


def get_text(text, hps):
    # Turn cleaned (phonemized) text into an integer ID sequence,
    # interspersing blank tokens if the training config used them.
    text_norm = cleaned_text_to_sequence(text)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm


# Load hyperparameters, build the synthesizer in eval mode, and restore
# the generator checkpoint.
hps = utils.get_hparams_from_file("./configs/ATR.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()

_ = utils.load_checkpoint("./logs/ATR/G_74000.pth", net_g, None)


def jtts(text):
    # Japanese text -> phoneme sequence -> waveform; writes out.wav and
    # returns its path for the Gradio Audio output.
    stn_tst = get_text(japanese_phrase_cleaners(text), hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=0.667,
                            noise_scale_w=0.8, length_scale=1)[0][0, 0].data.float().numpy()
    scipy.io.wavfile.write("out.wav", hps.data.sampling_rate, audio)
    return "./out.wav"


# Gradio 2.x component API (gr.inputs/gr.outputs were removed in Gradio 3,
# where gr.Textbox and gr.Audio are used directly).
inputs = gr.inputs.Textbox(lines=5, label="Input Text")
outputs = gr.outputs.Audio(label="Output Audio")

title = "VITS"
description = "Demo for VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. To use it, type your text or click one of the examples to load it. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.06103'>Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech</a> | <a href='https://github.com/jaywalnut310/vits'>Github Repo</a></p>"

examples = [
    ["吾輩は猫である。名前はまだない"],
    ["試験勉強頑張ってくださいね"],
]

gr.Interface(jtts, inputs, outputs, title=title, description=description,
             article=article, examples=examples).launch()
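
One possible refinement, sketched below and not part of this commit: Gradio's Audio output also accepts a (sample_rate, numpy_array) tuple, which avoids concurrent requests racing on the single shared out.wav file. The helper name jtts_in_memory is illustrative, not from the repo.

def jtts_in_memory(text):
    # Same inference path as jtts() above, but returns the audio in memory
    # so concurrent requests don't overwrite each other's out.wav.
    # Hypothetical variant; assumes the same hps/net_g globals.
    stn_tst = get_text(japanese_phrase_cleaners(text), hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=0.667,
                            noise_scale_w=0.8, length_scale=1)[0][0, 0].data.float().numpy()
    return (hps.data.sampling_rate, audio)

With this variant the interface wiring is unchanged apart from the function name: gr.Interface(jtts_in_memory, inputs, outputs, ...).launch().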