Create app.py
app.py
ADDED
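# Gradio demo app for a Japanese VITS text-to-speech model.
# Note: several of the imports below (matplotlib, json, math, nn, F, DataLoader,
# the data_utils loaders, numpy, torchtext) are unused in this script and appear
# to be carried over from the upstream VITS inference notebook.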
import matplotlib.pyplot as plt

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate

import sys
from subprocess import call

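# Small helper: echo a shell command, run it, and exit cleanly on Ctrl+C.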
def run_cmd(command):
    try:
        print(command)
        call(command, shell=True)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)

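# Build the monotonic_align Cython extension in place and install espeak
# before the model modules are imported.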
current = os.getcwd()
print(current)
full = current + "/monotonic_align"
print(full)
os.chdir(full)
print(os.getcwd())
run_cmd("python3 setup.py build_ext --inplace")
run_cmd("apt-get install espeak -y")
os.chdir("..")
print(os.getcwd())

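# Repository-local modules; these are imported only after the monotonic_align
# extension has been built above, since the model code depends on it.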
from models import SynthesizerTrn
from text.symbols import symbols
from text.cleaners import japanese_phrase_cleaners
from text import cleaned_text_to_sequence

from scipy.io.wavfile import write
import gradio as gr
import scipy.io.wavfile
import numpy as np
import torchtext

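# Convert already-cleaned (phonemized) text into a LongTensor of symbol IDs,
# optionally interspersing blank tokens when hps.data.add_blank is set.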
def get_text(text, hps):
    text_norm = cleaned_text_to_sequence(text)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

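# Load hyperparameters, build the VITS synthesizer in eval mode, and restore
# the generator weights from the bundled checkpoint.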
hps = utils.get_hparams_from_file("./configs/ATR.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()

_ = utils.load_checkpoint("./logs/ATR/G_74000.pth", net_g, None)

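# Inference entry point for Gradio: clean the Japanese input text, synthesize
# audio with the generator, write it to out.wav, and return that file path.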
def jtts(text):
    stn_tst = get_text(japanese_phrase_cleaners(text), hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.float().numpy()
    scipy.io.wavfile.write("out.wav", hps.data.sampling_rate, audio)
    return "./out.wav"

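# Gradio UI. Note: gr.inputs / gr.outputs is the legacy Gradio component API;
# on newer Gradio releases these would typically be gr.Textbox and gr.Audio
# passed directly to gr.Interface.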
inputs = gr.inputs.Textbox(lines=5, label="Input Text")
outputs = gr.outputs.Audio(label="Output Audio")

title = "VITS"
description = "Demo for VITS: Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. To use it, simply add your text, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2106.06103'>Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech</a> | <a href='https://github.com/jaywalnut310/vits'>GitHub Repo</a></p>"

examples = [
    ["吾輩は猫である。名前はまだない"],
    ["試験勉強頑張ってくださいね"]]

gr.Interface(jtts, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()