|
from flask import Flask, jsonify, request, send_file |
|
import sys |
|
import re |
|
import os |
|
import torch |
|
import utils |
|
import commons |
|
import soundfile as sf |
|
from models import SynthesizerTrn |
|
from text import text_to_sequence, _clean_text |
|
from pyngrok import ngrok |
|
|
|
# --- CLI arguments, ngrok tunnel, and model configuration -------------------
# Usage: python <script> <ngrok_auth_token> <torch_device>
#   e.g. python app.py 2AbC...xyz cpu
# Bug fix vs. original: validate argv up front instead of crashing with an
# IndexError when arguments are missing.
if len(sys.argv) < 3:
    sys.exit("Usage: python <script> <ngrok_auth_token> <device (cpu|cuda)>")

argsAuth = sys.argv[1]    # ngrok auth token
argsDevice = sys.argv[2]  # torch device string consumed by generate()

ngrok.set_auth_token(argsAuth)

# Expose the local Flask server (port 5000) through a public ngrok URL.
public_url = ngrok.connect(5000).public_url
print("Monggo: " + public_url)

# True when running inside a Hugging Face "spaces" environment; enables the
# input-length limitation in generate().
limitation = os.getenv("SYSTEM") == "spaces"

# Model/training hyper-parameters loaded from the repo's config file.
hps_ms = utils.get_hparams_from_file(r'config/config.json')
|
|
|
def get_text(text, hps, is_symbol):
    """Convert *text* into a LongTensor of symbol ids.

    When *is_symbol* is true the text cleaners are skipped and the input is
    treated as an already-cleaned symbol string. If ``hps.data.add_blank`` is
    set, a 0 ("blank") id is interspersed between every symbol.

    Returns a ``(tensor, cleaned_text)`` pair.
    """
    cleaners = [] if is_symbol else hps.data.text_cleaners
    sequence, cleaned = text_to_sequence(text, hps.symbols, cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence), cleaned
|
|
|
def generate(textnya):
    """Synthesize *textnya* with the Ayaka VITS model and write ``speech.wav``.

    Side effects: rebuilds the synthesizer and reloads the checkpoint from
    ``models/ayaka-jp.pth`` on every call (NOTE(review): expensive — consider
    caching the model at module level), then writes a 22050 Hz wav file named
    ``speech.wav`` in the working directory.
    """
    device = argsDevice  # torch device string from the CLI

    # Build the synthesizer and load the pretrained Ayaka (JP) checkpoint.
    net_g_ms = SynthesizerTrn(
        len(hps_ms.symbols),
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        n_speakers=hps_ms.data.n_speakers,
        **hps_ms.model)
    utils.load_checkpoint('models/ayaka-jp.pth', net_g_ms, None)
    net_g_ms.eval().to(device)

    # Fixed inference settings.
    is_symbol = True          # input is treated as an already-cleaned symbol string
    noise_scale = 0.6
    noise_scale_w = 0.668
    length_scale = 1.2
    speaker_id = 303          # Ayaka's id in the multi-speaker model (presumably — confirm against config)

    # Strip newlines and all spaces from the input.
    text = textnya.replace('\n', ' ').replace('\r', '').replace(" ", "")

    if limitation:
        # Length counted without the [ZH]/[JA] language tags. Raw string fixes
        # the invalid-escape-sequence warning of the original pattern.
        text_len = len(re.sub(r"\[([A-Z]{2})\]", "", textnya))
        max_len = 100
        if is_symbol:
            max_len *= 3
        if text_len > max_len:
            # Bug fix: the original only printed this error and synthesized
            # the over-long input anyway; abort instead.
            print("Error: Text is too long")
            return

    # NOTE: the original wrapped non-symbol input in [ZH]/[JA] tags here, but
    # that branch compared a fixed string against 0/1 under `if not is_symbol`
    # with is_symbol hard-coded True — provably unreachable, so it is removed.

    stn_tst, clean_text = get_text(text, hps_ms, is_symbol)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([speaker_id]).to(device)
        audio = net_g_ms.infer(
            x_tst, x_tst_lengths, sid=sid,
            noise_scale=noise_scale, noise_scale_w=noise_scale_w,
            length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
    sf.write('speech.wav', audio, 22050)
|
|
|
def create_to_symbol_fn(hps, is_symbol_input, input_text, temp_lang):
    """Return the cleaned symbol string for *input_text*, or '' when symbol
    input is disabled.

    *temp_lang* selects the language tag wrapping: 0 -> [ZH]...[ZH],
    1 -> [JA]...[JA], anything else -> no wrapping.
    """
    if temp_lang == 0:
        wrapped = f'[ZH]{input_text}[ZH]'
    elif temp_lang == 1:
        wrapped = f'[JA]{input_text}[JA]'
    else:
        wrapped = input_text
    if not is_symbol_input:
        return ''
    return _clean_text(wrapped, hps.data.text_cleaners)
|
|
|
# --- Flask web app ----------------------------------------------------------
app = Flask(__name__)


@app.route('/')
def home():
    """Health-check endpoint."""
    return "WOI"


@app.route('/ayaka/')
def ayakaTTS():
    """Synthesize the 'text' query parameter and return a link to the wav.

    Bug fix vs. original: when the 'text' parameter was missing, generate()
    crashed with an AttributeError on None; now returns HTTP 400 instead.
    """
    query = request.args.get('text')
    if not query:
        return jsonify({"error": "missing 'text' query parameter"}), 400
    generate(query)
    return jsonify({"work": query, "url": public_url + "/result"})


@app.route('/result')
def resultAud():
    """Serve the most recently generated wav file."""
    return send_file("speech.wav")


if __name__ == "__main__":
    # Flask dev server on the default host/port (127.0.0.1:5000), which is
    # the port the ngrok tunnel above forwards to.
    app.run()
|
|