import os
import re
import time

import torch
from scipy.io.wavfile import write

import utils
from models import SynthesizerTrn
from text import text_to_sequence

# - paths
path_to_config = "./config.json"  # path to config .json
path_to_model = "./best.pth"      # path to G_xxxx.pth checkpoint

# - text input
input_text = "소프트웨어 교육의 중요성이 날로 더해가는데 학생들은 소프트웨어 관련 교육을 쉽게 지루해해요."

# check device
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# symbol set (must match the one used during training)
_pad = '_'
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
SPACE_ID = symbols.index(" ")


def intersperse(lst, item):
    # insert `item` between every element of `lst` (and at both ends)
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def get_text(text, hps):
    # convert raw text to a LongTensor of symbol IDs, optionally interspersed with blanks
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm


def vcss(inputstr):  # single-speaker inference
    print('text:', inputstr)
    fltstr = re.sub(r"[\[\]\(\)\{\}]", "", inputstr)  # strip brackets before cleaning
    stn_tst = get_text(fltstr, hps)

    speed = 1
    output_dir = 'output'
    os.makedirs(output_dir, exist_ok=True)
    start_time = time.time()
    with torch.no_grad():
        x_tst = stn_tst.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8,
                            length_scale=1 / speed)[0][0, 0].data.cpu().float().numpy()
    write(f'./{output_dir}/tts_output.wav', hps.data.sampling_rate, audio)
    print(f'./{output_dir}/tts_output.wav generated!')
    end_time = time.time()
    print("RunTime: {} sec".format(end_time - start_time))


def vcms(inputstr, sid):  # multi-speaker inference
    print('text:', inputstr)
    fltstr = re.sub(r"[\[\]\(\)\{\}]", "", inputstr)
    # fltstr = langdetector(fltstr)  # optional, for cjke/cjks-type cleaners
    stn_tst = get_text(fltstr, hps)

    speed = 1
    output_dir = 'output'
    os.makedirs(output_dir, exist_ok=True)
    start_time = time.time()
    with torch.no_grad():
        x_tst = stn_tst.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([sid]).to(device)
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
                            length_scale=1 / speed)[0][0, 0].data.cpu().float().numpy()
    write(f'./{output_dir}/output.wav', hps.data.sampling_rate, audio)
    print(f'./{output_dir}/output.wav generated!')
    end_time = time.time()
    print("RunTime: {} sec".format(end_time - start_time))


hps = utils.get_hparams_from_file(path_to_config)

if "use_mel_posterior_encoder" in hps.model.keys() and hps.model.use_mel_posterior_encoder:
    print("Using mel posterior encoder for VITS2")
    posterior_channels = 80  # vits2
    hps.data.use_mel_posterior_encoder = True
else:
    print("Using lin posterior encoder for VITS1")
    posterior_channels = hps.data.filter_length // 2 + 1
    hps.data.use_mel_posterior_encoder = False

net_g = SynthesizerTrn(
    len(symbols),
    posterior_channels,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,  # > 0 for multi-speaker models
    **hps.model).to(device)
_ = net_g.eval()
_ = utils.load_checkpoint(path_to_model, net_g, None)

vcss(input_text)
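

# --- Multi-speaker usage sketch (assumption): if the loaded checkpoint was trained with
# n_speakers > 0, inference should go through vcms() with a speaker id instead of vcss().
# The speaker id value 0 below is purely illustrative; valid ids depend on how the model
# was trained and are not defined by this script.
#
# if hps.data.n_speakers > 0:
#     vcms(input_text, sid=0)
# else:
#     vcss(input_text)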