Spaces:
Runtime error
Runtime error
File size: 4,939 Bytes
3b92d66 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
from .tts import TextToMel, MelToWav
from .transliterate import XlitEngine
from .num_to_word_on_sent import normalize_nums
import re
import numpy as np
from scipy.io.wavfile import write
from mosestokenizer import *
from indicnlp.tokenize import sentence_tokenize
import argparse
# Language codes that split_sentences() routes to the indicnlp sentence splitter.
_INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]
# Language codes for which normalize_text() rewrites '|' and '.' to '।'.
_PURAM_VIRAM_LANGUAGES = ["hi", "or", "bn", "as"]
# Language codes for which load_all_models() creates no XlitEngine and
# run_tts() skips the transliteration step.
_TRANSLITERATION_NOT_AVAILABLE_IN = ["en","or"]
# Disabled placeholder for a similar number-to-words exclusion list (unused).
#_NUM2WORDS_NOT_AVAILABLE_IN = []
def normalize_text(text, lang):
    """Unify sentence terminators for languages that end sentences with '।'.

    When ``lang`` is listed in ``_PURAM_VIRAM_LANGUAGES`` every '|' and every
    '.' in ``text`` is replaced by '।'. For any other language ``text`` is
    returned unchanged.
    """
    if lang not in _PURAM_VIRAM_LANGUAGES:
        return text
    return text.replace('|', '।').replace('.', '।')
def split_sentences(paragraph, language):
    """Split ``paragraph`` into a list of sentences.

    Args:
        paragraph: Text to split.
        language: Language code. "en" is split with ``MosesSentenceSplitter``;
            codes listed in ``_INDIC`` are split with indicnlp's
            ``sentence_tokenize.sentence_split``.

    Returns:
        The sentences produced by the matching splitter. For any other
        language code the paragraph is returned as a single-element list.
    """
    if language == "en":
        with MosesSentenceSplitter(language) as splitter:
            return splitter([paragraph])
    if language in _INDIC:
        return sentence_tokenize.sentence_split(paragraph, lang=language)
    # Previously this fell through and returned None, so callers that iterate
    # the result failed with a TypeError. Treat the text as one sentence.
    return [paragraph]
def load_models(acoustic, vocoder, device):
    """Instantiate one acoustic-model / vocoder pair.

    ``acoustic`` is passed to ``TextToMel`` as ``glow_model_dir`` and
    ``vocoder`` to ``MelToWav`` as ``hifi_model_dir``; both receive
    ``device``. Returns the pair as ``(text_to_mel, mel_to_wav)``.
    """
    return (
        TextToMel(glow_model_dir=acoustic, device=device),
        MelToWav(hifi_model_dir=vocoder, device=device),
    )
def translit(text, lang):
    """Transliterate the Latin-script words of ``text`` into ``lang``.

    ``text`` is split on whitespace. Each word that begins with an ASCII
    letter is replaced by the top candidate that the module-level ``engine``
    returns for ``lang``; every other word is kept as it is. The words are
    joined back together with single spaces.
    """
    starts_with_latin = re.compile(r'[a-zA-Z]').match
    converted = []
    for token in text.split():
        if starts_with_latin(token):
            token = engine.translit_word(token, topk=1)[lang][0]
        converted.append(token)
    return ' '.join(converted)
def run_tts(text, lang, args):
    """Synthesize audio for a single piece of text.

    Uses the module-level ``text_to_mel`` and ``mel_to_wav`` models, which
    ``run_tts_paragraph`` selects before calling this function.

    Args:
        text: Text to synthesize.
        lang: Language code of ``text``.
        args: Object providing ``number_conversion``, ``transliteration``,
            ``noise_scale`` and ``length_scale``.

    Returns:
        ``(sr, audio)`` as produced by ``mel_to_wav.generate_wav``.
    """
    if lang == 'hi':
        text = text.replace('।', '.')  # only for hindi models

    # endswith() instead of text[-1]: indexing raised IndexError on empty text.
    if lang == 'en' and not text.endswith('.'):
        text = text + '. '

    if args.number_conversion == 1 and lang != 'en':
        print("Doing number conversion")
        # converting numbers to words in lang
        text_num_to_word = normalize_nums(text, lang)
    else:
        text_num_to_word = text

    if args.transliteration == 1 and lang not in _TRANSLITERATION_NOT_AVAILABLE_IN:
        print("Doing transliteration")
        # transliterating english words to lang
        text_num_to_word_and_transliterated = translit(text_num_to_word, lang)
    else:
        text_num_to_word_and_transliterated = text_num_to_word

    final_text = ' ' + text_num_to_word_and_transliterated
    print(final_text)

    mel = text_to_mel.generate_mel(final_text, args.noise_scale, args.length_scale)
    audio, sr = mel_to_wav.generate_wav(mel)
    return sr, audio
def run_tts_paragraph(args):
    """Synthesize ``args.text`` and optionally write it to a WAV file.

    Selects the active model pair from ``text_to_mel_list`` and
    ``mel_to_wav_list`` (index 1 when ``args.gender == 'Male'``, index 0
    otherwise) and stores it in the module-level ``text_to_mel`` and
    ``mel_to_wav`` that ``run_tts`` reads.

    When ``args.split_sentences == 1`` the text is normalized, split into
    sentences, synthesized sentence by sentence and concatenated. Otherwise
    the whole text is synthesized in one call.

    Args:
        args: Object providing ``text``, ``lang``, ``split_sentences``,
            ``wav`` and the attributes ``run_tts`` needs. ``gender`` is
            optional.

    Returns:
        ``(sr, audio)`` for the synthesized text.

    Raises:
        ValueError: If sentence splitting yields no sentences.
    """
    global text_to_mel
    global mel_to_wav

    # ``gender`` may be absent from ``args``; a missing attribute selects the
    # first model pair instead of raising AttributeError.
    model_index = 1 if getattr(args, 'gender', None) == 'Male' else 0
    text_to_mel = text_to_mel_list[model_index]
    mel_to_wav = mel_to_wav_list[model_index]

    if args.split_sentences == 1:
        text = normalize_text(args.text, args.lang)
        audio_list = []
        for sent in split_sentences(text, args.lang):
            sr, sentence_audio = run_tts(sent, args.lang, args)
            audio_list.append(sentence_audio)
        if not audio_list:
            # np.concatenate would raise a less specific ValueError here.
            raise ValueError("no sentences to synthesize")
        audio = np.concatenate(audio_list)
    else:
        sr, audio = run_tts(args.text, args.lang, args)

    if args.wav:
        write(filename=args.wav, rate=sr, data=audio)
    return (sr, audio)
def load_all_models(args):
    """Load the transliteration engine and all TTS model pairs into globals.

    Sets the module-level ``engine`` (only when ``args.lang`` is not in
    ``_TRANSLITERATION_NOT_AVAILABLE_IN``), ``text_to_mel_list`` and
    ``mel_to_wav_list``. ``args.acoustic`` and ``args.vocoder`` are
    comma-separated lists that are paired up position by position.

    Also tries to convert ``args.noise_scale`` and ``args.length_scale`` to
    float in place, and finally prints ``args``.

    Args:
        args: Object providing ``lang``, ``acoustic``, ``vocoder``,
            ``device``, ``noise_scale`` and ``length_scale``.
    """
    global engine
    if args.lang not in _TRANSLITERATION_NOT_AVAILABLE_IN:
        engine = XlitEngine(args.lang)  # loading translit model globally

    global text_to_mel_list
    global mel_to_wav_list

    text_to_mel_list = []
    mel_to_wav_list = []

    for acoustic, vocoder in zip(args.acoustic.split(','), args.vocoder.split(',')):
        ttm, mtw = load_models(acoustic, vocoder, args.device)
        text_to_mel_list.append(ttm)
        mel_to_wav_list.append(mtw)

    try:
        args.noise_scale = float(args.noise_scale)
        args.length_scale = float(args.length_scale)
    except (TypeError, ValueError):
        # Best effort: values that cannot be parsed are left as given.
        # Narrowed from a bare ``except`` so unrelated errors are not hidden.
        pass

    print(args)
# Command-line entry point: parse options, load the models, synthesize the text.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--acoustic", required=True, type=str)
    parser.add_argument("-v", "--vocoder", required=True, type=str)
    parser.add_argument("-d", "--device", type=str, default="cpu")
    parser.add_argument("-t", "--text", type=str, required=True)
    parser.add_argument("-w", "--wav", type=str, required=True)
    parser.add_argument("-n", "--noise-scale", default='0.667', type=str )
    parser.add_argument("-l", "--length-scale", default='1.0', type=str)
    parser.add_argument("-T", "--transliteration", default=1, type=int)
    parser.add_argument("-N", "--number-conversion", default=1, type=int)
    parser.add_argument("-S", "--split-sentences", default=1, type=int)
    parser.add_argument("-L", "--lang", type=str, required=True)
    # run_tts_paragraph() reads args.gender; without this option the script
    # failed with AttributeError. The default selects the first model pair.
    parser.add_argument(
        "-g", "--gender", type=str, default=None,
        help="'Male' selects the second acoustic/vocoder pair; "
             "any other value selects the first",
    )
    args = parser.parse_args()
    load_all_models(args)
    run_tts_paragraph(args)
|