#!/usr/bin/env python3
# -*- coding: utf-8 -*-
########################################################################
#
# Copyright (c) 2023 Baidu.com, Inc. All Rights Reserved
#
########################################################################
"""
Author: linxiaolong
"""
import warnings
warnings.filterwarnings("ignore")
# Third-party libraries
import argparse
import json
import os
import re
import tempfile

import librosa
import numpy as np
import requests
import torch
from torch import no_grad, LongTensor
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils
# Internal modules
import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence, text_to_sequence_for_test, _clean_text
from text.symbols import symbols
from mel_processing import spectrogram_torch
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
punct_regex = re.compile(r"[\.!\?。!?]")
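# punct_regex matches both ASCII and full-width Japanese sentence enders
# (., !, ?, 。, !, ?); e.g. re.findall(punct_regex, "はい。OK!") == ['。', '!']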
silence_duration = 200  # milliseconds of silence inserted between sentences
def split_text(text, regex):
    """Split text into sentences by punctuation.

    Args:
        text: long text.
        regex: punctuation regex.

    Returns:
        list of sentences.
    """
    sentences = re.split(regex, text)
    puncts = re.findall(regex, text)
    for i, sentence in enumerate(sentences):
        if sentence == "":
            continue
        if i < len(puncts):
            sentences[i] = sentences[i] + puncts[i]
        else:
            # The trailing fragment has no punctuation of its own; close it with "。".
            sentences[i] = sentences[i] + "。"
    sentences = [i for i in sentences if i != ""]
    return sentences
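# A quick sanity check of split_text (illustrative; output depends on punct_regex
# above). Each sentence keeps its own punctuation, and a trailing fragment
# without punctuation is closed with "。":
#     >>> split_text("今日は晴れです。散歩しましょう", punct_regex)
#     ['今日は晴れです。', '散歩しましょう。']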
def concat_audio(audio_list, sampling_rate=22050, silence_duration=1000):
    """Concatenate audio clips, inserting silence between them.

    Args:
        audio_list: list of audio arrays.
        sampling_rate: audio sampling rate. Defaults to 22050.
        silence_duration: silence duration in milliseconds. Defaults to 1000.

    Returns:
        concatenated audio.
    """
    silence_samples = int(sampling_rate * silence_duration / 1000)
    # Match the float32 dtype of the synthesized waveforms.
    silence = np.zeros(silence_samples, dtype=np.float32)
    audio_num = len(audio_list)
    if audio_num < 2:
        return audio_list[0]
    audio_cat = audio_list[0]
    for i in range(1, audio_num):
        audio_cat = np.concatenate((audio_cat, silence, audio_list[i]), axis=0)
    return audio_cat
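# Example of the silence arithmetic: at 22050 Hz, the module-level
# silence_duration of 200 ms becomes int(22050 * 200 / 1000) = 4410 zero samples
# between consecutive clips, so two 1-second clips concatenate to
# 22050 + 4410 + 22050 = 48510 samples.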
### Hyperparameters for external TTS services
microsoft_url = "https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1"
microsoft_headers = {'Content-Type': 'application/json; charset=utf-8',
'Ocp-Apim-Subscription-Key':'1f1ef0ce53b84261be94fab81df7e628'}
microsoft_model_list = [
"ja-JP-NanamiNeural",
"ja-JP-KeitaNeural",
"ja-JP-AoiNeural",
"ja-JP-DaichiNeural",
"ja-JP-MayuNeural",
"ja-JP-NaokiNeural",
"ja-JP-ShioriNeural"
]
google_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/voice_gq"
google_headers = {'Content-Type': 'application/json; charset=utf-8',
'apikey':'synclub-2383kjhjksxfv.2341gs'}
google_model_list = [
"ja-JP-Neural2-B",
"ja-JP-Neural2-C",
"ja-JP-Neural2-D",
"ja-JP-Standard-A",
"ja-JP-Standard-B",
"ja-JP-Standard-C",
"ja-JP-Standard-D",
"ja-JP-Wavenet-A",
"ja-JP-Wavenet-B",
"ja-JP-Wavenet-C",
"ja-JP-Wavenet-D"
]
coefont_url = "http://gbu.jp02-a30-apisix-sandbox.baidu-int.com/gbu/rest/v2/tts/avatar_coe"
coefont_headers = {'Content-Type': 'application/json; charset=utf-8',
'apikey':'synclub-2383kjhjksxfv.2341gs'}
coefont_id = [
'3f84b7b1-30fb-4677-a704-fd136515303e',
'9b826785-bea5-4740-b4cd-e9a286264705',
'7632cba3-4aca-4cee-9d15-ad1ac31f670c',
'2c91238a-96f9-4cb6-a69a-461ee66b0e6d',
'08428dee-65b6-490e-a3a3-60dfcdda889d',
'c88367bc-5954-426b-a1ba-a683202803c8',
'fb64a764-91d5-4510-bddd-70df3d62709a',
'5cfa1f33-bca8-4489-bcbe-701045993162',
'94cf7792-7c0c-4be4-88e7-c30d26ab6616',
'81dbd387-6ad6-4b22-93f9-4e2a0091b2fe',
'931a8568-039a-4cef-add7-bee71629c00e',
'f91a9d29-c8b4-443f-ba07-82e7e36bd20b',
'23c76cf0-bee0-47fa-b735-9b7bdba9f26a',
'cf5fdfb8-85ea-41e1-915b-257936791f17',
'0f7b53df-3c24-46a5-84d1-cbea39a956c0',
'3d499385-d331-4cbb-93c0-2057e60eddcf',
'18ca2f7b-97ca-486d-8f47-858965833642',
'33e0a2ff-5050-434c-9506-defe97e52f15',
'516b0f32-8b5f-48c5-b60e-38d508e2b06b',
'c8720caf-2d2d-4130-8831-92f61f9e25e8',
'710001f5-e6f5-4cc0-8ba2-e6aa6da8d807',
'd36f8bb1-8bd8-4e90-964a-9dbd3e374093',
'2157796c-fe48-4688-b7cc-7ea554edf77d',
'5cc0dc91-0c6a-4c50-b7d8-f3117cfe44ef',
'be5c5295-aba2-4055-a9da-8926da7fb5a0',
'76763239-af14-4c0d-9435-956f096f77dc',
'10d298ee-ebbf-4838-a6c5-d608f2e3c338',
'694cb06e-73bd-43c4-94d4-f775ad3dbb26',
'5cf07e7c-5b1c-4360-a8de-7c928580d4b5',
'76e2ba06-b23a-4bbe-8148-e30ede9001b9',
'c25ed97f-78f7-4e8f-b2fa-f8e29633588b',
'e26382ba-2ae2-4cf7-8c1b-420ab4b845d8',
'82c4fcf5-d0ee-4fe9-9b0d-89a65d04f290'
]
coefont_model_list = [
'Canel',
'胡麻ちゃん',
'バーチャル悪霊',
'引寄\u3000法則',
'にっし~☆',
'志水 智(Tomo Shimizu)',
'花撫シア-最高精度-しっかり読み上げ',
'UNF/UserNotFound',
'RoBaKu',
'おにもち',
'小菅 将太',
'秋月つむぎ(落ち着いたナレーション)',
'碧海紘斗_OhmiHiroto',
'ちくわぶえ',
'unnamed',
'今井瑶子(高精度。MC ナレーター 落ち着いたトーンです)',
'皆のお母さん',
'後藤邑子',
'田中和彦',
'KTNR',
'天渡\u3000早苗',
'須戸ゼロ',
'とり藻々',
'武田 祐子',
'【PRO】落ち着きナレーション♯畑耕平',
'音暖ののん Ver2.0(最高精度)',
'ろさちゃん-soft-v2[最高精度] ¦ Losa-chan -soft- ∀ -汎用式概念χ',
'パイナップル秀夫お姉さん',
'minamo',
'あさのゆき',
'聲華 琴音【紡】',
'黄琴海月【うるとら】',
'高橋 俊輔']
coefont_id_model_name_dict = dict(zip(coefont_model_list, coefont_id))
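# The dict maps a display name to its CoeFont voice ID, e.g. (first entries of
# the two lists above):
#     >>> coefont_id_model_name_dict['Canel']
#     '3f84b7b1-30fb-4677-a704-fd136515303e'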
all_example = "今日は天気がいいから、一緒にハイキングに行きましょう。"
# def audio_postprocess(self, y):
# """
# Override Gradio's audio post-processing function
# :param self:
# :param y:
# :return:
# """
# if y is None:
# return None
# if gr_utils.validate_url(y):
# file = gr_processing_utils.download_to_file(y, dir=self.temp_dir)
# elif isinstance(y, tuple):
# sample_rate, data = y
# file = tempfile.NamedTemporaryFile(
# suffix=".wav", dir=self.temp_dir, delete=False
# )
# gr_processing_utils.audio_to_file(sample_rate, data, file.name)
# else:
# file = gr_processing_utils.create_tmp_copy_of_file(y, dir=self.temp_dir)
# return gr_processing_utils.encode_url_or_file_to_base64(file.name)
# gr.Audio.postprocess = audio_postprocess
def get_text(text, hps):
    """Convert raw text into a tensor of symbol IDs.

    :param text: input text.
    :param hps: hyperparameters loaded from the model config.
    :return: LongTensor of symbol IDs.
    """
    # hps does not include a symbol table here.
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    # Use this variant instead when hps carries its own symbol table:
    # text_norm = text_to_sequence_for_test(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm
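# When add_blank is set, commons.intersperse pads a blank token (ID 0) around
# every symbol, which VITS uses to stabilize alignment; e.g. (illustrative IDs):
#     >>> commons.intersperse([5, 3, 9], 0)
#     [0, 5, 0, 3, 0, 9, 0]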
def create_tts_fn(model, hps):
    """Build a Gradio-compatible TTS callback bound to one model.

    :param model: loaded SynthesizerTrn instance.
    :param hps: hyperparameters loaded from the model config.
    :return: tts_fn closure.
    """
    def tts_fn(text, speed, noise_scale=.667, noise_scale_w=0.8, volume=1.0):
        """Synthesize text sentence by sentence and concatenate the results.

        :param text: input text.
        :param speed: speaking-rate multiplier (length_scale = 1 / speed).
        :param noise_scale: sampling temperature for the prior.
        :param noise_scale_w: sampling temperature for the duration predictor.
        :param volume: linear gain applied to the final waveform.
        :return: (status message, (sampling_rate, audio)).
        """
        sentences = split_text(text, punct_regex)
        audio_list = []
        for sentence in sentences:
            stn_tst = get_text(sentence, hps)
            with no_grad():
                x_tst = stn_tst.unsqueeze(0).to(device)
                x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
                audio = model.infer(x_tst, x_tst_lengths, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                                    length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
            audio_list.append(audio)
            del stn_tst, x_tst, x_tst_lengths
        audio = concat_audio(audio_list, hps.data.sampling_rate, silence_duration)
        audio = audio * volume
        return "Success", (hps.data.sampling_rate, audio)
    return tts_fn
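# Usage sketch: create_tts_fn closes over one loaded model, so each in-house
# tab below gets an independent callback, e.g.
#     tts_fn = create_tts_fn(model, hps)
#     status, (sr, wav) = tts_fn("こんにちは。", speed=1.0)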
def microsoft(text, name, style="Neural"):
    """Request synthesis from the Microsoft TTS gateway.

    :param text: input text.
    :param name: voice name, e.g. "ja-JP-NanamiNeural".
    :param style: speaking style.
    :return: (status message, audio URL).
    """
    # Direct Azure SSML call, kept for reference: its response was never used,
    # and the hard-coded intranet proxy fails outside the internal network.
    # headers = {
    #     'Ocp-Apim-Subscription-Key': '1f1ef0ce53b84261be94fab81df7e628',
    #     'Content-Type': 'application/ssml+xml',
    #     'X-Microsoft-OutputFormat': 'audio-16khz-128kbitrate-mono-mp3',
    #     'User-Agent': 'curl',
    # }
    # data = ("<speak version='1.0' xml:lang='en-US'>"
    #         f"<voice xml:lang='en-US' name='{name}'>"  # xml:gender='Female'
    #         f"{text}"
    #         "</voice>"
    #         "</speak>")
    # requests.post('https://japaneast.tts.speech.microsoft.com/cognitiveservices/v1',
    #               headers=headers, data=data,
    #               proxies={'http': 'http://192.168.3.11:80',
    #                        'https': 'http://192.168.3.11:80'})
    data = {
        "text": text,
        "name": name,
        "style": style,
        "format": "mp3"}
    audio_url = requests.get(microsoft_url, headers=microsoft_headers, json=data).json()['data']['url']
    return "Success", audio_url
def google(text, name):
    """Request synthesis from the Google TTS gateway.

    :param text: input text.
    :param name: voice name, e.g. "ja-JP-Neural2-B".
    :return: (status message, audio URL).
    """
    data = {
        "text": text,
        "name": name,
        "sample_rate": 16000}
    audio_url = requests.get(google_url, headers=google_headers, json=data).json()['data']['url']
    return "Success", audio_url
def coefont(text, name):
    """Request synthesis from the CoeFont TTS gateway.

    :param text: input text.
    :param name: display name of the voice, mapped to its CoeFont ID.
    :return: (status message, audio URL).
    """
    data = {
        "text": text,
        "coefont": coefont_id_model_name_dict[name]
    }
    audio_url = requests.get(coefont_url, headers=coefont_headers, json=data).json()['data']['url']
    return "Success", audio_url
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    parser.add_argument("--port", type=int, default=8080, help="port")
    parser.add_argument('--model_info_path', type=str, default='/gluster/speech_data/info.json')
    args = parser.parse_args()
    device = torch.device(args.device)
    models_tts = []
    with open(args.model_info_path, "r", encoding="utf-8") as f:
        models_info = json.load(f)
    for i, info in models_info.items():
        model_name = info["model_name"]
        author = info["author"]
        lang = info["lang"]
        example = info["example"]
        config_path = info["config_path"]
        model_path = info["model_path"]
        model_type = info["model_type"]
        hps = utils.get_hparams_from_file(config_path)
        if model_type == "vits":
            emotion_type = None
        elif model_type == "vits-emotion":
            emotion_type = "embedding"
        elif model_type == "vits-emotion-logits":
            emotion_type = "logits"
        else:
            raise ValueError(f"Unknown model_type: {model_type}")
        model = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            emotion_type=emotion_type,
            **hps.model)
        utils.load_checkpoint(model_path, model, None)
        model.eval().to(device)
        if model_type == "vits":
            # Plain (non-emotion) TTS
            models_tts.append((model_name, author, lang, example, create_tts_fn(model, hps)))
    app = gr.Blocks()
    with app:
        gr.Markdown("## Japanese TTS Demo")
        with gr.Tabs():
            with gr.TabItem("In-house"):
                with gr.Tabs():
                    for i, (model_name, author, lang, example, tts_fn) in enumerate(models_tts):
                        with gr.TabItem(model_name):
                            with gr.Column():
                                tts_input1 = gr.TextArea(label="Text", value=example)
                                tts_input2 = gr.Slider(label="Speed", value=1.0, minimum=0.4, maximum=3, step=0.1)
                                tts_input3 = gr.Slider(label="noise_scale", value=0.0, minimum=0.0, maximum=2, step=0.1)
                                tts_input4 = gr.Slider(label="noise_scale_w", value=0.0,
                                                       minimum=0.0, maximum=2, step=0.1)
                                tts_input5 = gr.Slider(label="volume", value=1.0, minimum=0.1, maximum=4, step=0.1)
                                tts_submit = gr.Button("Generate", variant="primary")
                                tts_output1 = gr.Textbox(label="Output Message")
                                tts_output2 = gr.Audio(label="Output Audio")
                                tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, tts_input4, tts_input5],
                                                 [tts_output1, tts_output2])
            with gr.TabItem("Google"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(google_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(google, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])
            with gr.TabItem("Microsoft"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(microsoft_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(microsoft, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])
            with gr.TabItem("CoeFont"):
                tts_input1 = gr.TextArea(label="Text", value=all_example)
                tts_input2 = gr.Dropdown(coefont_model_list, label="name")
                tts_submit = gr.Button("Generate", variant="primary")
                tts_output1 = gr.Textbox(label="Output Message")
                tts_output2 = gr.Audio(label="Output Audio")
                tts_submit.click(coefont, [tts_input1, tts_input2],
                                 [tts_output1, tts_output2])
    app.queue(concurrency_count=5).launch(show_api=False,
                                          share=args.share,
                                          server_name='0.0.0.0',
                                          server_port=args.port,
                                          show_error=True)
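# Launch sketch (flags defined in the argparse block above):
#     python ensemble_app.py --model_info_path /gluster/speech_data/info.json --port 8080 --share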