import base64
import io
import json
import os
import uuid
from typing import Optional

import gradio as gr
import pandas as pd
import requests
from pydub import AudioSegment

from TTSs.base_tts import Base_TTS


class avaliable_voice_type:
    """One row of the voice spreadsheet, describing a selectable voice.

    Instances are created empty and populated attribute by attribute (see
    ``Volcengine_TTS.get_data_map``). ``repr()`` yields the human-readable
    label that is used both as the dropdown entry and as the lookup key.
    """

    语言: Optional[str] = ""  # language
    场景: Optional[str] = ""  # scenario / use case
    音色名称: str  # display name of the voice (no class-level default)
    voice_type: str  # identifier sent to the API as ``audio.voice_type``
    时间戳支持: bool = False  # timestamp support
    支持情感与风格类型: Optional[str] = ""  # supported emotion / style types
    支持语言类型: Optional[str] = ""  # supported language types

    def __repr__(self) -> str:
        """Return the voice name followed by every non-empty descriptor.

        Parts are joined with ``——``. Attributes are read with ``getattr`` so
        that fields still at their class-level default (which do not appear in
        the instance ``__dict__``) or never assigned at all do not raise.
        """
        parts = [str(getattr(self, "音色名称", ""))]
        for field in ("语言", "场景", "支持情感与风格类型", "支持语言类型"):
            value = getattr(self, field, "")
            if value:
                parts.append(str(value))
        return "——".join(parts)


class Volcengine_TTS(Base_TTS):
    """Text-to-speech backend that calls the Volcengine HTTP TTS endpoint."""

    # (connect, read) timeout in seconds for the synthesis request; without
    # one, ``requests`` waits indefinitely on a stalled connection.
    REQUEST_TIMEOUT = (10, 120)

    def get_name(self):
        """Return the display name of this backend."""
        return '火山引擎'

    def __init__(self):
        """Load the voice table that sits next to this module."""
        # NOTE(review): Base_TTS.__init__ is not called here, matching the
        # original behaviour; confirm whether the base class needs it.
        self.useful_voice = self.get_data_map()

    def get_data_map(self, filename="voice_list.xlsx"):
        """Read the voice spreadsheet and index its rows by display label.

        Args:
            filename: Name of the Excel file, resolved relative to the
                directory containing this module.

        Returns:
            dict mapping ``str(voice)`` to its ``avaliable_voice_type``. Rows
            that produce the same label overwrite each other (last one wins).
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(module_dir, filename)
        # Empty cells become '' so the label builder can test truthiness.
        df = pd.read_excel(path).fillna('')

        useful_voice = {}
        for _, row in df.iterrows():
            data = avaliable_voice_type()
            data.语言 = row['语言']
            data.场景 = row['场景']
            data.音色名称 = row['音色名称']
            data.voice_type = row['voice_type']
            data.时间戳支持 = row['时间戳']
            data.支持情感与风格类型 = row['支持情感/风格类型']
            data.支持语言类型 = row['支持语言类型']
            useful_voice[str(data)] = data
        return useful_voice

    def _get_config_page(self):
        """Build the (initially hidden) Gradio configuration group.

        Returns:
            A ``(group, inputs)`` tuple, where ``inputs`` lists the components
            in the positional order expected by ``_generate`` after ``text``:
            appid, access token, voice, speed, volume, pitch.
        """
        with gr.Group(visible=False) as config_volcengine:
            voices = list(self.useful_voice.keys())
            # An empty voice table must not crash page construction.
            default_voice = voices[0] if voices else None
            with gr.Row():
                volcengine_appid = gr.Textbox(
                    label="volcengine的appid(默认为环境变量值)",
                    placeholder="请输入volcengine的appid",
                    type="password",
                    interactive=True,
                    value=os.environ.get('VOLCENGINE_APPID', ''),
                )
                volcengine_access_token = gr.Textbox(
                    label="volcengine的access_token(默认为环境变量值)",
                    placeholder="请输入volengine的access_token",
                    type="password",
                    interactive=True,
                    value=os.environ.get('VOLCENGINE_ACCESS_TOKEN', ''),
                )
                voice_type = gr.Dropdown(
                    choices=voices,
                    value=default_voice,
                    label="音色选择",
                    interactive=True,
                )
            with gr.Row():
                speed_ratio = gr.Slider(minimum=0.2, maximum=3, value=1, step=0.1,
                                        label="语速", interactive=True)
                volume_ratio = gr.Slider(minimum=0.1, maximum=3, value=1, step=0.1,
                                         label="音量", interactive=True)
                pitch_ratio = gr.Slider(minimum=0.1, maximum=3, value=1, step=0.1,
                                        label="音高", interactive=True)
            with gr.Row():
                # Shown in the UI but intentionally not part of `inputs`: the
                # labels themselves mark these as not yet wired up.
                gr.Textbox(label="情感/风格(还未适配)", placeholder="请输入情感",
                           interactive=True)
                gr.Textbox(label="语言类型(还未适配)", placeholder="请输入语言",
                           interactive=True)

        inputs = [
            volcengine_appid,
            volcengine_access_token,
            voice_type,
            speed_ratio,
            volume_ratio,
            pitch_ratio,
        ]
        return config_volcengine, inputs

    def _generate(self, text, appid, access_token, voice, speed_ratio,
                  volume_ratio, pitch_ratio):
        """Synthesize ``text`` and return the decoded audio.

        Args:
            text: Plain text to synthesize.
            appid: Volcengine application id.
            access_token: Token sent in the ``Authorization`` header.
            voice: Display label of the voice; must be a key of
                ``self.useful_voice``.
            speed_ratio: Value forwarded as ``audio.speed_ratio``.
            volume_ratio: Value forwarded as ``audio.volume_ratio``.
            pitch_ratio: Value forwarded as ``audio.pitch_ratio``.

        Returns:
            A ``pydub.AudioSegment`` decoded from the MP3 returned by the API.

        Raises:
            KeyError: If ``voice`` is not a known label.
            requests.RequestException: On network failure or timeout.
            Exception: If the response JSON has no ``"data"`` field; the
                parsed response is attached as the exception argument.
        """
        host = "openspeech.bytedance.com"
        api_url = f"https://{host}/api/v1/tts"
        header = {"Authorization": f"Bearer;{access_token}"}

        request_json = {
            "app": {
                "appid": appid,
                # NOTE(review): literal placeholder string, kept from the
                # original; the real token travels in the header. Confirm
                # against the Volcengine API docs.
                "token": "access_token",
                "cluster": "volcano_tts",
            },
            "user": {
                "uid": "388808087185088",
            },
            "audio": {
                "voice_type": self.useful_voice[voice].voice_type,
                "encoding": "mp3",
                "speed_ratio": speed_ratio,
                "volume_ratio": volume_ratio,
                "pitch_ratio": pitch_ratio,
            },
            "request": {
                "reqid": str(uuid.uuid4()),
                "text": text,
                "text_type": "plain",
                "operation": "query",
                "with_frontend": 1,
                "frontend_type": "unitTson",
            },
        }

        resp = requests.post(
            api_url,
            data=json.dumps(request_json),
            headers=header,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Parse the body once instead of re-decoding it for every access.
        payload = resp.json()
        if "data" not in payload:
            raise Exception(payload)

        mp3_bytes = base64.b64decode(payload["data"])
        return AudioSegment.from_mp3(io.BytesIO(mp3_bytes))