"""Gradio front end for an online text-to-speech service.

The page has two tabs: one to pick characters (voices) from a gallery and
synthesize speech with them, and one to submit a new voice for review.
"""

import logging
import os
import pickle
import sys
import time

import gradio as gr
import numpy as np
import pandas as pd
from pypinyin import lazy_pinyin

from i18n import gettext, Translate
from api import generate_api, get_audio, generate_voice, load_characters_csv
from utils import get_length

# Location of the translation file.
trans_file = os.path.join(os.path.dirname(__file__), "i18n", "translations.json")

# Silence aiohttp's DEBUG logging.
logging.getLogger("aiohttp").setLevel(logging.WARNING)
# logging.getLogger("gradio").setLevel(logging.WARNING)

# Log records carry a timestamp.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

header = """header"""
terms = "terms"

# Maximum number of characters that can be queued for one synthesis.
MAX_SELECTED_CHARACTERS = 5
# Minimum delay, in seconds, between two voice-creation submissions.
CREATE_COOLDOWN_SECONDS = 30
# Emotion labels treated as the default choice when a character is selected.
NEUTRAL_EMOTIONS = ("正常", "보통", "normal")

# User-facing messages, keyed by message id and then by language code.
_MESSAGES = {
    "select_character": {
        "zh": "请先选择一个角色",
        "en": "Please select a character first",
        "ja": "まず、キャラクターを選択してください",
        "ko": "먼저 캐릭터를 선택하세요",
    },
    "voice_not_created": {
        "zh": "该角色暂未创建语音",
        "en": "The character has not been created yet",
        "ja": "そのキャラクターの音声はまだ作成されていません",
        "ko": "해당 캐릭터의 음성이 아직 생성되지 않았습니다",
    },
    "empty_text": {
        "zh": "请输入需要合成的文本",
        "en": "Please enter the text to be synthesized",
        "ja": "合成するテキストを入力してください",
        "ko": "합성할 텍스트를 입력하세요",
    },
    "text_too_long": {
        "zh": "长度请控制在1024个字符以内",
        "en": "The text length exceeds 1024 words",
        "ja": "テキストの長さが1024文字を超えています",
        "ko": "텍스트 길이가 1024자를 초과합니다",
    },
    "cost_time": {
        "zh": "合成共花费{seconds:.2f}秒",
        "en": "Total time spent synthesizing: {seconds:.2f} seconds",
        "ja": "合成にかかった時間: {seconds:.2f}秒",
        "ko": "합성에 소요된 시간: {seconds:.2f}초",
    },
    "cooldown": {
        "zh": "已提交上个创建请求,请在{seconds:.1f}秒后提交新的角色",
        "en": (
            "The last creation request has been submitted. Please try to "
            "create a new character after {seconds:.1f} seconds"
        ),
        "ja": (
            "前回の作成リクエストが送信されました。"
            "{seconds:.1f}秒後に新しいキャラクターを作成してください"
        ),
        "ko": "이전 생성 요청이 제출되었습니다. {seconds:.1f}초 후에 새 캐릭터를 만들어주세요",
    },
    "incomplete": {
        "zh": "请填写完整信息",
        "en": "Please fill in all the information",
        "ja": "すべての情報を入力してください",
        "ko": "모든 정보를 입력하세요",
    },
    "bad_duration": {
        "zh": "音频时长请控制在3.2-8秒之间",
        "en": "The audio duration should be between 3.2 and 8 seconds",
        "ja": "音声の長さは3.2秒から8秒の間にしてください",
        "ko": "음성 길이는 3.2초에서 8초 사이로 설정해야 합니다",
    },
    "created": {
        "zh": "创建成功,您创建的语音将在审核后上线",
        "en": "Creation successful. The voice you created will be available after review.",
        "ja": "作成が完了しました。作成された音声は審査後に公開されます。",
        "ko": "생성 완료. 귀하가 생성한 음성은 검토 후 공개될 예정입니다.",
    },
}


def _message(key, lang, **values):
    """Return the message `key` in `lang`, falling back to English.

    Keyword arguments are substituted into the template with `str.format`.
    """
    translations = _MESSAGES[key]
    template = translations.get(lang, translations["en"])
    return template.format(**values) if values else template


def _build_gallery(characters):
    """Create a fresh character gallery showing `characters`.

    A new component is built every time on purpose: returning it from an
    event handler is what clears the gallery's previous selection, which a
    plain list of values would not do.
    """
    return gr.Gallery(
        value=[[char["头像"], char["名称"]] for char in characters],
        show_label=False,
        elem_id="character_gallery",
        columns=[11],
        object_fit="contain",
        height="auto",
        interactive=False,
        allow_preview=False,
        selected_index=None,
    )


def _decode_avatar(avatar):
    """Return a displayable avatar; non-string values are unpickled if possible."""
    if isinstance(avatar, str):
        return avatar
    try:
        # SECURITY NOTE(review): pickle.loads executes arbitrary code when fed
        # untrusted bytes. This is only safe while the character table comes
        # from a trusted source; confirm before accepting external data.
        return pickle.loads(bytes(avatar))
    except Exception:
        # Best effort: anything that is not a valid pickle is passed through.
        return avatar


def update_all_characters(lang, category):
    """Reload the character table for `lang` and reset the gallery.

    Args:
        lang: Language code passed to `load_characters_csv`.
        category: Current category list; ignored and replaced by the list
            loaded for `lang`.

    Returns:
        A tuple of (character table, characters of the first category,
        rebuilt gallery, category list, update for the category dropdown).
    """
    new_characters, category = load_characters_csv(lang)
    initial_characters = get_characters(kind=category[0], all_characters=new_characters)
    return (
        new_characters,
        initial_characters,
        _build_gallery(initial_characters),
        category,
        gr.update(choices=category, value=category[0]),
    )


def get_characters(
    query=None, page=1, per_page=400, kind="原神", lang="zh", all_characters=None
):
    """Return one record per character name for a category, optionally filtered.

    Args:
        query: Text searched for in the character name (case-insensitive,
            matched literally). Empty or None disables filtering.
        page: 1-based page number.
        per_page: Number of records per page.
        kind: Value of the "类别" column to keep.
        lang: When "zh" and the plain-name search finds nothing, the search is
            retried against the pinyin spelling of each name.
        all_characters: DataFrame holding the character table.

    Returns:
        A list of record dicts, sorted by the "id" column.
    """
    in_kind = all_characters[all_characters["类别"] == kind]
    filtered_characters = in_kind
    if query:
        # regex=False: the query is user input, so characters such as "(" must
        # not be interpreted as a pattern. na=False keeps the mask boolean.
        filtered_characters = in_kind[
            in_kind["名称"].str.contains(query, case=False, regex=False, na=False)
        ]
        if filtered_characters.empty and lang == "zh":
            # Fall back to matching the pinyin spelling of the name.
            pinyin_names = in_kind["名称"].apply(lambda x: "".join(lazy_pinyin(x)))
            filtered_characters = in_kind[
                pinyin_names.str.contains(query, case=False, regex=False, na=False)
            ]

    # Group by name and keep the first record of each group.
    unique_characters = (
        filtered_characters.groupby("名称").first().reset_index().sort_values(by="id")
    )
    unique_characters["头像"] = unique_characters["头像"].apply(_decode_avatar)

    # Apply pagination.
    start_index = (page - 1) * per_page
    end_index = start_index + per_page
    return unique_characters.iloc[start_index:end_index].to_dict("records")


async def generate(selected_character=None, selected_characters=None, text="", lang="zh"):
    """Synthesize `text` with the selected voices.

    Args:
        selected_character: Character currently highlighted in the gallery, if any.
        selected_characters: Characters previously added to the selection list.
        text: Text to synthesize.
        lang: Language code used for user-facing messages.

    Returns:
        A tuple of (audio returned by `generate_api`, elapsed-time message).

    Raises:
        gr.Error: If nothing is selected, no voice id is available, the text is
            empty or too long, or `generate_api` returns an error string.
    """
    selected_characters = list(selected_characters or [])
    if selected_character:
        characters = [selected_character] + selected_characters
    else:
        characters = selected_characters
    if not selected_character and not selected_characters:
        raise gr.Error(_message("select_character", lang))

    voice_ids = [char.get("voice_id") for char in characters if char.get("voice_id")]
    if not voice_ids:
        raise gr.Error("所选角色没有关联的 voice_id")

    start_time = time.time()
    # The id "1" is treated as "no voice created yet". The check has to look at
    # the individual ids: comparing the whole list with "1" is never true.
    if any(voice_id == "1" for voice_id in voice_ids):
        raise gr.Error(_message("voice_not_created", lang))
    if text == "":
        raise gr.Error(_message("empty_text", lang))
    if get_length(text) > 1024:
        raise gr.Error(_message("text_too_long", lang))

    audio = await generate_api(voice_ids, text)
    cost_time = _message("cost_time", lang, seconds=time.time() - start_time)

    # generate_api signals failure by returning the error text as a string.
    if isinstance(audio, str):
        logger.error(audio)
        raise gr.Error(audio)
    return audio, cost_time


def get_character_emotions(character, all_characters):
    """Return one record per distinct emotion recorded for `character`.

    If the table has no record with the character's name, a single placeholder
    record with the emotion "默认情绪" is returned instead.
    """
    same_name = all_characters[all_characters["名称"] == character["名称"]]
    character_infos = same_name.drop_duplicates(subset=["情绪"]).to_dict("records")
    if character_infos:
        return character_infos
    return [{"名称": character["名称"], "情绪": "默认情绪"}]


def update_character_info(character_name, emotion, current_character, all_characters):
    """Switch the current character to the record matching the chosen emotion.

    Always returns a `(current character, character table)` pair so that both
    outputs bound to this handler receive a value:

    * no character name: the current character is cleared (None);
    * no emotion, or no matching record: the current character is unchanged;
    * otherwise: the first matching record as a dict.
    """
    if not character_name:
        return None, all_characters
    if not emotion:
        return current_character, all_characters

    matches = all_characters[
        (all_characters["名称"] == character_name)
        & (all_characters["情绪"] == emotion)
    ]
    if matches.empty:
        return current_character, all_characters
    return matches.iloc[0].to_dict(), all_characters


def add_new_voice(current_character, selected_characters, kind, lang, all_characters):
    """Add the current character to the selection list and reset the picker.

    If a character with the same name is already selected, its entry is
    refreshed from `current_character` (emotion and voice id included) instead
    of adding a duplicate. The size limit only applies to new entries.

    Returns:
        Values for: current character (cleared), name textbox, emotion
        dropdown, selection list, gallery characters, gallery, selection
        container visibility, character table.

    Raises:
        gr.Error: If no character is selected, or the list is already full.
    """
    if not current_character:
        raise gr.Error(_message("select_character", lang))

    existing_char = next(
        (
            char
            for char in selected_characters
            if char["名称"] == current_character["名称"]
        ),
        None,
    )
    if existing_char:
        if existing_char["情绪"] != current_character["情绪"]:
            # Copy the whole record: the voice id belongs to the emotion, so
            # updating the emotion label alone would keep the old voice.
            existing_char.update(current_character)
    else:
        if len(selected_characters) >= MAX_SELECTED_CHARACTERS:
            raise gr.Error("已达到最大选择数(5个)")
        selected_characters.insert(0, current_character)

    updated_characters = get_characters(
        kind=kind, lang=lang, all_characters=all_characters
    )
    # A brand-new gallery is required here to drop the previous selection;
    # returning a plain list of values would keep the old item selected.
    updated_gallery = _build_gallery(updated_characters)
    return (
        None,
        gr.update(value=""),
        gr.update(choices=[]),
        selected_characters,
        updated_characters,
        updated_gallery,
        gr.update(visible=True),
        all_characters,
    )


def update_selected_chars_display(selected_characters):
    """Build updates for every row of the selected-characters panel.

    Each row contributes four updates (name, emotion, delete button, row), in
    the order the components are stored in the module-level
    `selected_chars_rows`. Rows beyond the current selection are hidden.
    """
    updates = []
    for position in range(len(selected_chars_rows)):
        if position < len(selected_characters):
            char = selected_characters[position]
            updates.extend(
                [
                    gr.update(value=char["名称"], visible=True),
                    gr.update(value=char["情绪"], visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True),
                ]
            )
        else:
            updates.extend(
                [
                    gr.update(value="", visible=False),
                    gr.update(value="", visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                ]
            )
    return updates


def remove_character(index, selected_characters):
    """Remove the entry at `index` from the selection list, if it exists.

    `index` is supplied by a hidden number component and may arrive as a
    float, so it is coerced to an int before being used as a list index.
    """
    try:
        position = int(index)
    except (TypeError, ValueError):
        position = -1
    if 0 <= position < len(selected_characters):
        del selected_characters[position]
    return selected_characters, gr.update(visible=True)


def update_gallery(kind, query, all_characters, lang="zh"):
    """Refresh the gallery for the chosen category and search text.

    Args:
        kind: Selected category.
        query: Search text.
        all_characters: Character table.
        lang: Language code forwarded to `get_characters`. It is a parameter
            because the module-level name `lang` is the language radio
            component, not a language code.

    Returns:
        A tuple of (characters, gallery values, character table).
    """
    updated_characters = get_characters(
        kind=kind, query=query, lang=lang, all_characters=all_characters
    )
    return (
        updated_characters,
        [[char["头像"], char["名称"]] for char in updated_characters],
        all_characters,
    )


def on_select(evt: gr.SelectData, characters, selected_characters, all_characters):
    """Handle a click on a gallery item.

    The clicked character becomes the current character, with a neutral
    emotion preselected when one exists and the first emotion otherwise.

    Returns:
        Values for: name textbox, emotion dropdown, current character,
        selection list (unchanged).
    """
    selected = characters[evt.index]
    emotions = get_character_emotions(selected, all_characters)

    default_record = next(
        (record for record in emotions if record["情绪"] in NEUTRAL_EMOTIONS),
        emotions[0] if emotions else {},
    )
    default_emotion = default_record.get("情绪", "")
    # The placeholder record from get_character_emotions has no voice id.
    default_voice_id = default_record.get("voice_id", "")

    character_dict = selected.copy()
    character_dict["情绪"] = default_emotion
    character_dict["voice_id"] = default_voice_id
    return (
        selected["名称"],
        gr.Dropdown(
            choices=[record["情绪"] for record in emotions], value=default_emotion
        ),
        character_dict,
        selected_characters,
    )


async def update_prompt_audio(current_character):
    """Fetch the reference audio of the current character, or None if unset."""
    if not current_character:
        return None
    return await get_audio(current_character.get("voice_id"))


async def create_voice(
    avatar, name, emotion, tags, gender, audio_data, lang, since_last_update
):
    """Validate and submit a new voice.

    Args:
        avatar, name, emotion, tags, gender, audio_data: Form values.
            `audio_data` is indexed as (sample rate, samples).
        lang: Language code used for user-facing messages.
        since_last_update: Timestamp of the last successful submission, or
            None if there has been none in this session.

    Returns:
        Seven values: updates for the six form fields followed by the new
        value of `since_last_update`. The timestamp only changes after a
        successful submission, so a rejected form never starts the cooldown.
    """
    unchanged = (avatar, name, emotion, tags, gender, audio_data, since_last_update)

    # Rate limit: refuse a new submission shortly after the previous one.
    if since_last_update is not None:
        remaining = CREATE_COOLDOWN_SECONDS - (time.time() - since_last_update)
        if remaining > 0:
            gr.Warning(_message("cooldown", lang, seconds=remaining))
            return unchanged

    # Collect an update for every field that is missing.
    updates = {}
    for field, value in [
        ("avatar", avatar),
        ("name", name),
        ("emotion", emotion),
        ("tags", tags),
        ("gender", gender),
        ("audio_data", audio_data),
    ]:
        if field in ("avatar", "audio_data"):
            if value is None or (isinstance(value, np.ndarray) and value.size == 0):
                updates[field] = gr.update(value=None)
        elif value is None:
            # e.g. a dropdown with nothing selected.
            updates[field] = gr.update(value=None)
        elif isinstance(value, str) and not value.strip():
            updates[field] = gr.update(value="")

    if updates:
        gr.Warning(_message("incomplete", lang))
        return tuple(
            [
                updates.get(field, gr.update())
                for field in ["avatar", "name", "emotion", "tags", "gender", "audio_data"]
            ]
            + [since_last_update]
        )

    duration = len(audio_data[1]) / audio_data[0]
    if duration < 3.2 or duration > 8:
        gr.Warning(_message("bad_duration", lang))
        return unchanged

    await generate_voice(avatar, name, emotion, tags, gender, audio_data, lang)
    gr.Info(_message("created", lang), duration=20)
    # Clear the six form fields and start the cooldown from now.
    return tuple([gr.update(value=None) for _ in range(6)] + [time.time()])


head = """ Free Online Text to Speech (TTS) | Convert Text to Audio """

with gr.Blocks(title="Online Free TTS", theme=gr.themes.Soft(), head=head) as demo:
    gr.Markdown(
        "Online Free TTS(Text-to-Speech). Ultra-low latency, 5-second voice cloning."
    )
    lang = gr.Radio(
        choices=[("中文", "zh"), ("English", "en"), ("日本語", "ja"), ("한국인", "ko")],
        label=gettext("Language"),
        value="en",
        scale=1,
    )
    # Load the default-language table once and share it between both states.
    _initial_table, _initial_categories = load_characters_csv("en")
    all_characters_state = gr.State(_initial_table)
    category = gr.State(_initial_categories)

    with Translate(trans_file, lang, placeholder_langs=["en", "zh", "ja", "ko"]):
        gr.Markdown(value=gettext(header))
        with gr.Group():
            # Start on the same category the dropdown below preselects.
            initial_characters = get_characters(
                kind=category.value[0],
                lang="en",
                all_characters=all_characters_state.value,
            )
            characters = gr.State(initial_characters)
            selected_characters = gr.State([])
            current_character = gr.State(None)

            with gr.Tab(gettext("Synthesis Voice")):
                with gr.Blocks():
                    with gr.Row():
                        kind = gr.Dropdown(
                            choices=category.value,
                            value=category.value[0],
                            label=gettext("Select character category"),
                        )
                        query = gr.Textbox(
                            label=gettext("Search character"),
                            value="",
                            lines=1,
                            max_lines=1,
                            interactive=True,
                        )
                with gr.Blocks():
                    gallery = _build_gallery(characters.value)
                with gr.Row():
                    character_name = gr.Textbox(
                        label=gettext("Currently selected character"),
                        interactive=False,
                        max_lines=1,
                    )
                    info_type = gr.Dropdown(choices=[], label=gettext("Select emotion"))
                with gr.Row():
                    add_voice_button = gr.Button(
                        gettext("Add new voice"), variant="primary"
                    )

                selected_chars_container = gr.Column(
                    elem_id="selected_chars_container", visible=False
                )
                with selected_chars_container:
                    gr.Markdown(gettext("### Selected characters"))
                    selected_chars_rows = []
                    for i in range(MAX_SELECTED_CHARACTERS):
                        with gr.Row() as row:
                            name = gr.Textbox(
                                label=gettext("Name"), interactive=False, max_lines=1
                            )
                            emotion = gr.Textbox(
                                label=gettext("Emotion"), interactive=False, max_lines=1
                            )
                            delete_btn = gr.Button(gettext("Delete"), scale=0)
                        selected_chars_rows.append((name, emotion, delete_btn, row))

                with gr.Row():
                    with gr.Column():
                        text = gr.Textbox(
                            label=gettext("Text to synthesize"),
                            value="",
                            lines=10,
                            max_lines=10,
                        )
                        inference_button = gr.Button(
                            gettext("🎉 Synthesize Voice 🎉"), variant="primary", size="lg"
                        )
                    with gr.Column():
                        prompt_audio = gr.Audio(
                            label=gettext("Reference audio for synthesis"),
                            interactive=False,
                            type="numpy",
                        )
                        output = gr.Audio(
                            label=gettext("Output audio"), interactive=False, type="numpy"
                        )
                        cost_time = gr.Textbox(
                            label=gettext("Synthesis time"),
                            interactive=False,
                            show_label=False,
                            max_lines=1,
                        )

                # Errors raised inside `generate` happen when the event runs,
                # not while it is being registered, so no try/except is needed
                # here; a failure to register should surface at startup.
                inference_button.click(
                    fn=generate,
                    inputs=[current_character, selected_characters, text, lang],
                    outputs=[output, cost_time],
                )

            with gr.Tab(gettext("Create Voice")):
                since_last_update = gr.State(None)
                gr.Markdown(gettext("Note"))
                with gr.Row():
                    avatar = gr.Image(
                        label=gettext("Avatar"),
                        interactive=True,
                        type="pil",
                        image_mode="RGBA",
                    )
                    with gr.Column():
                        with gr.Row():
                            name = gr.Textbox(
                                label=gettext("Name"), interactive=True, max_lines=1
                            )
                            emotion = gr.Textbox(
                                label=gettext("Emotion\n(Happy, Sad, Angry)"),
                                interactive=True,
                                max_lines=1,
                            )
                        tags = gr.Textbox(
                            label=gettext("Tags\n(Genshin, Cute, Girl, Boy, etc.)"),
                            interactive=True,
                            max_lines=1,
                        )
                        gender = gr.Dropdown(
                            label=gettext("Gender"),
                            choices=[
                                (gettext("Male"), "male"),
                                (gettext("Female"), "female"),
                                (gettext("Non-Binary"), "non-binary"),
                            ],
                            interactive=True,
                        )
                audio_data = gr.Audio(
                    label=gettext("Prompt Audio(min 3.2s, max 8s)"),
                    interactive=True,
                )
                create_button = gr.Button(gettext("Create Voice"), variant="primary")

        gr.Markdown(gettext(terms))

        # -------------- Event bindings --------------
        lang.change(
            fn=update_all_characters,
            inputs=[lang, category],
            outputs=[all_characters_state, characters, gallery, category, kind],
        )
        demo.load(
            update_all_characters,
            inputs=[lang, category],
            outputs=[all_characters_state, characters, gallery, category, kind],
        )

        add_voice_button.click(
            fn=add_new_voice,
            inputs=[
                current_character,
                selected_characters,
                kind,
                lang,
                all_characters_state,
            ],
            outputs=[
                current_character,
                character_name,
                info_type,
                selected_characters,
                characters,
                gallery,
                selected_chars_container,
                all_characters_state,
            ],
        ).then(
            fn=update_selected_chars_display,
            inputs=[selected_characters],
            outputs=[item for row in selected_chars_rows for item in row],
        )

        gallery.select(
            fn=on_select,
            inputs=[characters, selected_characters, all_characters_state],
            outputs=[character_name, info_type, current_character, selected_characters],
        ).then(
            fn=update_prompt_audio, inputs=[current_character], outputs=[prompt_audio]
        )

        info_type.change(
            fn=update_character_info,
            inputs=[character_name, info_type, current_character, all_characters_state],
            outputs=[current_character, all_characters_state],
        ).then(
            fn=update_prompt_audio, inputs=[current_character], outputs=[prompt_audio]
        )

        for i, (_, _, delete_btn, _) in enumerate(selected_chars_rows):
            delete_btn.click(
                fn=remove_character,
                # The row index travels through a hidden number component.
                inputs=[gr.Number(value=i, visible=False), selected_characters],
                outputs=[selected_characters, selected_chars_container],
            ).then(
                fn=update_selected_chars_display,
                inputs=[selected_characters],
                outputs=[item for row in selected_chars_rows for item in row],
            )

        kind.change(
            fn=update_gallery,
            inputs=[kind, query, all_characters_state, lang],
            outputs=[characters, gallery, all_characters_state],
        )
        query.change(
            fn=update_gallery,
            inputs=[kind, query, all_characters_state, lang],
            outputs=[characters, gallery, all_characters_state],
        )

        create_button.click(
            fn=create_voice,
            inputs=[
                avatar,
                name,
                emotion,
                tags,
                gender,
                audio_data,
                lang,
                since_last_update,
            ],
            outputs=[
                avatar,
                name,
                emotion,
                tags,
                gender,
                audio_data,
                since_last_update,
            ],
        )

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=None).launch(show_api=False)