Hololive-Style-Bert-VITS2

Running

App Files Files Community

Kit-Lemonfoot committed on Jan 15, 2024

Commit

b5bac24

verified ·

1 Parent(s): 3550cdf

Upload app.py

Browse files

Files changed (1) hide show

app.py +69 -128

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ from infer import get_net_g, infer
 from tools.log import logger
 is_hf_spaces = os.getenv("SYSTEM") == "spaces"
-limit = 100
 class Model:
@@ -186,9 +186,11 @@ class ModelHolder:
             device=self.device,
         )
         styles = list(self.current_model.style2id.keys())
         return (
             gr.Dropdown(choices=styles, value=styles[0]),
-            gr.update(interactive=True, value="音声合成"),
         )
     def update_model_files_dropdown(self, model_name):
@@ -207,6 +209,8 @@ class ModelHolder:
 def tts_fn(
     text,
     language,
     reference_audio_path,
@@ -221,25 +225,32 @@ def tts_fn(
     use_style_text,
     emotion,
     emotion_weight,
 ):
-    logger.info(f"Start TTS with {language}:\n{text}")
-    logger.info(f"Model: {model_holder.current_model.model_path}")
-    logger.info(f"SDP: {sdp_ratio}, Noise: {noise_scale}, Noise_W: {noise_scale_w}, Length: {length_scale}")
-    logger.info(f"Style text enabled: {use_style_text}, Style text: {style_text}, Style weight: {style_weight}")
-    logger.info(f"Style: {emotion}, Style weight: {emotion_weight}")
     if is_hf_spaces and len(text) > limit:
-        logger.error(f"文字数が{limit}文字を超えています")
-        # raise Exception(f"文字数が{limit}文字を超えています")
-        return f"文字数が{limit}文字を超えています", (44100, "")
     assert model_holder.current_model is not None
     start_time = datetime.datetime.now()
     sr, audio = model_holder.current_model.infer(
         text=text,
         language=language,
         reference_audio_path=reference_audio_path,
         sdp_ratio=sdp_ratio,
         noise=noise_scale,
@@ -256,112 +267,41 @@ def tts_fn(
     end_time = datetime.datetime.now()
     duration = (end_time - start_time).total_seconds()
-    logger.info(f"End TTS, duration: {duration} seconds")
     return f"Success, time: {duration} seconds.", (sr, audio)
-initial_text = "こんにちは、初めまして。あなたの名前はなんていうの？"
-example_local = [
-    [initial_text, "JP"],
-    [
-        """あなたがそんなこと言うなんて、私はとっても嬉しい。
-あなたがそんなこと言うなんて、私はとっても怒ってる。
-あなたがそんなこと言うなんて、私はとっても驚いてる。
-あなたがそんなこと言うなんて、私はとっても辛い。""",
-        "JP",
-    ],
-    [  # ChatGPTに考えてもらった告白セリフ
-        """私、ずっと前からあなたのことを見てきました。あなたの笑顔、優しさ、強さに、心惹かれていたんです。
-友達として過ごす中で、あなたのことがだんだんと特別な存在になっていくのがわかりました。
-えっと、私、あなたのことが好きです！もしよければ、私と付き合ってくれませんか？""",
-        "JP",
-    ],
-    [  # 夏目漱石『吾輩は猫である』
-        """吾輩は猫である。名前はまだ無い。
-どこで生れたかとんと見当がつかぬ。なんでも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。
-吾輩はここで始めて人間というものを見た。しかもあとで聞くと、それは書生という、人間中で一番獰悪な種族であったそうだ。
-この書生というのは時々我々を捕まえて煮て食うという話である。""",
-        "JP",
-    ],
-    [  # 梶井基次郎『桜の樹の下には』
-        """桜の樹の下には屍体が埋まっている！これは信じていいことなんだよ。
-何故って、桜の花があんなにも見事に咲くなんて信じられないことじゃないか。俺はあの美しさが信じられないので、このにさんにち不安だった。
-しかしいま、やっとわかるときが来た。桜の樹の下には屍体が埋まっている。これは信じていいことだ。""",
-        "JP",
-    ],
-    [  # ChatGPTと考えた、感情を表すセリフ
-        """やったー！テストで満点取れた！私とっても嬉しいな！
-どうして私の意見を無視するの？許せない！ムカつく！あんたなんか死ねばいいのに。
-あはははっ！この漫画めっちゃ笑える、見てよこれ、ふふふ、あはは。
-あなたがいなくなって、私は一人になっちゃって、泣いちゃいそうなほど悲しい。""",
-        "JP",
-    ],
-    [  # 上の丁寧語バージョン
-        """やりました！テストで満点取れましたよ！私とっても嬉しいです！
-どうして私の意見を無視するんですか？許せません！ムカつきます！あんたなんか死んでください。
-あはははっ！この漫画めっちゃ笑えます、見てくださいこれ、ふふふ、あはは。
-あなたがいなくなって、私は一人になっちゃって、泣いちゃいそうなほど悲しいです。""",
-        "JP",
-    ],
-    [  # ChatGPTに考えてもらった音声合成の説明文章
-        """音声合成は、機械学習を活用して、テキストから人の声を再現する技術です。この技術は、言語の構造を解析し、それに基づいて音声を生成します。
-この分野の最新の研究成果を使うと、より自然で表現豊かな音声の生成が可能である。深層学習の応用により、感情やアクセントを含む声質の微妙な変化も再現することが出来る。""",
-        "JP",
-    ],
-    [
-        "Speech synthesis is the artificial production of human speech. A computer system used for this purpose is called a speech synthesizer, and can be implemented in software or hardware products.",
-        "EN",
-    ],
-    ["语音合成是人工制造人类语音。用于此目的的计算机系统称为语音合成器，可以通过软件或硬件产品实现。", "ZH"],
-]
-example_hf_spaces = [
-    [initial_text, "JP"],
-    ["えっと、私、あなたのことが好きです！もしよければ付き合ってくれませんか？", "JP"],
-    ["吾輩は猫である。名前はまだ無い。", "JP"],
-    ["桜の樹の下には屍体が埋まっている！これは信じていいことなんだよ。", "JP"],
-    ["やったー！テストで満点取れたよ！私とっても嬉しいな！", "JP"],
-    ["どうして私の意見を無視するの？許せない！ムカつく！あんたなんか死ねばいいのに。", "JP"],
-    ["あはははっ！この漫画めっちゃ笑える、見てよこれ、ふふふ、あはは。", "JP"],
-    ["あなたがいなくなって、私は一人になっちゃって、泣いちゃいそうなほど悲しい。", "JP"],
-    ["深層学習の応用により、感情やアクセントを含む声質の微妙な変化も再現されている。", "JP"],
-    [
-        "Speech synthesis is the artificial production of human speech.",
-        "EN",
-    ],
-    ["语音合成是人工制造人类语音。用于此目的的计算机系统称为语音合成器，可以通过软件或硬件产品实现。", "ZH"],
-]
 initial_md = """
-# Style-Bert-VITS2 JVNVコーパスデモ
-怒り・悲しみ・喜び等の感情スタイルを強弱付きで制御できる、[Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2)のデモです。
-入力上限文字数は100文字までにしています。
-このデモでは[jvnvのモデル](https://huggingface.co/litagin/style_bert_vits2_jvnv)を使っており、[JVNVコーパス（言語音声と非言語音声を持つ日本語感情音声コーパス）](https://sites.google.com/site/shinnosuketakamichi/research-topics/jvnv_corpus)で学習されたモデルです。
 """
 style_md = """
-- プリセットまたは音声ファイルから読み上げの声音・感情・スタイルのようなものを制御できます。
-- デフォルトのNeutralでも、十分に読み上げる文に応じた感情で感情豊かに読み上げられます。このスタイル制御は、それを重み付きで上書きするような感じです。
-- 強さを大きくしすぎると発音が変になったり声にならなかったりと崩壊することがあります。
-- どのくらいに強さがいいかはモデルやスタイルによって異なるようです。
-- 音声ファイルを入力する場合は、学習データと似た声音の話者（特に同じ性別）でないとよい効果が出ないかもしれません。
 """
 def make_interactive():
-    return gr.update(interactive=True, value="音声合成")
 def make_non_interactive():
-    return gr.update(interactive=False, value="音声合成（モデルをロードしてください）")
 def gr_util(item):
-    if item == "プリセットから選ぶ":
         return (gr.update(visible=True), gr.Audio(visible=False, value=None))
     else:
         return (gr.update(visible=False), gr.update(visible=True))
@@ -383,46 +323,46 @@ if __name__ == "__main__":
     model_holder = ModelHolder(model_dir, device)
-    languages = ["JP", "EN", "ZH"]
-    examples = example_hf_spaces if is_hf_spaces else example_local
     model_names = model_holder.model_names
     if len(model_names) == 0:
-        logger.error(f"モデルが見つかりませんでした。{model_dir}にモデルを置いてください。")
         sys.exit(1)
     initial_id = 0
     initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
-    with gr.Blocks(theme="NoCrypt/miku") as app:
         gr.Markdown(initial_md)
         with gr.Row():
             with gr.Column():
                 with gr.Row():
                     with gr.Column(scale=3):
                         model_name = gr.Dropdown(
-                            label="モデル一覧",
                             choices=model_names,
                             value=model_names[initial_id],
                         )
                         model_path = gr.Dropdown(
-                            label="モデルファイル",
                             choices=initial_pth_files,
                             value=initial_pth_files[0],
                         )
-                    refresh_button = gr.Button("更新", scale=1, visible=not is_hf_spaces)
-                    load_button = gr.Button("ロード", scale=1, variant="primary")
-                text_input = gr.TextArea(label="テキスト", value=initial_text)
-                line_split = gr.Checkbox(label="改行で分けて生成", value=True)
                 split_interval = gr.Slider(
                     minimum=0.0,
                     maximum=2,
                     value=0.5,
                     step=0.1,
-                    label="分けた場合に挟む無音の長さ（秒）",
                 )
-                language = gr.Dropdown(choices=languages, value="JP", label="Language")
-                with gr.Accordion(label="詳細設定", open=False):
                     sdp_ratio = gr.Slider(
                         minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
                     )
@@ -435,11 +375,11 @@ if __name__ == "__main__":
                     length_scale = gr.Slider(
                         minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
                     )
-                    use_style_text = gr.Checkbox(label="Style textを使う", value=False)
                     style_text = gr.Textbox(
                         label="Style text",
-                        placeholder="どうして私の意見を無視するの？許せない、ムカつく！死ねばいいのに。",
-                        info="このテキストの読み上げと似た声音・感情になりやすくなります。ただ抑揚やテンポ等が犠牲になる傾向があります。",
                         visible=False,
                     )
                     style_text_weight = gr.Slider(
@@ -447,7 +387,7 @@ if __name__ == "__main__":
                         maximum=1,
                         value=0.7,
                         step=0.1,
-                        label="Style textの強さ",
                         visible=False,
                     )
                     use_style_text.change(
@@ -456,37 +396,37 @@ if __name__ == "__main__":
                         outputs=[style_text, style_text_weight],
                     )
             with gr.Column():
-                with gr.Accordion("スタイルについて詳細", open=False):
                     gr.Markdown(style_md)
                 style_mode = gr.Radio(
-                    ["プリセットから選ぶ", "音声ファイルを入力"],
-                    label="スタイルの指定方法",
-                    value="プリセットから選ぶ",
                 )
                 style = gr.Dropdown(
-                    label="スタイル（Neutralが平均スタイル）",
-                    choices=["モデルをロードしてください"],
-                    value="モデルをロードしてください",
                 )
                 style_weight = gr.Slider(
                     minimum=0,
                     maximum=50,
                     value=5,
                     step=0.1,
-                    label="スタイルの強さ",
                 )
-                ref_audio_path = gr.Audio(label="参照音声", type="filepath", visible=False)
                 tts_button = gr.Button(
-                    "音声合成（モデルをロードしてください）", variant="primary", interactive=False
                 )
-                text_output = gr.Textbox(label="情報")
-                audio_output = gr.Audio(label="結果")
-                with gr.Accordion("テキスト例", open=False):
-                    gr.Examples(examples, inputs=[text_input, language])
         tts_button.click(
             tts_fn,
             inputs=[
                 text_input,
                 language,
                 ref_audio_path,
@@ -501,6 +441,7 @@ if __name__ == "__main__":
                 use_style_text,
                 style,
                 style_weight,
             ],
             outputs=[text_output, audio_output],
         )
@@ -521,7 +462,7 @@ if __name__ == "__main__":
         load_button.click(
             model_holder.load_model,
             inputs=[model_name, model_path],
-            outputs=[style, tts_button],
         )
         style_mode.change(

 from tools.log import logger
 is_hf_spaces = os.getenv("SYSTEM") == "spaces"
+limit = 150
 class Model:
             device=self.device,
         )
         styles = list(self.current_model.style2id.keys())
+        speakers = list(self.current_model.spk2id.keys())
         return (
             gr.Dropdown(choices=styles, value=styles[0]),
+            gr.update(interactive=True, value="Synthesize"),
+            gr.Dropdown(choices=speakers, value=speakers[0]),
         )
     def update_model_files_dropdown(self, model_name):
 def tts_fn(
+    model_name,
+    model_path,
     text,
     language,
     reference_audio_path,
     use_style_text,
     emotion,
     emotion_weight,
+    speaker,
 ):
+    if not text:
+        return "Please enter some text.", (44100, None)
+    #logger.info(f"Start TTS with {language}:\n{text}")
+    #logger.info(f"Model: {model_holder.current_model.model_path}")
+    #logger.info(f"SDP: {sdp_ratio}, Noise: {noise_scale}, Noise_W: {noise_scale_w}, Length: {length_scale}")
+    #logger.info(f"Style text enabled: {use_style_text}, Style text: {style_text}, Style weight: {style_weight}")
+    #logger.info(f"Style: {emotion}, Style weight: {emotion_weight}")
     if is_hf_spaces and len(text) > limit:
+        return f"Too long! There is a character limit of {limit} characters.", (44100, None)
     assert model_holder.current_model is not None
+    if(model_holder.current_model.model_path != model_path):
+        model_holder.load_model(model_name, model_path)
+    speaker_id = model_holder.current_model.spk2id[speaker]
     start_time = datetime.datetime.now()
     sr, audio = model_holder.current_model.infer(
         text=text,
         language=language,
+        sid=speaker_id,
         reference_audio_path=reference_audio_path,
         sdp_ratio=sdp_ratio,
         noise=noise_scale,
     end_time = datetime.datetime.now()
     duration = (end_time - start_time).total_seconds()
+    logger.info(f"Successful inference, took {duration}s | {speaker} | {sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale} | {text}")
     return f"Success, time: {duration} seconds.", (sr, audio)
+initial_text = "Hi there! How are you doing?"
 initial_md = """
+# LemonfootSBV2 😊🍋
+### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)  /  [Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
+### Based on code originally by  [fishaudio](https://github.com/fishaudio)  and  [litagin02](https://github.com/litagin02)
+This HuggingFace space is designed to demonstrate multiple experimental [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2) models made by Kit Lemonfoot.
+Do no evil.
 """
 style_md = """
+- You can control things like voice tone, emotion, and reading style through presets or through voice files.
+- Neutral acts as an average across all speakers. Styling options act as an override to Neutral.
+- Setting the intensity too high will likely break the output.
+- The required intensity will depend based on the speaker and the desired style.
+- If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker.
 """
 def make_interactive():
+    return gr.update(interactive=True, value="Synthesize")
 def make_non_interactive():
+    return gr.update(interactive=False, value="Synthesize (Please load a model!)")
 def gr_util(item):
+    if item == "Select from presets":
         return (gr.update(visible=True), gr.Audio(visible=False, value=None))
     else:
         return (gr.update(visible=False), gr.update(visible=True))
     model_holder = ModelHolder(model_dir, device)
+    languages = ["EN", "JP", "ZH"]
     model_names = model_holder.model_names
     if len(model_names) == 0:
+        logger.error(f"No models found. Please place the model in {model_dir}.")
         sys.exit(1)
     initial_id = 0
     initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
+    with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="LemonfootSBV2") as app:
         gr.Markdown(initial_md)
         with gr.Row():
             with gr.Column():
                 with gr.Row():
                     with gr.Column(scale=3):
                         model_name = gr.Dropdown(
+                            label="Available Models",
                             choices=model_names,
                             value=model_names[initial_id],
                         )
                         model_path = gr.Dropdown(
+                            label="Model File",
                             choices=initial_pth_files,
                             value=initial_pth_files[0],
                         )
+                    refresh_button = gr.Button("Refresh", scale=1, visible=not is_hf_spaces)
+                    load_button = gr.Button("Load", scale=1, variant="primary")
+                text_input = gr.TextArea(label="Text", value=initial_text)
+                line_split = gr.Checkbox(label="Divide text seperately by line breaks", value=True)
                 split_interval = gr.Slider(
                     minimum=0.0,
                     maximum=2,
                     value=0.5,
                     step=0.1,
+                    label="Length of division seperation time (in seconds)",
                 )
+                language = gr.Dropdown(choices=languages, value="EN", label="Language")
+                speaker = gr.Dropdown(label="Speaker")
+                with gr.Accordion(label="Advanced Settings", open=False):
                     sdp_ratio = gr.Slider(
                         minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
                     )
                     length_scale = gr.Slider(
                         minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
                     )
+                    use_style_text = gr.Checkbox(label="Use stylization text", value=False)
                     style_text = gr.Textbox(
                         label="Style text",
+                        placeholder="Why are you ignoring me? You're unforgivable and disgusting! I hope you die.",
+                        info="The voice will be similar in tone and emotion to the text, however inflection and tempo may be worse as a result.",
                         visible=False,
                     )
                     style_text_weight = gr.Slider(
                         maximum=1,
                         value=0.7,
                         step=0.1,
+                        label="Text stylization strength",
                         visible=False,
                     )
                     use_style_text.change(
                         outputs=[style_text, style_text_weight],
                     )
             with gr.Column():
+                with gr.Accordion("Styling Guide", open=False):
                     gr.Markdown(style_md)
                 style_mode = gr.Radio(
+                    ["Select from presets", "Use an audio file"],
+                    label="Style Specification",
+                    value="Select from presets",
                 )
                 style = gr.Dropdown(
+                    label="Current style (Neutral is an average style)",
+                    choices=["Please load a model first!"],
+                    value="Please load a model first!",
                 )
                 style_weight = gr.Slider(
                     minimum=0,
                     maximum=50,
                     value=5,
                     step=0.1,
+                    label="Style strength",
                 )
+                ref_audio_path = gr.Audio(label="Reference Audio", type="filepath", visible=False)
                 tts_button = gr.Button(
+                    "Synthesize (Please load a model!)", variant="primary", interactive=False
                 )
+                text_output = gr.Textbox(label="Info")
+                audio_output = gr.Audio(label="Result")
         tts_button.click(
             tts_fn,
             inputs=[
+                model_name,
+                model_path,
                 text_input,
                 language,
                 ref_audio_path,
                 use_style_text,
                 style,
                 style_weight,
+                speaker,
             ],
             outputs=[text_output, audio_output],
         )
         load_button.click(
             model_holder.load_model,
             inputs=[model_name, model_path],
+            outputs=[style, tts_button, speaker],
         )
         style_mode.change(