Commit 9ef73e2 (parent: 4263bcd) committed by clementruhm

reload model on synthesis if needed

Files changed (1):
  app.py  +29 -14
app.py CHANGED

@@ -15,6 +15,10 @@ from huggingface_hub import hf_hub_download, list_repo_files
 
 # global tts module, initialized from a model selected
 tts = None
+# path to the model that is currently used in tts
+cur_model_path = None
+# cache of speakers, maps model name to speaker list
+model_to_speakers = dict()
 model_repo_dir = "data"
 for name in list_repo_files(repo_id="balacoon/tts"):
     hf_hub_download(

@@ -58,14 +62,20 @@ def main():
 
         def set_model(model_name_str: str):
             """
-            gets value from `model_name`, loads model,
-            re-initializes tts object, gets list of
-            speakers that model supports and set them to `speaker`
+            gets value from `model_name`. either
+            uses cached list of speakers for the given model name
+            or loads the addon and checks what are the speakers.
             """
-            model_path = os.path.join(model_repo_dir, model_name_str)
-            global tts
-            tts = TTS(model_path)
-            speakers = tts.get_speakers()
+            if model_name_str in model_to_speakers:
+                speakers = model_to_speakers[model_name_str]
+            else:
+                # need to load this model to learn the list of speakers
+                model_path = os.path.join(model_repo_dir, model_name_str)
+                tts = TTS(model_path)
+                cur_model_path = model_path
+                speakers = tts.get_speakers()
+                model_to_speakers[model_name_str] = speakers
+
             value = speakers[-1]
             return gr.Dropdown.update(
                 choices=speakers, value=value, visible=True

@@ -78,23 +88,28 @@ def main():
         with gr.Row(variant="panel"):
             audio = gr.Audio()
 
-        def synthesize_audio(text_str: str, speaker_str: str = ""):
+        def synthesize_audio(text_str: str, model_name_str: str, speaker_str: str):
             """
             gets utterance to synthesize from `text` Textbox
             and speaker name from `speaker` dropdown list.
             speaker name might be empty for single-speaker models.
             Synthesizes the waveform and updates `audio` with it.
             """
-            if not text_str:
-                logging.info("text or speaker are not provided")
+            if not text_str or not model_name_str or not speaker_str:
+                logging.info("text, model name or speaker are not provided")
                 return None
-            global tts
+            expected_model_path = os.path.join(model_repo_dir, model_name_str)
+            if expected_model_path != cur_model_path:
+                # reload model
+                tts = TTS(expected_model_path)
+                cur_model_path = expected_model_path
             if len(text_str) > 1024:
+                # truncate the text
                 text_str = text_str[:1024]
-            samples = cast(TTS, tts).synthesize(text_str, speaker_str)
-            return gr.Audio.update(value=(cast(TTS, tts).get_sampling_rate(), samples))
+            samples = tts.synthesize(text_str, speaker_str)
+            return gr.Audio.update(value=(tts.get_sampling_rate(), samples))
 
-        generate.click(synthesize_audio, inputs=[text, speaker], outputs=audio)
+        generate.click(synthesize_audio, inputs=[text, model_name, speaker], outputs=audio)
 
     demo.queue(concurrency_count=1).launch()
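
For context, below is a minimal standalone sketch of the pattern this commit introduces: cache the speaker list per model and only re-instantiate the TTS engine when the requested model differs from the one currently loaded. It is an illustration, not the Space's code; the `TTSManager` class and its method names are made up here, while `TTS`, `get_speakers`, `synthesize` and `get_sampling_rate` come from the `balacoon_tts` package the app imports. Note that in the Gradio callbacks above, the module-level `tts` and `cur_model_path` would also need `global` declarations to persist between calls; the sketch sidesteps that by keeping the state in a small class.

    import os

    from balacoon_tts import TTS  # same package the app uses


    class TTSManager:
        """Keeps one loaded TTS model and reloads it only when the selection changes."""

        def __init__(self, model_repo_dir: str):
            self.model_repo_dir = model_repo_dir
            self.tts = None              # currently loaded engine
            self.cur_model_path = None   # path of the model behind self.tts
            self.model_to_speakers = {}  # model name -> cached speaker list

        def _ensure_loaded(self, model_name: str) -> TTS:
            expected_model_path = os.path.join(self.model_repo_dir, model_name)
            if expected_model_path != self.cur_model_path:
                # selection changed (or nothing loaded yet): reload the addon
                self.tts = TTS(expected_model_path)
                self.cur_model_path = expected_model_path
            return self.tts

        def get_speakers(self, model_name: str):
            # serve from cache when possible, otherwise load the model once and ask it
            if model_name not in self.model_to_speakers:
                self.model_to_speakers[model_name] = self._ensure_loaded(model_name).get_speakers()
            return self.model_to_speakers[model_name]

        def synthesize(self, model_name: str, text: str, speaker: str):
            tts = self._ensure_loaded(model_name)
            samples = tts.synthesize(text[:1024], speaker)  # same 1024-char truncation as the app
            return tts.get_sampling_rate(), samples

With such a wrapper, both the dropdown handler and the synthesis handler would call the same object, so at most one model stays resident and switching models in the UI triggers exactly one reload.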