- app.py +33 -27
- generate_audio.py +79 -0
app.py
CHANGED
@@ -4,39 +4,45 @@ from gradio_client import Client
 PASSWORD = "071295"
 
 def get_speech(text, voice):
-    …
+    try:
+        client = Client("sysf/vixtts-demo")
+        result = client.predict(
+            text,  # Changed from undefined 'prompt' to 'text'
+            language="vi",
+            audio_file=voice,  # Changed from undefined 'audio_file_pth' to 'voice'
+            normalize_text=True,
+            api_name="/predict"
+        )
+        print(result)
+        return result
+    except Exception as e:
+        raise gr.Error(f"Error in get_speech: {str(e)}")
 
 def get_dreamtalk(image_in, speech):
-    …
+    try:
+        client = Client("https://fffiloni-dreamtalk.hf.space/")
+        result = client.predict(
+            speech,  # filepath in 'Audio input' Audio component
+            image_in,  # filepath in 'Image' Image component
+            "M030_front_neutral_level1_001.mat",  # Literal[...] in 'emotional style' Dropdown component
+            api_name="/infer"
+        )
+        print(result)
+        return result['video']
+    except Exception as e:
+        raise gr.Error(f"Error in get_dreamtalk: {str(e)}. Image may not contain any face.")
 
 def pipe(text, voice, image_in):
-    speech = get_speech(text, voice)
     try:
+        speech = get_speech(text, voice)
         video = get_dreamtalk(image_in, speech)
-    …
+        return video
+    except Exception as e:
+        raise gr.Error(f"Pipeline error: {str(e)}")
 
 def authenticate(password):
     if password == PASSWORD:
-        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+        return gr.update(visible=True), gr.update(visible=False, value=""), gr.update(visible=False)
     else:
         return gr.update(visible=False), gr.update(visible=True, value="Invalid password"), gr.update(visible=True)

@@ -65,8 +71,8 @@ with gr.Blocks() as demo:
         with gr.Column():
             image_in = gr.Image(label="Portrait IN", type="filepath", value="./maian.PNG")
         with gr.Column():
-            voice = gr.Audio(type="filepath", label="
-            text = gr.Textbox(label="
+            voice = gr.Audio(type="filepath", label="Voice")
+            text = gr.Textbox(label="Text")
             submit_btn = gr.Button('Submit')
         with gr.Column():
             video_o = gr.Video(label="Video result")

@@ -80,7 +86,7 @@ with gr.Blocks() as demo:
     submit_btn.click(
         fn=pipe,
         inputs=[text, voice, image_in],
-        outputs=
+        outputs=video_o,  # Removed list brackets as single output expected
         concurrency_limit=3
     )
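For a quick check outside the Gradio UI, the same two Space endpoints can be exercised directly with gradio_client. Below is a minimal sketch mirroring the calls in get_speech and get_dreamtalk above; the reference clip path ./ref.wav is a hypothetical placeholder, and both public Spaces must be reachable:

from gradio_client import Client

# Step 1: text -> speech on the vixtts Space (same call as get_speech).
tts = Client("sysf/vixtts-demo")
speech = tts.predict(
    "Xin chào",              # sample text; the app passes language="vi"
    language="vi",
    audio_file="./ref.wav",  # hypothetical reference voice clip
    normalize_text=True,
    api_name="/predict",
)

# Step 2: speech + portrait -> talking-head video on the DreamTalk Space.
talker = Client("https://fffiloni-dreamtalk.hf.space/")
result = talker.predict(
    speech,                               # filepath returned by step 1
    "./maian.PNG",                        # portrait bundled with the app
    "M030_front_neutral_level1_001.mat",  # emotional style preset
    api_name="/infer",
)
print(result['video'])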
generate_audio.py
ADDED
@@ -0,0 +1,79 @@
+import torchaudio
+
+from whisperspeech.pipeline import Pipeline
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Convert text to audio.")
+    parser.add_argument(
+        "--text",
+        type=str,
+        required=True,
+        help="The text to convert to audio.",
+    )
+    return parser.parse_args()
+
+def convert_text_to_audio(pipe: Pipeline, text: str):
+    """Convert text to audio.
+    Args:
+        pipe (Pipeline): The pipeline to use for text-to-speech.
+        text (str): The text to convert to audio.
+    Returns:
+        torch.Tensor: The generated audio.
+    """
+    return pipe.generate(text)
+
+
+def convert_text_to_audio_file(pipe: Pipeline, text: str, output_path: str):
+    """Convert text to audio and save it to a file.
+    Args:
+        pipe (Pipeline): The pipeline to use for text-to-speech.
+        text (str): The text to convert to audio.
+        output_path (str): The path to save the audio file.
+    """
+    pipe.generate_to_file(output_path, text)
+
+
+class TTSProcessor:
+    def __init__(self, device: str):
+        """Initialize the TTS Processor with a specified device."""
+        self.pipe = Pipeline(
+            s2a_ref="collabora/whisperspeech:s2a-q4-tiny-en+pl.model", device=device
+        )
+
+    def get_reference_voice_embedding(self, path: str):
+        """Get the reference voice embedding from the given audio file.
+        Args:
+            path (str): The path to the audio file.
+        Returns:
+            torch.Tensor: The reference voice embedding."""
+        return self.pipe.extract_spk_emb(path).cpu()
+
+    def convert_text_to_audio(self, text: str, speaker=None):
+        """Convert text to audio.
+        Args:
+            text (str): The text to convert to audio.
+        Returns:
+            torch.Tensor: The generated audio.
+        """
+        return self.pipe.generate(text, speaker=speaker)
+
+    def convert_text_to_audio_file(self, text: str, output_path: str, speaker=None):
+        """Convert text to audio and save it to a file.
+        Args:
+            text (str): The text to convert to audio.
+            output_path (str): The path to save the audio file.
+        """
+        self.pipe.generate_to_file(output_path, text, speaker=speaker)
+
+if __name__ == "__main__":
+    args = parse_args()
+    processor = TTSProcessor("cuda")
+    text = args.text
+    text = text.lower()
+    text_split = "_".join(text.lower().split(" "))
+    # remove the last character if it is a period
+    if text_split[-1] == ".":
+        text_split = text_split[:-1]
+    print(text_split)
+    path = f"./examples/{text_split}.wav"
+    processor.convert_text_to_audio_file(text, path)
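Invoked as python generate_audio.py --text "Hello world.", the script lower-cases the text, derives the output file name from it, and writes ./examples/hello_world.wav. TTSProcessor can also be used programmatically; a minimal sketch using only the methods defined above, where ./ref.wav is a hypothetical reference clip for voice cloning:

from generate_audio import TTSProcessor

# The __main__ block above likewise assumes a CUDA device is available.
processor = TTSProcessor("cuda")

# Optional voice cloning: extract a speaker embedding from a reference clip.
spk = processor.get_reference_voice_embedding("./ref.wav")

# Synthesize with the cloned voice and write the result to disk.
processor.convert_text_to_audio_file("Hello world.", "./examples/hello_world.wav", speaker=spk)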