- app.py +33 -27
- generate_audio.py +79 -0
app.py
CHANGED
@@ -4,39 +4,45 @@ from gradio_client import Client
 PASSWORD = "071295"
 
 def get_speech(text, voice):
-    …
+    try:
+        client = Client("sysf/vixtts-demo")
+        result = client.predict(
+            text,  # Changed from undefined 'prompt' to 'text'
+            language="vi",
+            audio_file=voice,  # Changed from undefined 'audio_file_pth' to 'voice'
+            normalize_text=True,
+            api_name="/predict"
+        )
+        print(result)
+        return result
+    except Exception as e:
+        raise gr.Error(f"Error in get_speech: {str(e)}")
 
 def get_dreamtalk(image_in, speech):
-    …
+    try:
+        client = Client("https://fffiloni-dreamtalk.hf.space/")
+        result = client.predict(
+            speech,  # filepath in 'Audio input' Audio component
+            image_in,  # filepath in 'Image' Image component
+            "M030_front_neutral_level1_001.mat",  # Literal[...] in 'emotional style' Dropdown component
+            api_name="/infer"
+        )
+        print(result)
+        return result['video']
+    except Exception as e:
+        raise gr.Error(f"Error in get_dreamtalk: {str(e)}. Image may not contain any face.")
 
 def pipe(text, voice, image_in):
-    speech = get_speech(text, voice)
     try:
+        speech = get_speech(text, voice)
         video = get_dreamtalk(image_in, speech)
-    …
+        return video
+    except Exception as e:
+        raise gr.Error(f"Pipeline error: {str(e)}")
 
 def authenticate(password):
     if password == PASSWORD:
-        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+        return gr.update(visible=True), gr.update(visible=False, value=""), gr.update(visible=False)
     else:
         return gr.update(visible=False), gr.update(visible=True, value="Invalid password"), gr.update(visible=True)

@@ -65,8 +71,8 @@ with gr.Blocks() as demo:
         with gr.Column():
             image_in = gr.Image(label="Portrait IN", type="filepath", value="./maian.PNG")
         with gr.Column():
-            voice = gr.Audio(type="filepath", label="
-            text = gr.Textbox(label="
+            voice = gr.Audio(type="filepath", label="Voice")
+            text = gr.Textbox(label="Text")
             submit_btn = gr.Button('Submit')
         with gr.Column():
             video_o = gr.Video(label="Video result")

@@ -80,7 +86,7 @@ with gr.Blocks() as demo:
     submit_btn.click(
         fn=pipe,
         inputs=[text, voice, image_in],
-        outputs=
+        outputs=video_o,  # Removed list brackets as single output expected
         concurrency_limit=3
     )
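For a quick check outside the Gradio UI, the same two Space endpoints can be exercised directly with gradio_client. Below is a minimal sketch mirroring the calls in get_speech and get_dreamtalk above; the reference clip path ./ref.wav is a hypothetical placeholder, and both public Spaces must be reachable:

from gradio_client import Client

# Step 1: text -> speech on the vixtts Space (same call as get_speech).
tts = Client("sysf/vixtts-demo")
speech = tts.predict(
    "Xin chào",              # sample text; the app passes language="vi"
    language="vi",
    audio_file="./ref.wav",  # hypothetical reference voice clip
    normalize_text=True,
    api_name="/predict",
)

# Step 2: speech + portrait -> talking-head video on the DreamTalk Space.
talker = Client("https://fffiloni-dreamtalk.hf.space/")
result = talker.predict(
    speech,                               # filepath returned by step 1
    "./maian.PNG",                        # portrait bundled with the app
    "M030_front_neutral_level1_001.mat",  # emotional style preset
    api_name="/infer",
)
print(result['video'])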
generate_audio.py
ADDED
@@ -0,0 +1,79 @@
+import torchaudio
+
+from whisperspeech.pipeline import Pipeline
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Convert text to audio.")
+    parser.add_argument(
+        "--text",
+        type=str,
+        required=True,
+        help="The text to convert to audio.",
+    )
+    return parser.parse_args()
+
+def convert_text_to_audio(pipe: Pipeline, text: str):
+    """Convert text to audio.
+    Args:
+        pipe (Pipeline): The pipeline to use for text-to-speech.
+        text (str): The text to convert to audio.
+    Returns:
+        torch.Tensor: The generated audio.
+    """
+    return pipe.generate(text)
+
+
+def convert_text_to_audio_file(pipe: Pipeline, text: str, output_path: str):
+    """Convert text to audio and save it to a file.
+    Args:
+        pipe (Pipeline): The pipeline to use for text-to-speech.
+        text (str): The text to convert to audio.
+        output_path (str): The path to save the audio file.
+    """
+    pipe.generate_to_file(output_path, text)
+
+
+class TTSProcessor:
+    def __init__(self, device: str):
+        """Initialize the TTS Processor with a specified device."""
+        self.pipe = Pipeline(
+            s2a_ref="collabora/whisperspeech:s2a-q4-tiny-en+pl.model", device=device
+        )
+
+    def get_reference_voice_embedding(self, path: str):
+        """Get the reference voice embedding from the given audio file.
+        Args:
+            path (str): The path to the audio file.
+        Returns:
+            torch.Tensor: The reference voice embedding."""
+        return self.pipe.extract_spk_emb(path).cpu()
+
+    def convert_text_to_audio(self, text: str, speaker=None):
+        """Convert text to audio.
+        Args:
+            text (str): The text to convert to audio.
+        Returns:
+            torch.Tensor: The generated audio.
+        """
+        return self.pipe.generate(text, speaker=speaker)
+
+    def convert_text_to_audio_file(self, text: str, output_path: str, speaker=None):
+        """Convert text to audio and save it to a file.
+        Args:
+            text (str): The text to convert to audio.
+            output_path (str): The path to save the audio file.
+        """
+        self.pipe.generate_to_file(output_path, text, speaker=speaker)
+
+if __name__ == "__main__":
+    args = parse_args()
+    processor = TTSProcessor("cuda")
+    text = args.text
+    text = text.lower()
+    text_split = "_".join(text.lower().split(" "))
+    # remove the last character if it is a period
+    if text_split[-1] == ".":
+        text_split = text_split[:-1]
+    print(text_split)
+    path = f"./examples/{text_split}.wav"
+    processor.convert_text_to_audio_file(text, path)
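Invoked as python generate_audio.py --text "Hello world.", the script lower-cases the text, derives the output file name from it, and writes ./examples/hello_world.wav. TTSProcessor can also be used programmatically; a minimal sketch using only the methods defined above, where ./ref.wav is a hypothetical reference clip for voice cloning:

from generate_audio import TTSProcessor

# The __main__ block above likewise assumes a CUDA device is available.
processor = TTSProcessor("cuda")

# Optional voice cloning: extract a speaker embedding from a reference clip.
spk = processor.get_reference_voice_embedding("./ref.wav")

# Synthesize with the cloned voice and write the result to disk.
processor.convert_text_to_audio_file("Hello world.", "./examples/hello_world.wav", speaker=spk)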