TDN-M committed
Commit 579020d · 1 Parent(s): 51848db
Files changed (2):
  1. app.py +33 -27
  2. generate_audio.py +79 -0
app.py CHANGED
@@ -4,39 +4,45 @@ from gradio_client import Client
 PASSWORD = "071295"
 
 def get_speech(text, voice):
-    client = Client("https://collabora-whisperspeech.hf.space/")
-    result = client.predict(
-        text,  # str in 'Enter multilingual text💬📝' Textbox component
-        voice,  # filepath in 'Upload or Record Speaker Audio (optional)🌬️💬' Audio component
-        "",  # str in 'alternatively, you can paste in an audio file URL:' Textbox component
-        14,  # float (numeric value between 10 and 15) in 'Tempo (in characters per second)' Slider component
-        api_name="/whisper_speech_demo"
-    )
-    print(result)
-    return result
+    try:
+        client = Client("sysf/vixtts-demo")
+        result = client.predict(
+            text,  # Changed from undefined 'prompt' to 'text'
+            language="vi",
+            audio_file=voice,  # Changed from undefined 'audio_file_pth' to 'voice'
+            normalize_text=True,
+            api_name="/predict"
+        )
+        print(result)
+        return result
+    except Exception as e:
+        raise gr.Error(f"Error in get_speech: {str(e)}")
 
 def get_dreamtalk(image_in, speech):
-    client = Client("https://fffiloni-dreamtalk.hf.space/")
-    result = client.predict(
-        speech,  # filepath in 'Audio input' Audio component
-        image_in,  # filepath in 'Image' Image component
-        "M030_front_neutral_level1_001.mat",  # Literal[...] in 'emotional style' Dropdown component
-        api_name="/infer"
-    )
-    print(result)
-    return result['video']
+    try:
+        client = Client("https://fffiloni-dreamtalk.hf.space/")
+        result = client.predict(
+            speech,  # filepath in 'Audio input' Audio component
+            image_in,  # filepath in 'Image' Image component
+            "M030_front_neutral_level1_001.mat",  # Literal[...] in 'emotional style' Dropdown component
+            api_name="/infer"
+        )
+        print(result)
+        return result['video']
+    except Exception as e:
+        raise gr.Error(f"Error in get_dreamtalk: {str(e)}. Image may not contain any face.")
 
 def pipe(text, voice, image_in):
-    speech = get_speech(text, voice)
     try:
+        speech = get_speech(text, voice)
         video = get_dreamtalk(image_in, speech)
-    except:
-        raise gr.Error('An error occurred while loading DreamTalk: Image may not contain any face')
-    return video
+        return video
+    except Exception as e:
+        raise gr.Error(f"Pipeline error: {str(e)}")
 
 def authenticate(password):
     if password == PASSWORD:
-        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
+        return gr.update(visible=True), gr.update(visible=False, value=""), gr.update(visible=False)
     else:
         return gr.update(visible=False), gr.update(visible=True, value="Invalid password"), gr.update(visible=True)
 
@@ -65,8 +71,8 @@ with gr.Blocks() as demo:
     with gr.Column():
        image_in = gr.Image(label="Portrait IN", type="filepath", value="./maian.PNG")
     with gr.Column():
-        voice = gr.Audio(type="filepath", label="Tải lên hoặc ghi âm trực tiếp (nên là voice cloning)")
-        text = gr.Textbox(label="text")
+        voice = gr.Audio(type="filepath", label="Voice")
+        text = gr.Textbox(label="Text")
        submit_btn = gr.Button('Submit')
     with gr.Column():
        video_o = gr.Video(label="Video result")
@@ -80,7 +86,7 @@ with gr.Blocks() as demo:
    submit_btn.click(
        fn=pipe,
        inputs=[text, voice, image_in],
-        outputs=[video_o],
+        outputs=video_o,  # Removed list brackets as single output expected
        concurrency_limit=3
    )
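In short, app.py now targets the sysf/vixtts-demo Space instead of the WhisperSpeech one, wraps every remote call in try/except that re-raises through gr.Error, replaces the Vietnamese audio label ("Upload or record directly, voice cloning recommended") with a plain "Voice", and passes outputs as a bare component. Below is a minimal sketch for exercising the new TTS endpoint in isolation; the keyword names (language, audio_file, normalize_text) are taken from this diff, and client.view_api() should be used to confirm the live Space's actual /predict signature before relying on them.

# Standalone smoke test for the new TTS call path (a sketch, assuming the
# keyword names shown in the diff match the Space's real /predict signature).
from gradio_client import Client

client = Client("sysf/vixtts-demo")
print(client.view_api())  # inspect the endpoint's actual signature first

speech_path = client.predict(
    "Xin chào",            # text to synthesize
    language="vi",
    audio_file="ref.wav",  # hypothetical reference clip for voice cloning
    normalize_text=True,
    api_name="/predict",
)
print(speech_path)  # the Space returns a filepath to the generated audio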
generate_audio.py ADDED
@@ -0,0 +1,79 @@
+import torchaudio
+
+from whisperspeech.pipeline import Pipeline
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Convert text to audio.")
+    parser.add_argument(
+        "--text",
+        type=str,
+        required=True,
+        help="The text to convert to audio.",
+    )
+    return parser.parse_args()
+
+def convert_text_to_audio(pipe: Pipeline, text: str):
+    """Convert text to audio.
+    Args:
+        pipe (Pipeline): The pipeline to use for text-to-speech.
+        text (str): The text to convert to audio.
+    Returns:
+        torch.Tensor: The generated audio.
+    """
+    return pipe.generate(text)
+
+
+def convert_text_to_audio_file(pipe: Pipeline, text: str, output_path: str):
+    """Convert text to audio and save it to a file.
+    Args:
+        pipe (Pipeline): The pipeline to use for text-to-speech.
+        text (str): The text to convert to audio.
+        output_path (str): The path to save the audio file.
+    """
+    pipe.generate_to_file(output_path, text)
+
+
+class TTSProcessor:
+    def __init__(self, device: str):
+        """Initialize the TTS Processor with a specified device."""
+        self.pipe = Pipeline(
+            s2a_ref="collabora/whisperspeech:s2a-q4-tiny-en+pl.model", device=device
+        )
+
+    def get_reference_voice_embedding(self, path: str):
+        """Get the reference voice embedding from the given audio file.
+        Args:
+            path (str): The path to the audio file.
+        Returns:
+            torch.Tensor: The reference voice embedding."""
+        return self.pipe.extract_spk_emb(path).cpu()
+
+    def convert_text_to_audio(self, text: str, speaker=None):
+        """Convert text to audio.
+        Args:
+            text (str): The text to convert to audio.
+        Returns:
+            torch.Tensor: The generated audio.
+        """
+        return self.pipe.generate(text, speaker=speaker)
+
+    def convert_text_to_audio_file(self, text: str, output_path: str, speaker=None):
+        """Convert text to audio and save it to a file.
+        Args:
+            text (str): The text to convert to audio.
+            output_path (str): The path to save the audio file.
+        """
+        self.pipe.generate_to_file(output_path, text, speaker=speaker)
+if __name__ == "__main__":
+    args = parse_args()
+    processor = TTSProcessor("cuda")
+    text = args.text
+    text = text.lower()
+    text_split = "_".join(text.lower().split(" "))
+    # remove the last character if it is a period
+    if text_split[-1] == ".":
+        text_split = text_split[:-1]
+    print(text_split)
+    path = f"./examples/{text_split}.wav"
+    processor.convert_text_to_audio_file(text, path)
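generate_audio.py is a standalone WhisperSpeech helper, independent of the Gradio app: running python generate_audio.py --text "hello world." synthesizes the text and writes ./examples/hello_world.wav (the filename is the lowercased text joined with underscores, minus any trailing period). A short usage sketch of TTSProcessor follows, including the speaker-cloning path that the __main__ block never exercises; the reference clip path is hypothetical, and a CUDA device is assumed, as in the script itself.

# Sketch of driving TTSProcessor directly (assumes CUDA is available;
# the reference recording path below is hypothetical).
from generate_audio import TTSProcessor

processor = TTSProcessor("cuda")

# Default voice, mirroring what the __main__ block does:
processor.convert_text_to_audio_file("hello world", "./examples/hello_world.wav")

# Voice cloning: extract the speaker embedding once, then reuse it.
speaker = processor.get_reference_voice_embedding("./ref_speaker.wav")
processor.convert_text_to_audio_file(
    "hello again", "./examples/hello_again.wav", speaker=speaker
)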